I'm trying to crawl this site to create an Excel document containing the names, locations, phone numbers, and specialties of individual practitioners on
https://www.psychologytoday.com/us/therapists. The link above has links underneath for each state, and each state has about 50 pages or so of contacts. I'm just trying to get the HTML pulled so I can later pull the contact data out, likely with Tableau Prep. The CSS selectors I have from SelectorGadget are: span, h1, .location-address-phone
This is the operator I'm using, and it's returning absolutely nothing. Can someone please help me figure this out? Thanks!
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.5.001">
  <!--
    Crawl Web: fetches pages starting from the therapist directory and writes
    each stored page to output_dir as an .html file for later extraction.
    The operator needs a start URL and at least one crawling rule; with an
    empty rule list no link is followed and no page is stored, so the result
    is empty.
  -->
  <operator activated="true" class="web:crawl_web_modern" compatibility="9.0.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
    <!-- Start page of the crawl (directory root that links to every state). -->
    <parameter key="url" value="https://www.psychologytoday.com/us/therapists"/>
    <list key="crawling_rules">
      <!-- Keep only pages that belong to the therapist directory. -->
      <parameter key="store_with_matching_url" value=".+psychologytoday\.com/us/therapists.*"/>
      <!-- Follow only links that stay inside the therapist directory (state and pagination links). -->
      <parameter key="follow_link_with_matching_url" value=".+psychologytoday\.com/us/therapists.*"/>
    </list>
    <parameter key="max_crawl_depth" value="52"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="enable_basic_auth" value="false"/>
    <parameter key="add_content_as_attribute" value="false"/>
    <!-- Pages are written to disk rather than returned as an attribute of the example set. -->
    <parameter key="write_pages_to_disk" value="true"/>
    <parameter key="include_binary_content" value="false"/>
    <parameter key="output_dir" value="/Users/ME/Desktop/Web Crawls"/>
    <parameter key="output_file_extension" value="html"/>
    <parameter key="max_pages" value="2500"/>
    <parameter key="max_page_size" value="10000"/>
    <parameter key="delay" value="500"/>
    <parameter key="max_concurrent_connections" value="100"/>
    <parameter key="max_connections_per_host" value="50"/>
    <parameter key="user_agent" value="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"/>
    <!-- NOTE(review): robots.txt is honoured here; if the site disallows these paths the crawl may still return nothing. Confirm against the site's robots.txt. -->
    <parameter key="ignore_robot_exclusion" value="false"/>
  </operator>
</process>