"Need to crawl websites requiring authentication"

Vineet
Vineet New Altair Community Member
edited November 5 in Altair RapidMiner
Hello,
I am trying to crawl a website which requires authentication.
For that, I have used the Get Page operator, enabled cookies, and entered my login details.
Still, I am not able to log in using the Get Pages operator, as discussed in: http://rapid-i.com/rapidforum/index.php/topic,6106.0.html

Here is my process.
Please guide me.

Regards,
Vineet
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.005">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="web:get_webpage" compatibility="5.3.000" expanded="true" height="60" name="Get Page" width="90" x="179" y="30">
        <parameter key="url" value="https://accounts.google.com/ServiceLogin?service=mail&amp;passive=true&amp;rm=false&amp;continue=https://mail.google.com/mail/&amp;ss=1&amp;scc=1&amp;ltmpl=default&amp;ltmplcache=2"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0 "/>
        <parameter key="accept_cookies" value="all"/>
        <parameter key="request_method" value="POST"/>
        <list key="query_parameters">
          <parameter key="Email" value="infospace007%40gmail.com"/>
          <parameter key="Passwd" value="infospace"/>
          <parameter key="signIn" value="Sign+in"/>
          <parameter key="_utf8" value="%E2%98%83"/>
          <parameter key="service" value="mail"/>
          <parameter key="continue" value="https%3A%2F%2Fmail.google.com%2Fmail%2F%3Ftab%3Dwm"/>
          <parameter key="rm" value="false"/>
          <parameter key="dsh" value="-6653757825738056045"/>
          <parameter key="ltmpl" value="default"/>
          <parameter key="scc" value="1"/>
          <parameter key="pstMsg" value="1"/>
          <parameter key="checkedConnection" value="youtube%3A164%3A1"/>
          <parameter key="bgresponse" value="%21A0J8pNRL3fHDlkSpYLs1st775gIAAABBUgAAAAkqAQXRmXOYFfgQ8E-HKcxRaASVrT6PYOYVYqHciZ4i69haFUqHy15D-LE069TDBl9TaU6Jd_qURb1T5swIKm-JTGKhRnPOaXNDZNkNk3a-qkfh_q9F7fEicPTC8ovAY6PkLaF2UFn9P-Iwzc0Hw4337oLj-WqUHVNNBw4R4qIU_2uMBSj6g7pBi96Cywk8Keplxk-q6UATUVSCJleWXkY5XQBzCU27cbKp5VP9C8VvOK9IMRyYdJSRfvEt-siU02XsealJr4Jx6r6VB6rXfeDc-g-JuBp9HXmu3BZhfIZmd7l_InxQVDrgNMD7XoitF2mHOqamgk_IIYAHp1IT9DG09abzLGRSrF0"/>
          <parameter key="ServiceLoginAuth continue" value="https%3A%2F%2Fmail.google.com%2Fmail%2F%3Ftab%3Dwm&amp;service=mail&amp;rm=false&amp;dsh=-6653757825738056045"/>
          <parameter key="scc" value="1"/>
          <parameter key="GALX" value="HEa-r2pPMxw"/>
          <parameter key="pstMsg" value="1"/>
          <parameter key="checkedDomains" value="youtube"/>
          <parameter key="rmshown" value="1"/>
        </list>
        <list key="request_properties">
          <parameter key="Host" value="mail.google.com"/>
          <parameter key="User-Agent" value="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0"/>
          <parameter key="Accept" value="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"/>
          <parameter key="Accept-Language" value="en-US,en;q=0.5"/>
          <parameter key="Accept-Encoding" value="gzip, deflate"/>
          <parameter key="Content-Type" value="application/x-www-form-urlencoded; charset=UTF-8"/>
          <parameter key="Referer" value="https://mail.google.com/_/mail-static/_/js/main/m_i,t/rt=h/ver=Phek_zmgDkw.en./sv=1/am=!e0FPFCylvh30BOXaPKvsAjcptCvQCCVC9BId1VwCFx8MmmElTTaKOWsaXu4VvCDP35Vm5w/d=1"/>
          <parameter key="Content-Length" value="53"/>
          <parameter key="Cookie" value="S=gmail=fVbtStnM0ILFS8KWFN-3Qw; GX=DQAAALwAAAChzU23SXgpyDRPJTmxXVjoBigeLCrhmYVDKgu2u4VsKYlQhOvrOgvKDe89R62ZGL88RtsvU-N8j24DLvuXE_4sR-kuS5qcFvZ9cTzBMwVFexU6GLDYHe7lFdcmI-3sDan_0RK4_3zbLTJ1NGqf1P2wgFeKfz6vIGOSzkFm-f81jsYjF0nTSKh1dT4qdaE2WRVoq01jBBV-a01GchGz_aU_4DQC-DAlw9bOCLGHU5dEl6YW3K40nHz7eCwdABGGuEs; GMAIL_AT=AF6bupOzN7Bbjpx2Bf-GpEdzRHGwivBcgw; gmailchat=infospace007@gmail.com/692382; GMAIL_IMP=v*2%2Ftl-inv*0!inbox!unk%2Ftl-si-inbox*22%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-si-inbox*22%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-si-inbox*22%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-si-inbox*22%2Ftl-inv*0!inbox!unk%2Ftl-inv*0!inbox!unk%2Ftl-si-inbox*22%2Ftl-inv*0!inbox!unk%2Fr-cs*312%2Ftl-inv*0!inbox!unk; NID=67=sxcfkTjwFxz6m2hog_RyfOMpCvnCFBjYdHHYzV89lTqTOaiO0saz2asvZ9ksSpyYjXtJMQKcyUsN274rj0uTLSWOfUBWvKS0GF08bW2NEJWmTK9zOWj7rhPFKCmsDAgsDPjmou4L7AmsmK-nUGmBImVLo3P_aLaIMK43HLpbfthH4pPcpBo; PREF=ID=ed62523eeec51027:U=cdf6c6a482def491:FF=0:LD=en:TM=1358143677:LM=1358143941:GM=1:S=L6h3Y5ZDjDyAPhbJ; SID=DQAAALgAAAD_jVZHoPU3VifOjphHsX8jERNCurrJ3YfQpKKChF3NgDMOamsK6mSb-31ZZK1N40TSLsKsL6wnlnad6PuQTdFosh83cv7rVUIphdm4pBoI-4K71C_fymyLHR6L0mbtFFygDQVXiC_2afM8szMnagfR1zJ2wkC6TBJlRlqEXtOmOaxg6Tzhrx6wvK47AGBDd2xnDJS7oo6roLCP2KAraSjVFP5laBJeaX5yk2YzV54gg03YqEN1_kmFCpvqDR5G4Bw; HSID=ATb9cr-saLj_-NbeH; SSID=AvFOA5coM2-AT321Q; APISID=-hBge7bhsykHMDcM/AvF1ZY6w7uSivHp7u; SAPISID=Mi7Dh04BKMjP3vsB/AJvvoSHxTkWgyfhlU"/>
          <parameter key="Connection" value="keep-alive"/>
          <parameter key="Pragma" value="no-cache"/>
          <parameter key="Cache-Control" value="no-cache"/>
        </list>
      </operator>
      <operator activated="true" class="read_excel" compatibility="5.3.005" expanded="true" height="60" name="Read Excel" width="90" x="112" y="210">
        <parameter key="excel_file" value="C:\Users\vnagpal\Desktop\Book1.xlsx"/>
        <parameter key="imported_cell_range" value="A1:B1"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="A.true.integer.attribute"/>
          <parameter key="1" value="B.true.file_path.attribute"/>
        </list>
      </operator>
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="313" y="210">
        <parameter key="link_attribute" value="B"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0 "/>
        <parameter key="accept_cookies" value="all"/>
        <parameter key="request_method" value="POST"/>
        <parameter key="delay" value="random"/>
      </operator>
      <!--
        Wiring of the process:
          * "Get Page" output is delivered as result 1;
          * "Read Excel" output feeds the "Example Set" port of "Get Pages";
          * "Get Pages" example set is delivered as result 2.
        "Get Page" and "Get Pages" are not connected to each other.
      -->
      <connect from_op="Get Page" from_port="output" to_port="result 1"/>
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_port="result 2"/>
      <!-- Layout-only spacing entries for the process input and result ports. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>