简体   繁体   中英

Using casperjs and phantomjs to scrape multiple pages

I'm trying to scrape a number of pages that have a standard format. I've been able to use PhantomJS to successfully scrape a single page, but when I try to iterate over multiple pages, the asynchronous processing makes the script hang. What's the proper way to tell CasperJS/PhantomJS to wait until one page has finished before starting the next?


// PhantomJS script (question version): opens one EPA CERCLIS search-result page,
// extracts a handful of table cells with jQuery and appends them to lat_long.txt.
var page = require('webpage').create();
var fs = require('fs');

// Forward console output produced inside the loaded page to the PhantomJS console.
page.onConsoleMessage = function(msg) {
    phantom.outputEncoding = "utf-8";
    console.log(msg);
};


// this overwrites the previous output file

// NOTE: `f` is assigned without `var`, so it is an implicit global.
f = fs.open("lat_long.txt", "w");
f.write("--");
f.close();


   // this is the unique identifier for the locations. For now, I just have three datapoints
  var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"]; 

 /// this code will be used to loop through the different locations. For now, set to look at only one.  
 // NOTE: `q` is an implicit global, and the loop body never reads it (the URL below uses EPAID[0]).
 for (q= 0;  q < 1; q++)  {
    // Set to true by the load callback after the record has been written;
    // only the commented-out wait loop further down reads it.
    var processing = false;



   //we construct the target url
   var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0]  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;


   // The load is started first; the completion handler is assigned on the next line.
   page.open(url);
   page.onLoadFinished = function(status) {
   // Only a successful load is handled; any other status is ignored without a message.
   if ( status === "success" ) {
       // Inject jQuery into the loaded page, then run the extraction in the page context.
       page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
           var str = page.evaluate(function() {                   
               // $value, $Object, $string and i are assigned without `var`.
               $value = [];
               $Object = $(".result tr");                
               // Ten steps; each step pushes two entries (HTML of the first matched <td> and of
               // the cell two siblings after it), then moves every matched row to its next sibling.
               for (i =0 ; i < 10; i++) { 
             $value.push($Object.find('td').html(),$Object.find('td').next().next().html() );          
             $Object = $Object.next();
            } 

            // Hand-built, JSON-like text: keys and values are not quoted or escaped.
            $string = "{ EPAID: "+  $value[0] +  ", " + 
                     "Name: "+  $value[1] +  ", " +                
                     "City: "+  $value[4] +  ", " +
                     "State: "+  $value[6] +  ", " +
                     "ZipCode: "+  $value[8] +  ", " +  
                     "Latitude: "+  $value[14] +  ", " +
                     "Longitude: "+  $value[16] +  " }" ;          
            return $string;
        });

        // Append the extracted record to the output file.
        f = fs.open("lat_long.txt", "a");
        f.write(str);
        f.close();
        processing = true;
        console.log("writing to file");
       // Terminates PhantomJS right after this record has been written.
       phantom.exit();    

    });
 }


 // right here it should delay until the previous page is completed        
 //  while (!processing)  {    
 //       setTimeout(function(){ console.log("waiting....");},1000);
 //    }


};

}

// Reached as soon as the loop has issued page.open(); the load callback above runs later.
console.log("finished all pages");

If you switch to CasperJS, it is as simple as changing your page.open() calls into page.thenOpen() calls (where page is then the Casper instance created with require('casper').create()). (The question "CasperJS - How to open up all links in an array of links" looks very similar to yours.)

If you want to stick with PhantomJS, you need to start the next page load from the success callback of the previous load (the onLoadFinished handler, or the callback passed to page.open()). This is tedious, and needs care to avoid large memory usage. (I did it once or twice, but now simply use CasperJS.)

An alternative approach is to create the page object inside the loop. However, that does not quite answer your question, as the pages will then load in parallel. But you could use setTimeout to stagger each one, to avoid a burst of activity if you have hundreds of URLs!

Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).

With CasperJS installed, I saved this file as "process.js" and was able to run it from the command line with "casperjs process.js".


// PhantomJS setup shared by yourFunction() and process() below:
// the page object, the list of site IDs, the output file and the bookkeeping state.
var page = require('webpage').create();
var fs = require('fs');

// Set the console encoding once instead of on every forwarded message.
phantom.outputEncoding = "utf-8";

// Forward console output produced inside the loaded page to the PhantomJS console.
page.onConsoleMessage = function(msg) {
    console.log(msg);
};


// This is the unique identifier for the locations.
// The listing elides most entries; the complete list of IDs goes here.
var EPAID = [
    "NED981713837",
    /* ... remaining site IDs elided ... */
    "FLD049985302",
    "NJD986643153"
];


// This overwrites the previous output file and writes the start marker.
var outputFile = fs.open("lat_long.txt", "w");
try {
    outputFile.write("-<>-");
} finally {
    // Release the handle even if the write throws
    outputFile.close();
}


var count = 0;      // index of the next EPAID entry handed to process()
var target = 1400;  // upper bound on the number of records yourFunction() starts
var written = [];   // written[i] is set to true once record i has been appended

/**
 * Timer-driven scheduler for the scrape.
 *
 * Each call starts the record at index `count` via process(), advances
 * `count`, and re-schedules itself 5 seconds later. Once every record has
 * been started it logs "exiting" and terminates PhantomJS.
 *
 * The number of records is capped at EPAID.length, so a `target` larger than
 * the list never produces requests for non-existent IDs.
 *
 * Reads and updates the file-level `count`; reads `target` and `EPAID`.
 */
function yourFunction(){

   // Never step past the end of the ID list, whatever `target` says
   var limit = Math.min(target, EPAID.length);

   if (count >= limit) {
       console.log("exiting");
       phantom.exit();
       return;
   }

   process(count);
   count++;

   // The next call happens after a fixed delay, not when the current page has finished
   setTimeout(yourFunction, 5000);
}




/**
 * Loads the CERCLIS search-result page for EPAID[counter] and appends one
 * JSON-like record to lat_long.txt.
 *
 * The load is asynchronous: this function returns right after page.open();
 * the record is written later from the load handler.
 *
 * @param {number} counter Index into the file-level EPAID array. An index
 *     with no entry is logged and skipped without issuing a request.
 *
 * Uses the file-level `page`, `fs`, `EPAID` and `written`.
 * NOTE(review): every call reuses the single `page` object and replaces its
 * onLoadFinished handler, so a load that is still running when the next call
 * arrives is presumably superseded - confirm against the caller's interval.
 */
function process(counter){

    var siteId = EPAID[counter];

    // Without an ID the URL would be built from the text "undefined"
    if (siteId === undefined) {
        console.log("No EPA ID for record #" + counter + ", skipping");
        return;
    }

    console.log("Beginning record #" + counter);

    //we construct the target url
    var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + siteId  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;

    // Install the handler before starting the load
    page.onLoadFinished = function(status) {
        if ( status !== "success" ) {
            console.log("Failed to load record #" + counter + " (status: " + status + ")");
            return;
        }

        // Inject jQuery into the loaded page, then extract the cells in the page context
        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
            var str = page.evaluate(function() {
                // Runs inside the web page; locals are declared so nothing leaks into its globals
                var values = [];
                var rows = $(".result tr");

                // Ten steps; each pushes two entries (HTML of the first matched <td> and of the
                // cell two siblings after it), then moves every matched row to its next sibling
                for (var i = 0; i < 10; i++) {
                    values.push(rows.find('td').html(), rows.find('td').next().next().html());
                    rows = rows.next();
                }

                // Latitude and Longitude are emitted unquoted; the trailing comma separates records
                return "{ \"EPAID\": \""+  values[0] +  "\", " +
                       "\"Name\": \""+  values[1] +  "\", " +
                       "\"City\": \""+  values[4] +  "\", " +
                       "\"State\": \""+  values[6] +  "\", " +
                       "\"ZipCode\": \""+  values[8] +  "\", " +
                       "\"Latitude\": "+  values[14] +  ", " +
                       "\"Longitude\": "+  values[16] +  " }," ;
            });

            // The handler may run more than once for the same record; write it only once
            if (written[counter] !== undefined) {
                return;
            }

            // Do not append anything when the page script produced no text
            if (typeof str !== "string") {
                console.log("No data extracted for record #" + counter);
                return;
            }

            var out = fs.open("lat_long.txt", "a");
            try {
                out.write(str);
            } finally {
                // Release the handle even if the write throws
                out.close();
            }
            written[counter] = true;
            console.log("Writing to file #"+  counter);
        });
    };

    page.open(url);
}

 // Announce the start, then make the first call to the timer-driven scheduler,
 // which re-schedules itself until it calls phantom.exit().
 console.log("Start...");

yourFunction();

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM