CasperJS loop or iterate through multiple web pages?

前端 未结 3 1569
难免孤独
难免孤独 2020-12-05 01:20

I have a CasperJS script that scrapes ratings and dates from one webpage. Now I want to scrape the same data from multiple pages under the same website. How can I loop throu

相关标签:
3条回答
  • 2020-12-05 01:39

    This code can help you: define the wanted URLs and the selectors for each page in an array of objects, then loop over that array and do whatever you need with those properties.

    You can also use a click method in the loop instead of opening a URL.

    // Pages to scrape. Each entry carries the URL to open plus the CSS
    // selectors used on that page for the rating image and the review date.
    var navigation = [
        {
            url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
            selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
            selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
        },
        {
            url: 'yourSecondUrl, etc...',
            selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
            selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
        }
    ];
    // Accumulates " <rating> <date>" for every visited page.
    var content = "";

    casper.start()
    .then(function(){
        // Queue one open + wait + scrape sequence per configured page.
        navigation.forEach(function(navIndex){
            // Open the page given by the entry's `url` property.
            casper.thenOpen(navIndex.url)
            // Wait until the current URL matches; probably redundant since
            // thenOpen() should already wait for the page load (kept as-is).
            .waitForUrl(navIndex.url, function(){
                // `title` attribute of the element matched by the ratings selector.
                var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title');
                // HTML of the element matched by the date selector. The property
                // is `selectorDate`; reading `selectorDates` would be undefined.
                var dates = this.getHTML(navIndex.selectorDate);
                this.echo(ratings);
                this.echo(dates);
                content = content + ' ' + ratings + ' ' + dates;
            });
        });
    })
    .run(function() {
        this.echo('----------- All steps done ------------\n');
        this.exit();
    });
    
    0 讨论(0)
  • 2020-12-05 01:43

    Since there exists a next page button, you can use it to traverse all pages recursively:

    /**
     * Scrapes the ratings and dates of the current page, appends them to the
     * output file as "<rating>: <date>" lines, then follows the "next page"
     * link (when visible) and schedules itself again.
     *
     * Uses names defined outside this function: `casper`, `fs`, and the
     * page-context extractors `getRatings` / `getDate`. The scraped arrays are
     * still stored in the outer `ratings` / `dates` variables.
     */
    function getRatingsAndWrite(){
        // NOTE(review): evaluate() may hand back null if the page-side function
        // fails; fall back to an empty list so `.length` below cannot throw.
        ratings = casper.evaluate(getRatings) || [];
        dates = casper.evaluate(getDate) || [];

        casper.echo(ratings);
        casper.echo(ratings.length + ' ratings found:');

        // Pair each rating with the date at the same index.
        var lines = [];
        for(var i=0; i<ratings.length; i++){
            lines.push(ratings[i]+': '+dates[i]);
        }
        casper.echo(lines);

        // The file is opened in append mode once per page, so every chunk must
        // end with a newline; otherwise the last line of one page is glued to
        // the first line of the next.
        var content = lines.length > 0 ? lines.join("\n") + "\n" : "";

        fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');

        casper.echo(dates.length + ' dates found:');

        var nextLink = ".BVRRPageLink.BVRRNextPage > a";
        if (casper.visible(nextLink)) {
            casper.thenClick(nextLink);
            // Give the next page time to load before scraping again; without a
            // pause the same page can be scraped repeatedly after the click.
            casper.wait(3000);
            casper.then(getRatingsAndWrite);
        } else {
            casper.echo("END");
        }
    }
    
    // Open the first reviews page, scrape it with getRatingsAndWrite, and
    // launch the queued steps.
    casper
        .start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm')
        .then(getRatingsAndWrite)
        .run();
    

    A related answer is A: CasperJS parse next page after button click.

    0 讨论(0)
  • 2020-12-05 01:43

    Thanks Fanch and Artjom B. Both of your answers led to the working solution. I used the recursive walk through the 'next' pages of the pagination, as given by Artjom B. Next, I added a wait() call to make sure the next ratings page was loaded before scraping it. Without this wait() call, the same page gets scraped multiple times in the interval between clicking 'next' and the next page finishing loading. See the working code below:

    // File-system module, used further down to append the scraped lines to a file.
    var fs = require('fs');

    // Casper instance: images and plugins are not loaded, and debug-level
    // logging is switched on with verbose output.
    var casper = require('casper').create({
        pageSettings: { loadImages: false, loadPlugins: false },
        logLevel: "debug",
        verbose: true
    });

    // Hold the values scraped from the most recently processed page.
    var ratings = [];
    var dates = [];
    
    /**
     * Collects the `title` attribute of every rating image in the document.
     * It is handed to casper.evaluate(), so it only relies on `document`.
     *
     * @returns {Array} one `title` value per matched <img>, in document order
     */
    function getRatings() {
        var images = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
        var titles = [];
        for (var idx = 0; idx < images.length; idx++) {
            titles.push(images[idx].getAttribute('title'));
        }
        return titles;
    }
    
    /**
     * Collects the inner HTML of every review-date element in the document.
     * It is handed to casper.evaluate(), so it only relies on `document`.
     *
     * @returns {Array} one innerHTML string per matched element, in document order
     */
    function getDate() {
        var nodes = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
        var found = [];
        for (var idx = 0; idx < nodes.length; idx++) {
            found.push(nodes[idx].innerHTML);
        }
        return found;
    }
    
    /**
     * Scrapes the ratings and dates of the current page, appends them to the
     * output file as "<first char of rating>: <date>" lines, then clicks the
     * "next page" link (when visible), waits, and schedules itself again.
     *
     * Uses names defined outside this function: `casper`, `fs`, the
     * page-context extractors `getRatings` / `getDate`, and the outer
     * `ratings` / `dates` variables, which receive the scraped arrays.
     */
    function getRatingsAndWrite(){
        // NOTE(review): evaluate() may hand back null if the page-side function
        // fails; fall back to an empty list so `.length` below cannot throw.
        ratings = casper.evaluate(getRatings) || [];
        dates = casper.evaluate(getDate) || [];

        casper.echo(ratings.length + ' ratings found:');

        var lines = [];
        for(var i=0; i<ratings.length; i++){
            // Keep only the first character of the title text. A missing
            // `title` (null) yields an empty rating instead of throwing.
            var rating = (ratings[i] || '').charAt(0);
            lines.push(rating + ': ' + dates[i]);
        }

        // The file is opened in append mode once per page, so every chunk must
        // end with a newline; otherwise the last line of one page is glued to
        // the first line of the next.
        var content = lines.length > 0 ? lines.join("\n") + "\n" : "";

        fs.write("<filepath to write content>", content, 'a');

        casper.echo(dates.length + ' dates found:');

        var nextLink = ".BVRRPageLink.BVRRNextPage > a";
        if (casper.visible(nextLink)) {
            casper.thenClick(nextLink);
            // Pause so the next ratings page can load before it is scraped.
            casper.wait(3000);
            casper.then(getRatingsAndWrite);
        } else {
            casper.echo("END");
        }
    }
    
    // Open the product page, scrape it with getRatingsAndWrite, and launch
    // the queued steps.
    casper
        .start('http://www.t-mobile.com/cell-phones/htc-one-m8.html')
        .then(getRatingsAndWrite)
        .run();
    
    0 讨论(0)
提交回复
热议问题