简体   繁体   中英

Issues with request, and cheerio when web-scraping

I'm trying to write a code that makes a request to a website, for webscraping

So this are the steps:

Here First part of Code STARTS

  1. The program makes the request to the mainURL
  2. The program selects some objects from the html of the mainURL , and store them in an array of objects(advert), on of the properties of the object, is it's link, which we'll call numberURL , that the code automatically selects using a css selector, the amount of objects is something like 80-90;
  3. The program makes requests to every numberURL (80-90 requests), and for each of them it does set another properties to the same object, and selects another link, that we'll call accountURL
  4. The program creates an CSV file where it writes every object in different rows

Here First part of Code ENDS

So actually the first part works pretty good, it doesn't have any issues, but the second part does

Here Second part of Code STARTS

  1. The program makes requests to every accountURL from the previous object
  2. The program selects some objects from the html of the accountURL , and stores them in an another array of another objects(account), also using CSS selectors
  3. The program should console.log() all the account objects

Here Second part of Code ENDS

But the second part does have some bugs, because when console.logging the objects we see that the objects properties doesn't changed their default value.

So in debugging purposes I took one advert object and putted it's value manually from the code

post[0].link = 'https://999.md/ru/profile/denisserj'

Finally when running the code for this object it actually works correctly, so it shows the changed properties, but for the rest of them it doesn't.

I tried to set some Timeouts, thinking that the code tries to read the link, before the second request finished, but no effects

I also tried to console.log the link, to see if it exists in the array, so it actually exists there, but also no effect.

Finally here is the code:

// CLASSES
class advert {
    constructor() {
        this.id = 0;
        this.tile = new String();
        this.link = new String();
        this.phone = new String();
        this.account = new String();
        this.accountLink = new String();
        this.text = new String();
        this.operator = new String();
    }
    show() {
        console.log(this.id, this.title, this.link, this.phone, this.account, this.accountLink, this.text, this.operator);
    }

}
class account {
    constructor() {
        this.name = 0;
        this.createdAt = 0;
        this.phone = [];
        this.ads = [];
        this.adsNumber = 0;
    }
    show() {
        console.log(this.name, this.createdAt, this.phone, this.ads, this.adsNumber);
    }
}

// HEADERS
const mainRequest = require('request');
const auxRequest = require('request');
const cheerio1 = require('cheerio');
const cheerio2 = require('cheerio');
const fs = require('fs');
const fs2 = require('fs');
const adFile = fs.createWriteStream('anunturi.csv');
const accFile = fs2.createWriteStream('conturi.csv');

// SETTINGS
const host = 'https://999.md'
const category = 'https://999.md/ru/list/transport/cars'
const timeLimit = 60; //seconds

// VARIABLES
let post = [];
let postNumber = 0;
let acc = [];

// FUNCTIONS
function deleteFromArray(j) {
    post.splice(j, 1);
}

function number(i) {
    let category = post[i].link;
    auxRequest(category, (error, response, html) => {
        if (!error && response.statusCode == 200) {
            const $ = cheerio1.load(html);
            let phone;
            const siteTitle = $('strong').each((id, el) => {
                phone = $(el).text();
            });
            const txt = $('.adPage__content__description').html();
            const person = $('.adPage__header__stats').find('.adPage__header__stats__owner').text();
            const linkToPerson = host + $('.adPage__header__stats').find('.adPage__header__stats__owner').find('a').attr('href');
            post[i].phone = phone;
            post[i].account = person;
            post[i].accountLink = linkToPerson;
            post[i].text = txt;
            if (i == postNumber) {
                console.log('1. Number Putting done')
                writeToFileAd(accountPutter, writeToFileAccount);
            }
        }

    });
}

function writeToFileAd() {
    adFile.write('ID, Titlu, Link, Text, Cont, LinkCont, Operator\n')
    for (let i = 0; i <= postNumber; i++) {
        adFile.write(`${post[i].id}, ${post[i].title}, ${post[i].link}, ${post[i].phone}, ${post[i].account}, ${post[i].accountLink}, ${post[i].operator}\n`);
    }
    console.log('2. Write To File Ad done')
    accountPutter();
}

function accountAnalyzis(i) {
    let category = post[i].link;
    const mainRequest = require('request');
    category = category.replace('/ru/', '/ro/');
    mainRequest(category, (error, response, html) => {

        if (!error && response.statusCode == 200) {
            const $ = cheerio2.load(html);
            const name = $('.user-profile__sidebar-info__main-wrapper').find('.login-wrapper').text();
            let createdAt = $('.date-registration').text();
            createdAt = createdAt.replace('Pe site din ', '');
            const phones = $('.user-profile__info__data').find('dd').each((id, el) => {
                let phone = $(el).text();
                acc[i].phone.push(phone);
            });
            const ads = $('.profile-ads-list-photo-item-title').find('a').each((id, el) => {
                let ad = host + $(el).attr('href');
                acc[i].ads.push(ad);
                acc[i].adsNumber++;
            });
            acc[i].name = name;
            acc[i].createdAt = createdAt;
            console.log(name)
            if (i == postNumber) {
                console.log('3. Account Putting done')
                writeToFileAccount();
            }
        }
    });
}

function writeToFileAccount() {
    for (let i = 0; i <= postNumber; i++) {
        accFile.write(`${acc[i].name}, ${acc[i].createdAt}, ${acc[i].phone}, ${acc[i].ads}, ${acc[i].adsNumber}\n`);
    }
    console.log('4. Write to file Account done');
}

function numberPutter() {
    for (let i = 0; i <= postNumber; i++) {
        number(i);
    }
}

function accountPutter() {
    for (let i = 0; i <= postNumber; i++) {
        accountAnalyzis(i);
    }
}

// MAIN
mainRequest(category, (error, response, html) => {
    let links = [];
    for (let i = 0; i < 1000; i++) {
        post[i] = new advert();
    }
    for (let i = 0; i < 1000; i++) {
        acc[i] = new account();
    }
    if (!error && response.statusCode == 200) {
        const $ = cheerio2.load(html);
        const siteTitle = $('.ads-list-photo-item-title').each((id, el) => {
            const ref = host + $(el).children().attr('href');
            const title = $(el).text();
            post[id].id = id + 1;
            post[id].title = title;
            post[id].link = ref;
            links[id] = ref;
            postNumber = id;
        });
        post[0].link = 'https://999.md/ru/profile/denisserj'
        numberPutter()
    }

});

You have an error in line

const siteTitle = $('.ads-list-photo-item-title').each((id, el) => {

What you actually want is .find('a').each...

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM