[英]Issues with request, and cheerio when web-scraping
我正在嘗試編寫一個向網站發出請求的代碼,用於webscraping
這就是步驟:
這是Code STARTS的第一部分
這里是Code ENDS的第一部分
所以實際上第一部分工作得很好,它沒有任何問題,但第二部分確實如此
這里是Code STARTS的第二部分
這里是Code ENDS的第二部分
但是第二部分確實有一些錯誤,因為當console.logging對象時,我們看到對象屬性沒有改變它們的默認值。
因此,在調試目的中,我獲取了一個廣告對象,並從代碼中手動推出了它的值
post[0].link = 'https://999.md/ru/profile/denisserj'
最后,當運行此對象的代碼時,它實際上正常工作,因此它顯示已更改的屬性,但對於其余的屬性,它不會。
我嘗試設置一些超時,認為代碼嘗試在第二個請求完成之前讀取鏈接,但沒有效果
我也嘗試了console.log鏈接,看看它是否存在於數組中,所以它實際上存在於那里,但也沒有效果。
最后這里是代碼:
// CLASSES
class advert {
constructor() {
this.id = 0;
this.tile = new String();
this.link = new String();
this.phone = new String();
this.account = new String();
this.accountLink = new String();
this.text = new String();
this.operator = new String();
}
show() {
console.log(this.id, this.title, this.link, this.phone, this.account, this.accountLink, this.text, this.operator);
}
}
class account {
constructor() {
this.name = 0;
this.createdAt = 0;
this.phone = [];
this.ads = [];
this.adsNumber = 0;
}
show() {
console.log(this.name, this.createdAt, this.phone, this.ads, this.adsNumber);
}
}
// HEADERS
const mainRequest = require('request');
const auxRequest = require('request');
const cheerio1 = require('cheerio');
const cheerio2 = require('cheerio');
const fs = require('fs');
const fs2 = require('fs');
const adFile = fs.createWriteStream('anunturi.csv');
const accFile = fs2.createWriteStream('conturi.csv');
// SETTINGS
const host = 'https://999.md'
const category = 'https://999.md/ru/list/transport/cars'
const timeLimit = 60; //seconds
// VARIABLES
let post = [];
let postNumber = 0;
let acc = [];
// FUNCTIONS
function deleteFromArray(j) {
post.splice(j, 1);
}
function number(i) {
let category = post[i].link;
auxRequest(category, (error, response, html) => {
if (!error && response.statusCode == 200) {
const $ = cheerio1.load(html);
let phone;
const siteTitle = $('strong').each((id, el) => {
phone = $(el).text();
});
const txt = $('.adPage__content__description').html();
const person = $('.adPage__header__stats').find('.adPage__header__stats__owner').text();
const linkToPerson = host + $('.adPage__header__stats').find('.adPage__header__stats__owner').find('a').attr('href');
post[i].phone = phone;
post[i].account = person;
post[i].accountLink = linkToPerson;
post[i].text = txt;
if (i == postNumber) {
console.log('1. Number Putting done')
writeToFileAd(accountPutter, writeToFileAccount);
}
}
});
}
function writeToFileAd() {
adFile.write('ID, Titlu, Link, Text, Cont, LinkCont, Operator\n')
for (let i = 0; i <= postNumber; i++) {
adFile.write(`${post[i].id}, ${post[i].title}, ${post[i].link}, ${post[i].phone}, ${post[i].account}, ${post[i].accountLink}, ${post[i].operator}\n`);
}
console.log('2. Write To File Ad done')
accountPutter();
}
function accountAnalyzis(i) {
let category = post[i].link;
const mainRequest = require('request');
category = category.replace('/ru/', '/ro/');
mainRequest(category, (error, response, html) => {
if (!error && response.statusCode == 200) {
const $ = cheerio2.load(html);
const name = $('.user-profile__sidebar-info__main-wrapper').find('.login-wrapper').text();
let createdAt = $('.date-registration').text();
createdAt = createdAt.replace('Pe site din ', '');
const phones = $('.user-profile__info__data').find('dd').each((id, el) => {
let phone = $(el).text();
acc[i].phone.push(phone);
});
const ads = $('.profile-ads-list-photo-item-title').find('a').each((id, el) => {
let ad = host + $(el).attr('href');
acc[i].ads.push(ad);
acc[i].adsNumber++;
});
acc[i].name = name;
acc[i].createdAt = createdAt;
console.log(name)
if (i == postNumber) {
console.log('3. Account Putting done')
writeToFileAccount();
}
}
});
}
function writeToFileAccount() {
for (let i = 0; i <= postNumber; i++) {
accFile.write(`${acc[i].name}, ${acc[i].createdAt}, ${acc[i].phone}, ${acc[i].ads}, ${acc[i].adsNumber}\n`);
}
console.log('4. Write to file Account done');
}
function numberPutter() {
for (let i = 0; i <= postNumber; i++) {
number(i);
}
}
function accountPutter() {
for (let i = 0; i <= postNumber; i++) {
accountAnalyzis(i);
}
}
// MAIN
mainRequest(category, (error, response, html) => {
let links = [];
for (let i = 0; i < 1000; i++) {
post[i] = new advert();
}
for (let i = 0; i < 1000; i++) {
acc[i] = new account();
}
if (!error && response.statusCode == 200) {
const $ = cheerio2.load(html);
const siteTitle = $('.ads-list-photo-item-title').each((id, el) => {
const ref = host + $(el).children().attr('href');
const title = $(el).text();
post[id].id = id + 1;
post[id].title = title;
post[id].link = ref;
links[id] = ref;
postNumber = id;
});
post[0].link = 'https://999.md/ru/profile/denisserj'
numberPutter()
}
});
你有一個錯誤
const siteTitle = $('.ads-list-photo-item-title').each((id, el) => {
你真正想要的是.find('a').each...
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.