[英]cannot extract json items on webpage using scrapy
I am extracting data from indeed.com for a data science project.我正在为一个数据科学项目从 Indeed.com 中提取数据。 Although I can successfully scrape various portions of the page, I am having trouble scraping items from the JSON embedded in the page.虽然我能够成功地抓取页面的各个部分,但我在从页面的 JSON 部分抓取项目时遇到了一些问题。
Does anyone know how I would extract the items below from the URL?有谁知道我将如何从 URL 中提取以下项目? >>> view-source:https://www.indeed.com/viewjob?jk=41abec7fde3513dc&tk=1dn0mslbr352v000&from=serp&vjs=3&advn=9434814581076032&adid=197003786&sjdu=BbcXv7z69Xez4bal0Fx7iYB6jxzlBG3p6CfmfgjyGDErM4mqXgOsfEsOF5maJ2GRnKJsHskFl8aEbb4LlD5LibXOuIs0dzzHfVCmKB00C2c43rDVhEZX_8Zmg4zqEyqG5LEfQjRfoyOhULxXHTMitWOUjMOdLRt367-ZewSzfkqUSnPzHungl7uY7NcfOFLy
. >>> view-source:https://www.indeed.com/viewjob?jk=41abec7fde3513dc&tk=1dn0mslbr352v000&from=serp&vjs=3&advn=9434814581076032&adid=197003786&sjdu=BbcXv7z69Xez4bal0Fx7iYB6jxzlBG3p6CfmfgjyGDErM4mqXgOsfEsOF5maJ2GRnKJsHskFl8aEbb4LlD5LibXOuIs0dzzHfVCmKB00C2c43rDVhEZX_8Zmg4zqEyqG5LEfQjRfoyOhULxXHTMitWOUjMOdLRt367-ZewSzfkqUSnPzHungl7uY7NcfOFLy
.
Items to be extracted below: \nPOT-Creation-Date:
\nPO-Revision-Date:
"jobLocation":"Arlington, TX
下面要提取的项目: \nPOT-Creation-Date:
\nPO-Revision-Date:
"jobLocation":"Arlington, TX
A sample script I am running is below我正在运行的示例脚本如下
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
import boto3
class JobsSpider1(scrapy.Spider):
    """Spider that walks Indeed search results and scrapes each job posting.

    ``parse`` handles a search-results page: it schedules one request per
    job posting found on the page and then follows the "next page" link.
    ``parse_page`` handles a single posting page and yields one item dict.
    Items are exported as JSON to ``me_test.json`` via ``custom_settings``.
    """

    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=\"owner+operator\"+\"truck\"&l=augusta"]
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'me_test.json',
    }

    def parse(self, response):
        """Yield a posting request per job on the page, then the next page.

        Args:
            response: the downloaded search-results page.

        Yields:
            ``Request`` objects: one per job posting (handled by
            ``parse_page``, with the title and URL passed through ``meta``)
            and, when present, one for the next results page.
        """
        for job in response.xpath('//div[@class="title"]'):
            title = job.xpath('a//@title').extract_first()
            posting_link = job.xpath('a//@href').extract_first()
            if posting_link is None:
                # No anchor/href under this title div; concatenating None
                # below would raise TypeError, so skip the entry instead.
                continue
            posting_url = "https://indeed.com" + posting_link
            yield Request(
                posting_url,
                callback=self.parse_page,
                meta={'title': title, 'posting_url': posting_url},
            )

        # NOTE(review): the pasted source lost its indentation; pagination is
        # assumed to run once per results page (after the loop) -- confirm.
        relative_next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if relative_next_url is not None:
            # The last results page has no <link rel="next">, in which case
            # extract_first() returns None and there is nothing to follow.
            absolute_next_url = "https://indeed.com" + relative_next_url
            yield Request(absolute_next_url, callback=self.parse)

    def parse_page(self, response):
        """Scrape one job posting page and yield it as an item dict.

        Args:
            response: the downloaded posting page; ``response.meta`` carries
                the ``title`` and ``posting_url`` set by ``parse``.

        Yields:
            A dict with the job title, posting URL, section header, location,
            description and posting date. Any field whose XPath matches
            nothing is ``None``.
        """
        posting_url = response.meta.get('posting_url')
        job_title = response.meta.get('title')
        #job_name= response.xpath('//*[@class="icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"]/text()').extract_first()
        job_descriptions = response.xpath(
            '//*[@class="jobsearch-jobDescriptionText"]/ul'
        ).extract_first()
        job_listing_header = response.xpath(
            '//*[@class="jobSectionHeader"]/ul'
        ).extract_first()
        posted_on_date = response.xpath(
            '//*[@class="jobsearch-JobMetadataFooter"]/text()'
        ).extract_first()
        job_location = response.xpath(
            '//*[@class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating"]/div[3]/text()'
        ).extract_first()

        yield {
            'job_title': job_title,
            'posting_url': posting_url,
            # 'job_name':job_name,
            'job_listing_header': job_listing_header,
            'job_location': job_location,
            'job_descriptions': job_descriptions,
            'posted_on_date': posted_on_date,
        }
```````````````````````````````````````````````````````````````
######################################################
############# UPDATED CODE #########################
############# UPDATED CODE ###########################
#############################################
import time
import os, sys
import json
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
import boto3
from scrapy.loader.processors import Join
import re
class JobsSpider1(scrapy.Spider):
    """Spider that walks Indeed search results and scrapes each job posting.

    ``parse`` handles a search-results page: it schedules one request per
    job posting found on the page and then follows the "next page" link.
    ``parse_page`` handles a single posting page; besides XPath-selected
    fields it reads the job location from the JSON object that the page
    assigns to ``window._initialData`` inside a ``<script>`` element.
    Items are exported as CSV to ``test.csv`` via ``custom_settings``.
    """

    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=\"owner+operator\"+\"truck\"&l=augusta"]
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'test.csv',
    }

    def parse(self, response):
        """Yield a posting request per job on the page, then the next page.

        Args:
            response: the downloaded search-results page.

        Yields:
            ``Request`` objects: one per job posting (handled by
            ``parse_page``, with the title and URL passed through ``meta``)
            and, when present, one for the next results page.
        """
        for job in response.xpath('//div[@class="title"]'):
            title = job.xpath('a//@title').extract_first()
            posting_link = job.xpath('a//@href').extract_first()
            if posting_link is None:
                # No anchor/href under this title div; concatenating None
                # below would raise TypeError, so skip the entry instead.
                continue
            posting_url = "https://indeed.com" + posting_link
            yield Request(
                posting_url,
                callback=self.parse_page,
                meta={'title': title, 'posting_url': posting_url},
            )

        # NOTE(review): the pasted source lost its indentation; pagination is
        # assumed to run once per results page (after the loop) -- confirm.
        relative_next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if relative_next_url is not None:
            # The last results page has no <link rel="next">, in which case
            # extract_first() returns None and there is nothing to follow.
            absolute_next_url = "https://indeed.com" + relative_next_url
            yield Request(absolute_next_url, callback=self.parse)

    def _extract_initial_data(self, response):
        """Return the ``window._initialData`` JSON object as a dict.

        Returns an empty dict when the page has no matching ``<script>``
        element, the assignment pattern is not found, or the captured text
        is not valid JSON, so callers can use ``.get`` unconditionally.
        """
        # The original code referenced an undefined ``script_text``; this is
        # where it comes from: the text of the script that sets the variable.
        script_text = response.xpath(
            '//script[contains(., "window._initialData")]/text()'
        ).extract_first()
        if not script_text:
            return {}
        found = re.search(r'window\._initialData\=(\{.+\});', script_text)
        if found is None:
            return {}
        try:
            return json.loads(found.group(1))
        except json.JSONDecodeError:
            # The greedy capture can over-match if the script line holds more
            # than the single assignment; treat that as "no data".
            return {}

    def parse_page(self, response):
        """Scrape one job posting page and yield it as an item dict.

        Args:
            response: the downloaded posting page; ``response.meta`` carries
                the ``title`` and ``posting_url`` set by ``parse``.

        Yields:
            A dict with ``job_title``, ``renew`` (the ``jobLocation`` value
            from the embedded JSON, or ``None`` if unavailable),
            ``posting_url`` and ``job_listing_header``.
        """
        posting_url = response.meta.get('posting_url')
        job_title = response.meta.get('title')
        #job_name= response.xpath('//*[@class="icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"]/text()').extract_first()
        #job_descriptions_1=response.xpath('//<[@class="jobsearch-jobDescriptionText"]/ul').extract_first()
        #job_descriptions_2=response.xpath('//*[@class="jobsearch-jobDescriptionText"]/p').extract_first()
        #job_descriptions_3=response.xpath('//*[@class="jobsearch-jobDescriptionText"]/div').extract_first()
        #job_descriptions_4=response.xpath('//*[@class="jobsearch-jobDescriptionText"]/br').extract_first()
        #job_descriptions_5=response.xpath('//*[@class="jobDescriptionText"]/div').extract_first()
        job_listing_header = response.xpath(
            '//*[@class="jobSectionHeader"]/b'
        ).extract_first()
        #posted_on_date=response.xpath('//*[@class="jobsearch-JobMetadataFooter"]/text()').extract_first()
        #posted_on_date=response.xpath('//*[@class="jobsearch-JobMetadataFooter"]/<span').extract_first()
        #job_location=response.xpath('//*[@class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating"]/div[3]/text()').extract_first()
        data_obj = self._extract_initial_data(response)
        # .get keeps one posting without a location from aborting the item.
        renew = data_obj.get('jobLocation')

        yield {
            'job_title': job_title,
            'renew': renew,
            'posting_url': posting_url,
            'job_listing_header': job_listing_header,
            #'job_location': job_location
            #'job_descriptions_1':job_descriptions_1,
            #'job_descriptions_2':job_descriptions_2,
            #'job_descriptions_3':job_descriptions_3,
            #'job_descriptions_4':job_descriptions_4,
            #'job_descriptions_5':job_descriptions_5
            #'posted_on_date':posted_on_date
        }
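The data you need is in JSON format, but it is embedded inside JavaScript, so you need to extract it in steps: (1) get the text of the `<script>` element that assigns `window._initialData`; (2) use a regular expression to isolate the JSON object literal from that text; (3) parse the result with `json.loads`.你需要的数据是json格式但是在javascript里面,所以你需要按步骤提取: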
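After that you should be able to query it as an object.之后,您应该能够将其查询为 object。 To perform all the previous steps at once and extract the object directly, you can use something like this (here `script_text` is the text content of the `<script>` element that contains `window._initialData`):要获取前面的所有步骤并直接提取 object,您可以使用以下内容: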
data_obj = json.loads(re.search(r'window\._initialData\=(\{.+\});', script_text).group(1))
Now you can use that object to get the data you want, example:现在您可以使用该 object 来获取您想要的数据,例如:
> data_obj['jobLocation']
Arlington, TX
For the POT-Creation-Date part, for example, you'll need to format that string further, because it is stored inside a list and is not directly queryable:例如,对于 POT-Creation-Date 的部分,您需要更好地格式化该字符串,因为它位于列表中且不可直接查询:
> data_obj['localeData'][''][-1].split('\n')
['Project-Id-Version: ',
'Report-Msgid-Bugs-To: ',
'POT-Creation-Date: 2019-10-03 17:31+0900',
'PO-Revision-Date: 2019-07-11 09:46-0500',
'Last-Translator: Auto Generated <noreply@indeed.com>',
'Language-Team: English (United States) <http://example.com/weblate/projects/indeed/indeedmobile-i18n-content/en_US/>',
'Language: en_US',
'MIME-Version: 1.0',
'Content-Type: text/plain; charset=UTF-8',
'Content-Transfer-Encoding: 8bit',
'Plural-Forms: nplurals=2; plural=n != 1;',
'X-Generator: Weblate 1.8',
'']
I hope that helped you希望对你有帮助
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.