简体   繁体   中英

Cannot extract JSON items from a webpage using Scrapy

I am extracting data from indeed.com for a data science project. Although I am able to scrape various portions of the page successfully, I am having trouble scraping items from the JSON portion of the page.

Does anyone know how I would extract the items below from the URL? >>> view-source:https://www.indeed.com/viewjob?jk=41abec7fde3513dc&tk=1dn0mslbr352v000&from=serp&vjs=3&advn=9434814581076032&adid=197003786&sjdu=BbcXv7z69Xez4bal0Fx7iYB6jxzlBG3p6CfmfgjyGDErM4mqXgOsfEsOF5maJ2GRnKJsHskFl8aEbb4LlD5LibXOuIs0dzzHfVCmKB00C2c43rDVhEZX_8Zmg4zqEyqG5LEfQjRfoyOhULxXHTMitWOUjMOdLRt367-ZewSzfkqUSnPzHungl7uY7NcfOFLy .

Items to be extracted are listed below: `\nPOT-Creation-Date:`, `\nPO-Revision-Date:`, and `"jobLocation":"Arlington, TX"`.

A sample script I am running is below:


import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
import boto3

class JobsSpider1(scrapy.Spider):
    """Crawl Indeed search results and emit one item per job posting.

    ``parse`` walks the search-result pages and schedules one request per
    posting; ``parse_page`` extracts fields from the posting's detail page.
    Items are written to ``me_test.json`` via the feed settings below.
    """

    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=\"owner+operator\"+\"truck\"&l=augusta"]

    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'me_test.json',
    }

    def parse(self, response):
        """Yield a request for every posting on a results page, then paginate.

        Args:
            response: The search-results page response.

        Yields:
            ``Request`` objects for each posting (handled by ``parse_page``)
            and, when present, for the next results page (handled by
            ``parse``).
        """
        for job in response.xpath('//div[@class="title"]'):
            posting_link = job.xpath('a//@href').extract_first()
            if not posting_link:
                # extract_first() returns None when nothing matches; a title
                # block without a link cannot be followed, so skip it instead
                # of failing on str + None.
                continue

            title = job.xpath('a//@title').extract_first()
            posting_url = "https://indeed.com" + posting_link

            yield Request(
                posting_url,
                callback=self.parse_page,
                meta={'title': title, 'posting_url': posting_url},
            )

        relative_next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if relative_next_url:
            # Only paginate when the page advertises a next page; without this
            # guard a page lacking <link rel="next"> raised TypeError.
            yield Request("https://indeed.com" + relative_next_url, callback=self.parse)

    def parse_page(self, response):
        """Extract the posting fields from a job detail page.

        Args:
            response: The detail page response; ``response.meta`` carries the
                ``title`` and ``posting_url`` set by ``parse``.

        Yields:
            A dict with the posting's title, URL, section header, location,
            description and footer text. Any field whose XPath matches nothing
            is ``None``.
        """
        posting_url = response.meta.get('posting_url')
        job_title = response.meta.get('title')

        job_descriptions = response.xpath('//*[@class="jobsearch-jobDescriptionText"]/ul').extract_first()
        job_listing_header = response.xpath('//*[@class="jobSectionHeader"]/ul').extract_first()
        posted_on_date = response.xpath('//*[@class="jobsearch-JobMetadataFooter"]/text()').extract_first()
        job_location = response.xpath('//*[@class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs  jobsearch-DesktopStickyContainer-companyrating"]/div[3]/text()').extract_first()

        yield {
            'job_title': job_title,
            'posting_url': posting_url,
            'job_listing_header': job_listing_header,
            'job_location': job_location,
            'job_descriptions': job_descriptions,
            'posted_on_date': posted_on_date,
        }


```````````````````````````````````````````````````````````````
######################################################
############# UPDATED CODE #########################
############# UPDATED CODE ###########################
#############################################

import time
import os, sys
import json
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
import boto3
from scrapy.loader.processors import Join
import re




class JobsSpider1(scrapy.Spider):
    """Crawl Indeed search results and emit one item per job posting.

    ``parse`` walks the search-result pages and schedules one request per
    posting; ``parse_page`` reads the posting's detail page, including the
    JSON object that the page assigns to ``window._initialData`` inside a
    ``<script>`` element. Items are written to ``test.csv`` via the feed
    settings below.
    """

    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=\"owner+operator\"+\"truck\"&l=augusta"]

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'test.csv',
    }

    def parse(self, response):
        """Yield a request for every posting on a results page, then paginate.

        Args:
            response: The search-results page response.

        Yields:
            ``Request`` objects for each posting (handled by ``parse_page``)
            and, when present, for the next results page (handled by
            ``parse``).
        """
        for job in response.xpath('//div[@class="title"]'):
            posting_link = job.xpath('a//@href').extract_first()
            if not posting_link:
                # extract_first() returns None when nothing matches; skip the
                # entry instead of failing on str + None.
                continue

            title = job.xpath('a//@title').extract_first()
            posting_url = "https://indeed.com" + posting_link

            yield Request(
                posting_url,
                callback=self.parse_page,
                meta={'title': title, 'posting_url': posting_url},
            )

        relative_next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if relative_next_url:
            # Only paginate when the page advertises a next page; without this
            # guard a page lacking <link rel="next"> raised TypeError.
            yield Request("https://indeed.com" + relative_next_url, callback=self.parse)

    def _load_initial_data(self, response):
        """Return the ``window._initialData`` object as a dict, or ``None``.

        ``None`` is returned (and a warning logged) when the page has no
        script containing the assignment, when the assignment does not match
        the expected ``window._initialData={...};`` form, or when the captured
        text is not valid JSON.
        """
        # The original code referenced an undefined ``script_text``; take it
        # from the script element that contains the assignment.
        script_text = response.xpath(
            '//script[contains(., "window._initialData")]/text()'
        ).extract_first()
        if not script_text:
            self.logger.warning("No window._initialData script on %s", response.url)
            return None

        match = re.search(r'window\._initialData\=(\{.+\});', script_text)
        if match is None:
            self.logger.warning("window._initialData not parseable on %s", response.url)
            return None

        try:
            return json.loads(match.group(1))
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; the greedy regex can
            # capture more than the object literal on unusual pages.
            self.logger.warning("Invalid window._initialData JSON on %s", response.url)
            return None

    def parse_page(self, response):
        """Extract the posting fields from a job detail page.

        Args:
            response: The detail page response; ``response.meta`` carries the
                ``title`` and ``posting_url`` set by ``parse``.

        Yields:
            A dict with keys ``job_title``, ``renew`` (the ``jobLocation``
            value from the page's embedded JSON, or ``None`` when it cannot
            be read), ``posting_url`` and ``job_listing_header``.
        """
        posting_url = response.meta.get('posting_url')
        job_title = response.meta.get('title')

        job_listing_header = response.xpath('//*[@class="jobSectionHeader"]/b').extract_first()

        data_obj = self._load_initial_data(response)
        # .get() keeps the item flowing when the key is absent instead of
        # aborting the callback with KeyError.
        renew = data_obj.get('jobLocation') if isinstance(data_obj, dict) else None

        yield {
            'job_title': job_title,
            'renew': renew,
            'posting_url': posting_url,
            'job_listing_header': job_listing_header,
        }

The data you need is in JSON format but embedded inside JavaScript, so you need to extract it in steps:

  • find the right script element
  • parse the javascript code or filter it with regex
  • convert it to json

After that you should be able to query it as an object. To perform all of the previous steps at once and get the object directly, you can use something like this:

# script_text must already hold the text of the <script> element that assigns
# window._initialData (the "right script element" from the steps above).
# group(1) is the captured {...} object literal, which json.loads then parses.
data_obj = json.loads(re.search(r'window\._initialData\=(\{.+\});', script_text).group(1))

Now you can use that object to get the data you want, for example:

> data_obj['jobLocation']

Arlington, TX 

For the part of POT-Creation-Date for example, you'll need to better format that string as it is inside a list and not directly queryable:

> data_obj['localeData'][''][-1].split('\n')

['Project-Id-Version: ',
 'Report-Msgid-Bugs-To: ',
 'POT-Creation-Date: 2019-10-03 17:31+0900',
 'PO-Revision-Date: 2019-07-11 09:46-0500',
 'Last-Translator: Auto Generated <noreply@indeed.com>',
 'Language-Team: English (United States) <http://example.com/weblate/projects/indeed/indeedmobile-i18n-content/en_US/>',
 'Language: en_US',
 'MIME-Version: 1.0',
 'Content-Type: text/plain; charset=UTF-8',
 'Content-Transfer-Encoding: 8bit',
 'Plural-Forms: nplurals=2; plural=n != 1;',
 'X-Generator: Weblate 1.8',
 '']

I hope that helped you

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM