简体   繁体   中英

Scrapy crawler not yielding any Data

I am facing a weird issue here, crawler running without any errors as well as without yielding any data.

Here is the starter code for one page:


# zillow scraper class
class ZillowScraper(scrapy.Spider):
    # scraper/spider name
    name = "zillow"

    # custom_settings = {
    #     "FEED_FORMAT": "csv",
    #     "FEED_URI": "zillow_data.csv",
    # }

    # base URL
    base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"

    # custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    }

    # string query parameters
    params = {
        "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
    }

    def __init__(self):
        self.zpid = []

    def start_requests(self):
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_links
        )

Here is parsing links callback in which I am getting the data from json and getting the id's from json and appending it in the class variable list to use it to compare the id with listing id:

def parse_links(self, response):
        results_selector = response.css(
            'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
        ).get()
        clean_json = (
            results_selector.replace(
                '<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
                "",
            )
            .replace("</script>", "")
            .replace("-->", "")
        )
        parsed_data = json.loads(clean_json)
        data = parsed_data["cat1"]["searchResults"]["listResults"]
        for zid in data:
            self.zpid.append(zid)

        for listing in data:
            yield scrapy.Request(
                url=listing["detailUrl"],
                headers=self.headers,
                callback=self.parse_detail,
            )

Here is the final callback parse details in this function again I am getting the data from the json. First I am doing some url parsing to get the id from the url to compare it with self.zpid list and then I am running the for loop over that self.zpid list and checking if listing_id(url id) is equal to the self.zpid list id's. Then generating keys dynamically with the help of the id to get the detailed data:

def parse_detail(self, response):
        item = {}
        listing_url = response.url.split("/")
        parse_id = [u for u in listing_url if u]
        listing_id = parse_id[4][:8]

        for zid in self.zpid:
            if zid == listing_id:
                print(zid)

                api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
                clean_json = api_endpoint.replace(
                    '<script id="hdpApolloPreloadedData" type="application/json">', ""
                ).replace("</script>", "")
                parsed_data = json.loads(clean_json)
                sub_data = json.loads(parsed_data["apiCache"])

                item["date"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["datePostedString"]
                item["home_status"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["hdpTypeDimension"]
                item["home_type"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["homeType"]
                item["sqft"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid": {zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["livingArea"]
                item["street_address"] = sub_data[
                    f'VariantQuery{{"zpid":{zid},"altId":null}}'
                ]["property"]["streetAddress"]
                item["city"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                    "property"
                ]["city"]
                item["state"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                    "property"
                ]["state"]
                item["zipcode"] = sub_data[
                    f'VariantQuery{{"zpid":{zid},"altId":null}}'
                ]["property"]["zipcode"]
                item["price"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                    "property"
                ]["price"]
                item["zestimate"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["zestimate"]
                item["parcel_number"] = sub_data[
                    f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
                ]["property"]["resoFacts"]["parcelNumber"]

        yield item


# main driver
if __name__ == "__main__":
    # run scraper
    process = CrawlerProcess()
    process.crawl(ZillowScraper)
    process.start()

Right now crawler is running hitting the urls getting 200 response and everything but not yielding the data. What I am doing wrong here?

I tried running the crawler without comparing id's it outputs the keyerror which makes sense but apart from that crawler is just running and hittng the urls getting 200 response but empty dictionaries. I tried

response.follow 

instead of initiating

scrapy.Request 

but no output just {} empty dictionaries.

I am expecting:

{'date': 2022-03-11, 'home_status': 'For sale', 'home_type': 'Residential', 'sqft': '2,249', 'street_address': '659 Erskine Dr', 'city': 'Pacific Palisades', 'state': 'CA', 'zipcode': '90272', 'price': '$2,995,000', 'zestimate': '$3,356,900', 'parcel_number': 4413016022}

2022-03-24 01:04:17 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 01:04:17 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 54014,
 'downloader/request_count': 41,
 'downloader/request_method_count/GET': 41,
 'downloader/response_bytes': 9157579,
 'downloader/response_count': 41,
 'downloader/response_status_count/200': 41,
 'elapsed_time_seconds': 15.943654,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 3, 23, 20, 4, 17, 44889),
 'httpcompression/response_bytes': 49582733,
 'httpcompression/response_count': 41,
 'item_scraped_count': 40,
 'log_count/DEBUG': 90,
 'log_count/INFO': 10,
 'memusage/max': 54341632,
 'memusage/startup': 54341632,
 'request_depth_max': 1,
 'response_received_count': 41,
 'scheduler/dequeued': 41,
 'scheduler/dequeued/memory': 41,
 'scheduler/enqueued': 41,
 'scheduler/enqueued/memory': 41,
 'start_time': datetime.datetime(2022, 3, 23, 20, 4, 1, 101235)}
2022-03-24 01:04:17 [scrapy.core.engine] INFO: Spider closed (finished)

You have the same problem in many places.

First place

if zid == listing_id:

listing_id is a number but zid is a dictionary.

You have to use ["id"] to get number from dictionary

if zid["id"] == listing_id:

And later the same problem in all keys with "zpid":{zid} - you need "zpid":{zid["id"]}

In one key you have also extra space "zpid": {zid} which you have to remove.


EDIT:

Another small problem - you yield item outside if but sometimes it can't find if zid["id"] == listing_id: and it generate empty row in file. You should yield inside if .


BTW:

Frankly, I don't like idea of list self.zpid because it has to search on all values in list. And code may run on many workers and they may have separated self.zpid and they may not find element on this list. Standard method is to send value to next function using

Request(... , meta={"data": zid})

and callback gets it as

zid = response.meta["data"]

But newest scrapy can send it as parameter for callback

Request(... , cb_kwars={"data": zid})

and callback gets it as argument in

def parse_detail(self, response, data):

Full working code with other changes

import scrapy
import json


class ZillowScraper(scrapy.Spider):

    name = "zillow"

    # custom_settings = {
    #     "FEED_FORMAT": "csv",
    #     "FEED_URI": "zillow_data.csv",
    # }

    # base URL
    base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"

    # custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    }

    # string query parameters
    params = {
        "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
    }

    def start_requests(self):
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_links
        )

    def parse_links(self, response):
        print('[parse_links] url:', response.url)
        
        results_selector = response.css(
            'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
        ).get()
        
        clean_json = (
            results_selector.replace(
                '<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
                "",
            )
            .replace("</script>", "")
            .replace("-->", "")
        )
        
        parsed_data = json.loads(clean_json)
        data = parsed_data["cat1"]["searchResults"]["listResults"]

        for listing in data:
            yield scrapy.Request(
                url=listing["detailUrl"],
                headers=self.headers,
                callback=self.parse_detail,
                meta={'data': listing}
            )

    def parse_detail(self, response):
        print('[parse_detail] url:', response.url)

        listing_url = response.url.split("/")
        parse_id = [u for u in listing_url if u]
        
        listing_id = parse_id[4][:8]
        zid = response.meta['data']        

        #print('listing_id:', listing_id)
        #print("zid['id']:", zid['id'])
        
        if zid['id'] == listing_id:

            api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
            
            clean_json = api_endpoint.replace(
                '<script id="hdpApolloPreloadedData" type="application/json">', ""
            ).replace("</script>", "")
            
            parsed_data = json.loads(clean_json)
            sub_data = json.loads(parsed_data["apiCache"])
            
            id_ = zid['id']
            
            key_1 = f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{id_},"contactFormRenderParameter":{{"zpid":{id_},"platform":"desktop","isDoubleScroll":true}}}}' 
            key_2 = f'VariantQuery{{"zpid":{id_},"altId":null}}'

            properties_1 = sub_data[key_1]["property"]
            properties_2 = sub_data[key_2]["property"]

            item = {}
            
            item["date"]        = properties_1["datePostedString"]
            item["home_status"] = properties_1["hdpTypeDimension"]
            item["home_type"]   = properties_1["homeType"]
            item["sqft"]        = properties_1["livingArea"]
            
            item["street_address"] = properties_2["streetAddress"]
            item["city"]           = properties_2["city"]
            item["state"]          = properties_2["state"]
            item["zipcode"]        = properties_2["zipcode"]
            item["price"]          = properties_2["price"]
            
            item["zestimate"]     = properties_1["zestimate"]
            item["parcel_number"] = properties_1["resoFacts"]["parcelNumber"]

            yield item
        
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ZillowScraper)
c.start() 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM