ItemLoader causes start_requests() TypeError: 'NoneType' object is not iterable

For some reason, whenever the ItemLoader is involved in the code it causes this error:

start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable

Below is the code for GetTikTokMetricsSpider.py and items.py respectively. The median values are computed fine without the ItemLoader block at the end of get_medians, but with the ItemLoader the spider never even gets into def get_medians. I tried putting the whole process, including the ItemLoader, in start_requests and it returns the same error. How would the ItemLoader be causing the error here?

GetTikTokMetricsSpider.py:

import scrapy
import json
import csv
import os
import pandas as pd
import numexpr as ne
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path


class GettiktokmetricsSpider(scrapy.Spider):
    name = 'GetTikTokMetricsSpider'
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }


    def start_requests(self):
        print("START REQUEST")
        users = self.get_users()
        print(users)
        for user in users:
            get_medians = self.get_medians(user)

    def get_medians(self, user):
        print("GET MEDIANS")
        df_counts = self.get_df_counts()
        df_counts.query(f"user == '{user}'", inplace=True)

        df_counts["view_med"] = df_counts["view_count"].median(axis=0)
        df_counts["like_med"] = df_counts["like_count"].median(axis=0)
        df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
        df_counts["share_med"] = df_counts["share_count"].median(axis=0)

        view_med = df_counts["view_med"].iloc[0]
        like_med = df_counts["like_med"].iloc[0]
        comment_med = df_counts["comment_med"].iloc[0]
        share_med = df_counts["share_med"].iloc[0]

        print(user)
        print(view_med)
        print(like_med)
        print(comment_med)
        print(share_med)

        print(type(view_med))
        print(type(like_med))
        print(type(comment_med))
        print(type(share_med))  # Works up to here without the ItemLoader block below

        il = ItemLoader(item=MedsItem())
        il.add_value("user", user)
        il.add_value("view_med", view_med)
        il.add_value("like_med", like_med)
        il.add_value("comment_med", comment_med)
        il.add_value("share_med", share_med)
        yield il.load_item()
        print(MedsItem())

    def get_users(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        url = f"{get_project_path()}/data/counts.csv"
        return url

items.py:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags

def get_count(view):
    view_count = str(view)
    if ("Share" or "share"
        or "Comment" or "comment"
        or "Like" or "like") in view_count:
        view_count = "0"
        return view_count
    if "." in view:
        view_count = view_count.replace(".", "")
    if "K" == view[-1]:
        view_count = view_count.replace("K", "000")
    if "M" == view[-1]:
        view_count = view_count.replace("M", "000000")
    return view_count

def get_med(value):
    # Ensure the median value is passed through as a string.
    if not isinstance(value, str):
        return str(value)
    return value

class CountsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor = TakeFirst())
    view_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())
    like_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())
    comment_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())
    share_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor = TakeFirst())

class MedsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    view_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    like_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    comment_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())
    share_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor = TakeFirst())

Your start_requests doesn't return or yield anything at all. Because it contains no yield, Python treats it as a regular function, its return value is always None, and Scrapy's iter(...) call then fails on it.
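
You can reproduce the failure with plain Python, outside Scrapy entirely. A minimal sketch, with a made-up function name standing in for your spider method:

def start_requests_like():
    # Does some work but never yields or returns anything,
    # so calling it just runs the body and hands back None.
    users = ["user_a", "user_b"]

start_requests = iter(start_requests_like())
# TypeError: 'NoneType' object is not iterable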

In this loop you call the get_medians method:

for user in users:
    get_medians = self.get_medians(user)

And inside the get_medians method there is a yield for the loaded item:

yield il.load_item()
print(MedsItem())

That yield is the crux: it makes get_medians a generator function. Calling a generator function never executes its body; it only creates a generator object, which here gets stored in the variable get_medians. Nothing runs until something iterates over that object, which is why your GET MEDIANS print never shows up. It also explains why the ItemLoader appears to be the culprit: adding yield il.load_item() is exactly what turned get_medians into a generator.

At this point start_requests should yield the items out of that generator. Instead the next iteration of the loop begins, the variable is overwritten, and the generator is silently discarded, so start_requests itself still contains no yield and returns None.
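
A quick illustration of that generator behaviour, again in plain Python with made-up names:

def make_items():
    print("BODY RUNS")  # not printed by the bare call below
    yield "item"

gen = make_items()  # creates a generator object; the body does not run
print(gen)          # <generator object make_items at 0x...>
print(list(gen))    # iterating it finally runs the body and collects "item"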

Adding a yield from statement to your start_requests, so that it drains the generator and passes each item on, should solve your problem.

For example:

for user in users:
    yield from self.get_medians(user)

Note that yield from iterates the generator and re-yields each loaded item; a plain yield self.get_medians(user) would hand Scrapy the generator object itself instead of the items inside it.
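
One caveat, stated as an assumption to check rather than a guarantee: start_requests is documented to produce Request objects, and depending on your Scrapy version, items yielded straight from it may be rejected. If that happens, a common workaround is to route through a single request and yield the items from its callback (the URL and the parse_medians name below are placeholders, not part of the original code):

def start_requests(self):
    # The request exists only to get us into a callback.
    yield scrapy.Request("https://example.com", callback=self.parse_medians)

def parse_medians(self, response):
    for user in self.get_users():
        yield from self.get_medians(user)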
