For some reason, whenever the ItemLoader is involved in the code, it causes this error:

start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
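The bare error is easy to reproduce outside Scrapy, which suggests start_requests is somehow returning None (a minimal sketch, not my actual spider code):

# a function with neither return nor yield returns None
def start_requests():
    pass

iter(start_requests())  # TypeError: 'NoneType' object is not iterable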
Below is the code for GetTikTokMetricsSpider.py and items.py, respectively. The median values work fine without the ItemLoader block that comes after the prints; with the ItemLoader, however, execution never even gets through to def get_medians. I tried putting the whole process, including the ItemLoader, into start_requests, and it returns the same error. How would the ItemLoader be causing the error here? Here is the code.
GetTikTokMetricsSpider.py:
import scrapy
import json
import csv
import os
import pandas as pd
import numexpr as ne
from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem
from TikTokLocalPaths import get_project_path
class GettiktokmetricsSpider(scrapy.Spider):
    name = 'GetTikTokMetricsSpider'
    custom_settings = {
        "FEEDS": {
            "data/metrics.csv": {
                "format": "csv",
                "overwrite": True
            }
        },
        "FEED_EXPORT_FIELDS": [
            "user", "view_med", "like_med", "comment_med", "share_med"
        ],
    }

    def start_requests(self):
        print("START REQUEST")
        users = self.get_users()
        print(users)
        for user in users:
            get_medians = self.get_medians(user)

    def get_medians(self, user):
        print("GET MEDIANS")
        df_counts = self.get_df_counts()
        df_counts.query(f"user == '{user}'", inplace=True)
        df_counts["view_med"] = df_counts["view_count"].median(axis=0)
        df_counts["like_med"] = df_counts["like_count"].median(axis=0)
        df_counts["comment_med"] = df_counts["comment_count"].median(axis=0)
        df_counts["share_med"] = df_counts["share_count"].median(axis=0)
        view_med = df_counts["view_med"].iloc[0]
        like_med = df_counts["like_med"].iloc[0]
        comment_med = df_counts["comment_med"].iloc[0]
        share_med = df_counts["share_med"].iloc[0]
        print(user)
        print(view_med)
        print(like_med)
        print(comment_med)
        print(share_med)
        print(type(view_med))
        print(type(like_med))
        print(type(comment_med))
        print(type(share_med))  # works til here without the il lines below
        il = ItemLoader(item=MedsItem())
        il.add_value("user", user)
        il.add_value("view_med", view_med)
        il.add_value("like_med", like_med)
        il.add_value("comment_med", comment_med)
        il.add_value("share_med", share_med)
        yield il.load_item()
        print(MedsItem())

    def get_users(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        users = df_counts["user"].unique()
        return users

    def get_df_counts(self):
        counts_url = self.get_csv_counts_url()
        df_counts = pd.read_csv(counts_url)
        return df_counts

    def get_csv_counts_url(self):
        url = f"{get_project_path()}/data/counts.csv"
        return url
items.py:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags


def get_count(view):
    view_count = str(view)
    # a bare label such as "Share"/"Comment"/"Like" means no numeric count was scraped
    if any(word in view_count for word in
           ("Share", "share", "Comment", "comment", "Like", "like")):
        return "0"
    # expand abbreviated counts: "1.5K" -> "1500", "2M" -> "2000000"
    if view_count.endswith("K"):
        return str(int(float(view_count[:-1]) * 1000))
    if view_count.endswith("M"):
        return str(int(float(view_count[:-1]) * 1000000))
    return view_count


def get_med(value):
    # the medians arrive as floats; coerce anything non-string to str
    if not isinstance(value, str):
        return str(value)
    return value


class CountsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(remove_tags), output_processor=TakeFirst())
    view_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())
    like_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())
    comment_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())
    share_count = scrapy.Field(input_processor=MapCompose(remove_tags, get_count), output_processor=TakeFirst())


class MedsItem(scrapy.Item):
    user = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    view_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    like_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    comment_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
    share_med = scrapy.Field(input_processor=MapCompose(get_med), output_processor=TakeFirst())
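Tested in a plain Python shell, the loader itself behaves as expected (a minimal sketch using the same MedsItem as above; the values are made up):

from scrapy.loader import ItemLoader
from ScrapeTikTok.items import MedsItem

il = ItemLoader(item=MedsItem())
il.add_value("user", "some_user")  # hypothetical test values
il.add_value("view_med", 1500.0)   # get_med coerces the float to "1500.0"
print(il.load_item())              # {'user': 'some_user', 'view_med': '1500.0'}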
Your start_requests doesn't return or yield at all, so its return value is always None, and that is exactly what Scrapy's iter(self.spider.start_requests()) call chokes on.
In this line you merely call the get_medians method:

for user in users:
    get_medians = self.get_medians(user)
But get_medians contains a yield:

yield il.load_item()
print(MedsItem())

That makes it a generator function. Calling self.get_medians(user) therefore doesn't execute its body at all; it only creates a generator object, which is stored in the get_medians variable and then thrown away when the next iteration of the loop overwrites it. Nothing inside get_medians ever runs, which is why none of its prints appear, and start_requests, having no yield of its own, is an ordinary function that returns None.
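You can see this generator behaviour in a plain Python session, independent of Scrapy (a minimal sketch):

def gen():
    print("running")  # does not print when gen() is called
    yield "item"

g = gen()       # creates a generator object; the body has not run
print(g)        # <generator object gen at 0x...>
print(list(g))  # only now does the body run: prints "running", then ['item']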
Adding a yield to your start_requests, delegating to the generator, should solve your problem. For example:

for user in users:
    yield from self.get_medians(user)
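yield from is shorthand for looping over the generator and yielding each item it produces, so the explicit form works equally well:

for user in users:
    for item in self.get_medians(user):
        yield item

Either way, start_requests now contains a yield itself, so Scrapy receives a generator instead of None.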