[英]PYTHON Scrapy | inserting into MySQL from items
我一直在嘗試抓取一個新聞站點來將每篇文章存儲在 mySQL 數據庫中。 我的目標是為新聞網站上的每篇文章存儲以下數據:日期、標題、摘要、鏈接
我嘗試了各種不同的方法,折騰了幾週之後,決定到 Stack Overflow 上尋求解決方案。(注意:我有一份可以解決問題的代碼,但它是一次取出所有項目,而不是逐一取出,所以我嘗試了一種新的方法,而這正是我碰壁的地方)
爬蟲(Spider):
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
    """Spider that reads the CoinDesk feed and emits one item per extracted value.

    NOTE: every item yielded by ``parse`` has exactly one field populated
    (``date``, ``title``, ``summary`` or ``link``); the four values that
    belong to one article are never combined into a single item.
    """

    name = 'news'
    start_urls = ['https://www.coindesk.com/feed']

    def parse(self, response):
        """Yield a single-field ``WebspiderItem`` for each matching text node.

        All ``date`` items are produced first, followed by all ``title``
        items, then ``summary`` items and finally ``link`` items.
        """
        # (item field, XPath expression) pairs, in emission order.
        field_queries = (
            ('date', '//pubDate/text()'),
            ('title', '//title/text()'),
            ('summary', '//description/text()'),
            ('link', '//link/text()'),
        )
        for field_name, query in field_queries:
            for text in response.xpath(query).extract():
                yield WebspiderItem(**{field_name: text})
items.py
import scrapy
class WebspiderItem(scrapy.Item):
    """Container for the four values collected for a news article."""

    # Publication date of the article.
    date = scrapy.Field()
    # Article headline.
    title = scrapy.Field()
    # Short description / summary text.
    summary = scrapy.Field()
    # URL of the article.
    link = scrapy.Field()
pipelines.py
import mysql.connector
class WebspiderPipeline(object):
    """Item pipeline that writes every scraped item into the ``news_tb`` table.

    A MySQL connection and cursor are opened when the pipeline is
    instantiated and released again in ``close_spider``.
    """

    def __init__(self):
        """Open the database connection and (re)create the target table."""
        self.create_connection()
        self.create_table()

    def create_connection(self):
        """Connect to the local ``news_db`` database and create a cursor."""
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='HIDDENPASSWORD',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        """Drop any existing ``news_tb`` table and create an empty one.

        NOTE: this runs on every instantiation, so rows stored by an earlier
        run are discarded.
        """
        self.curr.execute("""DROP TABLE IF EXISTS news_tb""")
        self.curr.execute("""create table news_tb(
            date text,
            title text,
            summary text,
            link text
            )""")

    def process_item(self, item, spider):
        """Store ``item`` in the database and return it unchanged.

        Raises:
            KeyError: if ``item`` lacks any of ``date``, ``title``,
                ``summary`` or ``link``.
        """
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row built from the item's four fields and commit it."""
        # Parameterized query: the driver handles quoting of the values.
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']
        ))
        self.conn.commit()

    def close_spider(self, spider):
        """Release the cursor and the connection once the spider finishes.

        Without this hook the connection opened in ``create_connection`` is
        never closed explicitly.
        """
        self.curr.close()
        self.conn.close()
錯誤輸出(以下錯誤重複出現多次):
2020-03-17 07:54:32 [scrapy.core.scraper] ERROR: Error processing {'link': 'https://www.coindesk.com/makerdaos-problems-are-a-textbook-case-of-governance-failure'}
Traceback (most recent call last):
File "c:\users\r\pycharmprojects\project\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 36, in process_item
self.store_db(item)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 41, in store_db
item['date'],
File "c:\users\r\pycharmprojects\_project\venv\lib\site-packages\scrapy\item.py", line 91, in __getitem__
return self._values[key]
KeyError: 'date'
你應該一次產生(yield)包含所有數據的項目,而不要在各個循環中分別產生。Python 從上到下執行代碼:你首先產生了只含日期的項目,管道收到它後嘗試讀取標題、摘要和鏈接,但這些值並不存在,於是拋出 KeyError。
class NewsSpider(scrapy.Spider):
    """Spider that follows every link in the CoinDesk feed and yields one
    complete ``WebspiderItem`` (date, title, summary, link) per article page.
    """

    name = 'news'

    def start_requests(self):
        """Request the feed; its response is handled by ``parse``."""
        page = 'https://www.coindesk.com/feed'
        yield scrapy.Request(url=page, callback=self.parse)

    def parse(self, response):
        """Schedule a request for every ``<link>`` text found in the feed."""
        for link in response.xpath('//link/text()').extract():
            yield scrapy.Request(url=link, callback=self.parse_contents)

    def parse_contents(self, response):
        """Build a fully populated item from a single article page.

        Pages missing the headline, the publication timestamp or the meta
        description are skipped with a warning; no item is yielded for them.
        """
        # extract_first() returns None when nothing matches, whereas
        # extract()[0] would raise IndexError and abort the callback.
        article_title = response.xpath('//h1/text()').extract_first()
        pub_date = response.xpath(
            '//div[@class="article-hero-datetime"]/time/@datetime'
        ).extract_first()
        description = response.xpath(
            '//meta[@name="description"]/@content'
        ).extract_first()

        if article_title is None or pub_date is None or description is None:
            self.logger.warning(
                'Skipping %s: title, date or description not found',
                response.url,
            )
            return

        item = WebspiderItem()
        item['date'] = pub_date
        item['title'] = article_title
        item['summary'] = description
        item['link'] = response.url
        yield item
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.