I have been trying to scrape a news site to store each article in mySQL database. My goal is to store the following data for each article on the news site: date, title, summary, link
I been trying different methods and decided after trying for a few weeks to come here on stackoverflow to get a solution to my problem. (Note: I have one code that is near to solve my problem, but it only takes out all of the items at once and not one by one so I tried a new approche and here is where I hit the wall)
SPIDER.PY
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
name = 'news'
start_urls = [
'https://www.coindesk.com/feed'
]
def parse(self, response):
for date in response.xpath('//pubDate/text()').extract():
yield WebspiderItem(date = date)
for title in response.xpath('//title/text()').extract():
yield WebspiderItem(title = title)
for summary in response.xpath('//description/text()').extract():
yield WebspiderItem(summary = summary)
for link in response.xpath('//link/text()').extract():
yield WebspiderItem(link = link)
ITEMS.PY
import scrapy
class WebspiderItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
summary = scrapy.Field()
link = scrapy.Field()
PIPELINES.PY
import mysql.connector
class WebspiderPipeline(object):
def __init__(self):
self.create_connection()
self.create_table()
def create_connection(self):
self.conn = mysql.connector.connect(
host='localhost',
user='root',
passwd='HIDDENPASSWORD',
database='news_db'
)
self.curr = self.conn.cursor()
def create_table(self):
self.curr.execute("""DROP TABLE IF EXISTS news_tb""")
self.curr.execute("""create table news_tb(
date text,
title text,
summary text,
link text
)""")
def process_item(self, item, spider):
self.store_db(item)
return item
def store_db(self, item):
self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
item['date'],
item['title'],
item['summary'],
item['link']
))
self.conn.commit()
Response Multiple of these:
2020-03-17 07:54:32 [scrapy.core.scraper] ERROR: Error processing {'link': 'https://www.coindesk.com/makerdaos-problems-are-a-textbook-case-of-governance-failure'}
Traceback (most recent call last):
File "c:\users\r\pycharmprojects\project\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 36, in process_item
self.store_db(item)
File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 41, in store_db
item['date'],
File "c:\users\r\pycharmprojects\_project\venv\lib\site-packages\scrapy\item.py", line 91, in __getitem__
return self._values[key]
KeyError:
you should yield all the data once, don't do it while on loop, python reads code from top to bottom, you yield the date first and the pipelines received it and try to find the value title, summary and link and its missing now returns KeyError
class NewsSpider(scrapy.Spider):
name = 'news'
def start_requests(self):
page = 'https://www.coindesk.com/feed'
yield scrapy.Request(url=page, callback=self.parse)
def parse(self, response):
links = response.xpath('//link/text()').extract()
for link in links:
yield scrapy.Request(url=link, callback=self.parse_contents)
def parse_contents(self, response):
url = response.url
article_title = response.xpath('//h1/text()').extract()[0]
pub_date = response.xpath('//div[@class="article-hero-datetime"]/time/@datetime').extract()[0]
description = response.xpath('//meta[@name="description"]/@content').extract()[0]
item = WebspiderItem()
item['date'] = pub_date
item['title'] = article_title
item['summary'] = description
item['link'] = url
yield item
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.