Python Scrapy | inserting into MySQL from items

I have been trying to scrape a news site and store each article in a MySQL database. My goal is to store the following data for each article: date, title, summary, link.

I have been trying different methods, and after a few weeks I decided to come here to Stack Overflow for a solution. (Note: I have code that comes close to solving my problem, but it extracts all of the items at once rather than one by one, so I tried a new approach, and this is where I hit a wall.)

SPIDER.PY

    import scrapy
    from ..items import WebspiderItem


    class NewsSpider(scrapy.Spider):
        name = 'news'
        start_urls = [
            'https://www.coindesk.com/feed'
        ]

        def parse(self, response):
            for date in response.xpath('//pubDate/text()').extract():
                yield WebspiderItem(date=date)

            for title in response.xpath('//title/text()').extract():
                yield WebspiderItem(title=title)

            for summary in response.xpath('//description/text()').extract():
                yield WebspiderItem(summary=summary)

            for link in response.xpath('//link/text()').extract():
                yield WebspiderItem(link=link)

ITEMS.PY

    import scrapy


    class WebspiderItem(scrapy.Item):
        date = scrapy.Field()
        title = scrapy.Field()
        summary = scrapy.Field()
        link = scrapy.Field()

PIPELINES.PY

    import mysql.connector


    class WebspiderPipeline(object):

        def __init__(self):
            self.create_connection()
            self.create_table()

        def create_connection(self):
            self.conn = mysql.connector.connect(
                host='localhost',
                user='root',
                passwd='HIDDENPASSWORD',
                database='news_db'
            )
            self.curr = self.conn.cursor()

        def create_table(self):
            self.curr.execute("""DROP TABLE IF EXISTS news_tb""")
            self.curr.execute("""CREATE TABLE news_tb(
                            date text,
                            title text,
                            summary text,
                            link text
                            )""")

        def process_item(self, item, spider):
            self.store_db(item)
            return item

        def store_db(self, item):
            self.curr.execute("""INSERT INTO news_tb VALUES (%s, %s, %s, %s)""", (
                item['date'],
                item['title'],
                item['summary'],
                item['link']
            ))
            self.conn.commit()
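
(The pipeline is enabled in settings.py via ITEM_PIPELINES, which is why process_item appears in the traceback below. Scrapy also calls an optional close_spider() hook on pipelines when the crawl ends; a minimal sketch of using it to release the connection:)

        def close_spider(self, spider):
            # Called once when the spider finishes; release the cursor and connection.
            self.curr.close()
            self.conn.close()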

Running the spider produces multiple errors like this one:

2020-03-17 07:54:32 [scrapy.core.scraper] ERROR: Error processing {'link': 'https://www.coindesk.com/makerdaos-problems-are-a-textbook-case-of-governance-failure'}
Traceback (most recent call last):
  File "c:\users\r\pycharmprojects\project\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 36, in process_item
    self.store_db(item)
  File "C:\Users\r\PycharmProjects\Project\webspider v3 RSS\webspider\pipelines.py", line 41, in store_db
    item['date'],
  File "c:\users\r\pycharmprojects\_project\venv\lib\site-packages\scrapy\item.py", line 91, in __getitem__
    return self._values[key]
KeyError:

You should yield all the data at once; don't yield inside each loop. Python reads code from top to bottom: you yield the date first, the pipeline receives an item that contains only the date, then tries to look up title, summary, and link, finds them missing, and raises a KeyError.

    import scrapy
    from ..items import WebspiderItem


    class NewsSpider(scrapy.Spider):
        name = 'news'

        def start_requests(self):
            page = 'https://www.coindesk.com/feed'
            yield scrapy.Request(url=page, callback=self.parse)

        def parse(self, response):
            # Collect the article URLs from the feed, then visit each one.
            links = response.xpath('//link/text()').extract()
            for link in links:
                yield scrapy.Request(url=link, callback=self.parse_contents)

        def parse_contents(self, response):
            # Build one complete item per article so the pipeline
            # always receives all four fields together.
            url = response.url
            article_title = response.xpath('//h1/text()').extract()[0]
            pub_date = response.xpath('//div[@class="article-hero-datetime"]/time/@datetime').extract()[0]
            description = response.xpath('//meta[@name="description"]/@content').extract()[0]
            item = WebspiderItem()
            item['date'] = pub_date
            item['title'] = article_title
            item['summary'] = description
            item['link'] = url

            yield item
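
If you prefer to stay on the feed page instead of requesting each article, you can also iterate over the feed's <item> nodes, each of which groups all four fields; a minimal sketch, assuming a standard RSS 2.0 feed:

        def parse(self, response):
            # Each <item> node in an RSS 2.0 feed groups pubDate, title,
            # description and link for one article, so a complete
            # WebspiderItem can be yielded per node.
            for article in response.xpath('//item'):
                yield WebspiderItem(
                    date=article.xpath('./pubDate/text()').get(),
                    title=article.xpath('./title/text()').get(),
                    summary=article.xpath('./description/text()').get(),
                    link=article.xpath('./link/text()').get(),
                )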
