简体   繁体   English

根据来自 scrapy 的信号更新主线程内的 PyQt5 Gui

[英]Update PyQt5 Gui inside a main thread based on signal from scrapy

I have a very basic spider that looks like the followall spider from scrapy testspiders.我有一个非常基本的蜘蛛,它看起来像来自 scrapy 测试蜘蛛的跟随蜘蛛。

import re

import scrapy.signals
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
from six.moves.urllib.parse import urlparse

from page import Page


class ZenSpider(scrapy.Spider):
    """Spider that follows the links found on each page and emits one Page per response.

    The start URL is taken from the ``url`` or ``domain`` keyword argument
    (falling back to ``http://scrapinghub.com/``), and ``allowed_domains`` is
    set to that URL's hostname with a leading ``www.`` removed.
    """

    name = 'followall'
    # Per-spider Scrapy settings: page-count limit and a CSV feed named items.csv.
    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 2,
        "FEEDS": {
            "items.csv": {"format": "csv"},
        },
    }

    def __init__(self, **kw):
        """Configure the start URL, allowed domain and link extractor.

        Args:
            **kw: Spider arguments. ``url`` (preferred) or ``domain`` selects
                the start page; a value without an ``http://`` or ``https://``
                prefix is wrapped as ``http://<value>/``. All keyword
                arguments are also forwarded to ``scrapy.Spider``.
        """
        super().__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith(('http://', 'https://')):
            url = 'http://%s/' % url
        self.url = url
        # Strip a leading "www." so the bare domain is what gets whitelisted.
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()

    def start_requests(self):
        """Return the single initial request for ``self.url`` (never dupe-filtered)."""
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        # The item comes first, followed by one request per extracted link.
        return [self._get_item(response), *self._extract_requests(response)]

    def _get_item(self, response):
        """Build the Page item for ``response`` (url, body size, status, title, description)."""
        item = Page(
            url=response.url,
            size=str(len(response.body)),
            status=response.status,
            # content_type=response.request.headers.get('Content-Type'),
            # encoding=response.request.headers.get('encoding'),
            # referer=response.request.headers.get('Referer'),
        )
        self._set_title(item, response)
        self._set_description(item, response)
        return item

    def _extract_requests(self, response):
        """Return a list of follow-up requests, one per link; empty for non-HTML responses."""
        if not isinstance(response, HtmlResponse):
            return []
        links = self.link_extractor.extract_links(response)
        return [Request(link.url, callback=self.parse) for link in links]

    def _set_title(self, page, response):
        """Store the first ``<title>`` text node in ``page['title']``, if the response is HTML."""
        if isinstance(response, HtmlResponse):
            title = response.xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_description(self, page, response):
        """Store the first meta-description content in ``page['description']``, if present."""
        # NOTE(review): requires Page to declare a 'description' field;
        # scrapy.Item rejects undeclared keys with KeyError.
        if isinstance(response, HtmlResponse):
            description = response.xpath("//meta[@name='description']/@content").extract()
            if description:
                page['description'] = description[0]

I am calling this spider from the script below.我从下面的脚本中调用这个蜘蛛。 The spider is run using the CrawlerRunner class. Each time it scrapes an item, the item_scraped signal (connected with p.signals.connect) calls the crawler_results method, which prints a message.蜘蛛使用 CrawlerRunner 类运行。每次抓取到一个项目时,item_scraped 信号(通过 p.signals.connect 连接)会调用 crawler_results 方法并打印一条消息。

As far as I understand, I cannot move the crawling into its own class, because then the signal won't work with PyQt5.据我了解,我无法将爬行移动到它自己的类中,因为这样信号就无法与 PyQt5 一起使用。

import scrapy
from PyQt5 import QtWidgets, QtCore, QtGui
from PyQt5.QtCore import QRunnable, pyqtSlot, QThread, pyqtSignal, QTimer
from PyQt5.QtWidgets import QTableWidgetItem, QLabel
from scrapy import signals
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from twisted.internet import reactor
from scrapy.utils.log import configure_logging

from Layout import Ui_MainWindow
from ZenSpider import ZenSpider


class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window that starts a ZenSpider crawl when the push button is pressed."""

    def __init__(self, parent=None):
        """Build the generated UI and connect the push button.

        Args:
            parent: Optional parent widget, forwarded to ``QMainWindow``.
        """
        # Forward ``parent`` so the argument is honoured instead of dropped.
        super(MainWindow, self).__init__(parent)

        self.setupUi(self)
        self.pushButton.pressed.connect(self.on_url_entered)

    def crawler_results(self, item):
        """Receive a scraped item from the ``item_scraped`` signal; only prints a marker."""
        print("SCRAPED AN ITEM")
        ##Do Something here ##

    def on_url_entered(self):
        """Start a crawl of google.com.au and route scraped items to ``crawler_results``."""
        # global userInput
        # userInput = self.urlbar.text()
        configure_logging()
        runner = CrawlerRunner()
        runner.crawl(ZenSpider, domain="google.com.au")
        for crawler in runner.crawlers:
            crawler.signals.connect(self.crawler_results, signal=signals.item_scraped)
        # NOTE(review): reactor.run() does not return until the reactor is
        # stopped, so the Qt event loop cannot repaint the window while this
        # slot is executing. A reactor integrated with the Qt event loop is
        # needed for live GUI updates.
        reactor.run()

if __name__ == "__main__":
    # Create the Qt application, show the main window, then run the Qt event loop.
    qt_app = QtWidgets.QApplication([])
    window = MainWindow()
    window.show()
    qt_app.exec_()

I have a layout with a simple QTableWidget and a pushbutton我有一个简单的 QTableWidget 和一个按钮的布局

# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'basic.ui'
#
# Created by: PyQt5 UI code generator 5.14.2
#
# WARNING! All changes made in this file will be lost!


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Generated UI definition: a one-column table, a push button and a status bar."""

    def setupUi(self, MainWindow):
        """Create the child widgets on ``MainWindow`` and store them as attributes of ``self``."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(1034, 803)
        # Central widget that parents the table and the button.
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        # Table with a single column and no rows initially.
        self.tableWidget = QtWidgets.QTableWidget(self.centralwidget)
        self.tableWidget.setGeometry(QtCore.QRect(140, 200, 831, 401))
        self.tableWidget.setObjectName("tableWidget")
        self.tableWidget.setColumnCount(1)
        self.tableWidget.setRowCount(0)
        # Header item for column 0; its text is assigned in retranslateUi().
        item = QtWidgets.QTableWidgetItem()
        self.tableWidget.setHorizontalHeaderItem(0, item)
        # Push button; its label is assigned in retranslateUi().
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setGeometry(QtCore.QRect(880, 610, 89, 25))
        self.pushButton.setObjectName("pushButton")
        MainWindow.setCentralWidget(self.centralwidget)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        # Apply the user-visible texts, then let Qt connect slots by object name.
        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title, the table header text and the button label."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        item = self.tableWidget.horizontalHeaderItem(0)
        item.setText(_translate("MainWindow", "URL"))
        self.pushButton.setText(_translate("MainWindow", "Start"))


if __name__ == "__main__":
    # Stand-alone preview: show the generated layout on a plain QMainWindow.
    import sys

    qt_app = QtWidgets.QApplication(sys.argv)
    window = QtWidgets.QMainWindow()
    layout = Ui_MainWindow()
    layout.setupUi(window)
    window.show()
    exit_code = qt_app.exec_()
    sys.exit(exit_code)

When I hit the pushbutton I can see the crawler running and entering the crawler_results method as it prints the item scraped.当我按下按钮时,我可以看到爬虫正在运行并进入 crawler_results 方法,因为它打印了抓取的项目。 The spider returns each item as the following value蜘蛛将每个项目返回为以下值

{'size': '164125',
 'status': 200,
 'title': 'Google Advanced Search',
 'url': 'https://www.google.com.au/advanced_search?hl=en-AU&authuser=0'}

Page is simply my scrapy items页面只是我的 scrapy 项目

import scrapy

class Page(scrapy.Item):
    """Item describing one crawled page, as populated by ZenSpider."""

    url = scrapy.Field()          # response URL
    size = scrapy.Field()         # length of the response body, stored as a string
    status = scrapy.Field()       # HTTP status code of the response
    title = scrapy.Field()        # first <title> text, when the response is HTML
    # ZenSpider._set_description assigns page['description']; without this
    # declaration scrapy.Item raises KeyError for the undeclared key.
    description = scrapy.Field()  # first meta-description content, when present

My question is how do I translate this data into the GUI and have it auto refresh as long as the spider runs.我的问题是如何将这些数据转换为 GUI 并让它在蜘蛛运行时自动刷新。 This means that every time an item is scraped the GUI updates and then the spider continues.这意味着每次抓取一个项目时,GUI 都会更新,然后蜘蛛会继续。

I have so far explored到目前为止我已经探索过

  1. Using scrapy deferred without much luck使用 scrapy 延迟没有太多运气
  2. Slots/Signals but am unable to get the GUI to update.插槽/信号,但无法更新 GUI。
  3. A QTimer function to update the GUI every second, but that again yields no result.一个 QTimer 函数每秒更新一次 GUI,但这同样没有任何结果。

Any help is much appreciated任何帮助深表感谢

You have to install a reactor compatible with the Qt event loop, for example using:您必须安装与 Qt 事件循环兼容的反应器,例如使用:

import sys

from PyQt5 import QtWidgets, QtCore, QtGui

import qt5reactor
# import qreactor

from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

import twisted

from Layout import Ui_MainWindow
from ZenSpider import ZenSpider


class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window that runs a ZenSpider crawl and lists each scraped URL in the table."""

    def __init__(self, parent=None):
        """Build the generated UI, connect the button and size the column to its contents.

        Args:
            parent: Optional parent widget, forwarded to ``QMainWindow``.
        """
        # Forward ``parent`` so the argument is honoured instead of dropped.
        super(MainWindow, self).__init__(parent)

        self.setupUi(self)
        self.pushButton.pressed.connect(self.on_url_entered)
        self.tableWidget.horizontalHeader().setSectionResizeMode(
            QtWidgets.QHeaderView.ResizeToContents
        )

    def crawler_results(self, item):
        """Append a new table row containing ``item["url"]``.

        Connected to Scrapy's ``item_scraped`` signal in ``on_url_entered``.
        """
        row = self.tableWidget.rowCount()
        self.tableWidget.insertRow(row)
        self.tableWidget.setItem(row, 0, QtWidgets.QTableWidgetItem(item["url"]))

    def on_url_entered(self):
        """Start a crawl of google.com.au, routing scraped items to ``crawler_results``."""
        configure_logging()
        runner = CrawlerRunner()
        # Create the crawler first so the signal handler is connected before
        # the crawl is started, rather than attaching it afterwards.
        crawler = runner.create_crawler(ZenSpider)
        crawler.signals.connect(self.crawler_results, signal=signals.item_scraped)
        runner.crawl(crawler, domain="google.com.au")

    def closeEvent(self, event):
        """Stop the Twisted reactor when the window closes."""
        super(MainWindow, self).closeEvent(event)
        # Imported here so the reactor installed at start-up is the one
        # resolved, without relying on ``import twisted`` exposing the
        # ``twisted.internet`` submodule.
        from twisted.internet import reactor

        # Stopping a reactor that is not running raises ReactorNotRunning.
        if reactor.running:
            reactor.stop()


if __name__ == "__main__":
    # Keep a reference to the application object for the lifetime of the script.
    qt_app = QtWidgets.QApplication([])

    # Install the Qt-based reactor before the window is created and the reactor is run.
    qt5reactor.install()
    # qreactor.install()

    window = MainWindow()
    window.show()
    # The reactor is run here; app.exec_() is not called in this script.
    twisted.internet.reactor.run()

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM