繁体   English   中英

如何使用 PyQt5 的 QWebEngineView “渲染” HTML

[英]How to “render” HTML with PyQt5's QWebEngineView

如何使用 PyQt5 v5.6 QWebEngineView“渲染”HTML?

我之前使用 PyQt5 v5.4.1 QWebPage 执行过任务,但建议尝试较新的 QWebEngineView。

这是该实现(它通常按预期工作,但在某些站点和情况下有无限期挂起的趋势):

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebKitWidgets import QWebPage

    class Render(QWebPage):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebPage.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.mainFrame().setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            self.html = self.mainFrame().toHtml()
            self.app.quit()

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

下面是我使用 QWebEngineView 的尝试。 一、 PyQt5 v5.6在Ubuntu上的安装设置:

# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5

# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/

现在对于 Python... 以下导致分段错误:

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            # what's going on here? how can I get the HTML from toHtml?
            self.page().toHtml(self.callable)
            self.app.quit()

        def callable(self, data):
            self.html = data

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

问题似乎在于对异步toHtml()的调用。 看起来它应该相当简单,但我不知道该怎么做。 我看到它已经在 C++ 的上下文中进行了讨论,但我不确定如何将其转换为 Python。 我怎样才能得到 HTML?

在以下线程中对该主题进行了相当多的讨论: https : //riverbankcomputing.com/pipermail/pyqt/20​​15-January/035324.html

新的 QWebEngine 接口考虑了底层 Chromium 引擎是异步的这一事实。 因此,我们必须将异步 API 转换为同步 API。

这是它的外观:

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtCore import QEventLoop
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            while self.html is None:
                self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
            self.app.quit()

        def _callable(self, data):
            self.html = data

        def _loadFinished(self, result):
            self.page().toHtml(self._callable)

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

Six & Veehmot 的回答很好,但我发现这对我来说是不够的,因为它没有扩展我想要抓取的页面的下拉元素。 一个轻微的修改修复了这个:

def render(url):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtCore import QEventLoop,QUrl
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, url):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.load(QUrl(url))
            while self.html is None:
                self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
            self.app.quit()

        def _callable(self, data):
            self.html = data

        def _loadFinished(self, result):
            self.page().toHtml(self._callable)

    return Render(url).html


print(render(dummy_url))

正如您所指出的,Qt5.4 依赖于异步调用。 没有必要使用循环(如您的答案所示),因为您唯一的错误是在toHtml调用完成之前调用quit

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            # This is an async call, you need to wait for this
            # to be called before closing the app
            self.page().toHtml(self.callable)

        def callable(self, data):
            self.html = data
            # Data has been stored, it's safe to quit the app
            self.app.quit()

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

当再次运行render()时,以上代码会导致“分段错误(核心转储)”错误。 稍作修改即可解决此问题:

import sys

from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


def render(source_html):
    app = QApplication(sys.argv)

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            app.exec_()

        def _loadFinished(self, result):
            self.page().toPlainText(self._callable)

        def _callable(self, data):
            self.html = data
            app.quit()

    return Render(source_html).html

我并不完全清楚你所说的“渲染”是什么意思。 我理解它的意思是“在屏幕上相应地显示 HTML”。 下面就是这样做的。

# main.py
import sys
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets


class Browser(QtWebEngineWidgets.QWebEngineView):

    def __init__(self):
        super().__init__()

        html = """
        <!DOCTYPE html>
        <html>
            <head>
                <title>Example</title>
                <meta charset="utf-8" />
            </head>
            <body>
                <script>alert('Running some Javascript');</script>
                <h1>Hello world!</h1>
                <p>Goodbye, cruel world...</p>
            </body>
        </html>

        """

        # With QWebEnginePage.setHtml, the html is loaded immediately.
        # baseUrl is used to resolve relative URLs in the document.
        # For whatever reason, it seems like the baseUrl resolves to
        # the parent of the path, not the baseUrl itself.  As a
        # workaround, either append a dummy directory to the base url
        # or start all relative paths in the html with the current
        # directory.
        # https://doc-snapshots.qt.io/qtforpython-5.15/PySide2/QtWebEngineWidgets/QWebEnginePage.html#PySide2.QtWebEngineWidgets.PySide2.QtWebEngineWidgets.QWebEnginePage.setHtml
        here = os.path.dirname(os.path.abspath(__file__)).replace('\\', '/')
        base_path = os.path.join(os.path.dirname(here), 'dummy').replace('\\', '/')
        self.url = QtCore.QUrl('file:///' + base_path)
        self.page().setHtml(html, baseUrl=self.url)


class MainWindow(QtWidgets.QMainWindow):

    def __init__(self):
        super().__init__()

        self.init_widgets()
        self.init_layout()

    def init_widgets(self):
        self.browser = Browser()
        self.browser.loadFinished.connect(self.load_finished)

    def init_layout(self):
        layout = QtWidgets.QVBoxLayout()
        layout.addWidget(self.browser)

        centralWidget = QtWidgets.QWidget()
        centralWidget.setLayout(layout)
        self.setCentralWidget(centralWidget)

    def load_finished(self, status):
        self.msg = QtWidgets.QMessageBox()
        self.msg.setIcon(QtWidgets.QMessageBox.Information)
        self.msg.setWindowTitle('Load Status')
        self.msg.setText(f"It is {str(status)} that the page loaded.")
        self.msg.show()


if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    main_window = MainWindow()
    main_window.show()
    sys.exit(app.exec_())

setHtml方法接受一个字符串,因此在使用 HTML 文件时必须首先读取它。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM