简体   繁体   中英

How to “render” HTML with PyQt5's QWebEngineView

How can I "render" HTML with with PyQt5 v5.6 QWebEngineView?

I have previously performed the task with PyQt5 v5.4.1 QWebPage, but it was suggested to try the newer QWebEngineView.

Here's that implementation (it generally works as expected, but has a tendency to hang indefinitely for some sites and situations):

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebKitWidgets import QWebPage

    class Render(QWebPage):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebPage.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.mainFrame().setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            self.html = self.mainFrame().toHtml()
            self.app.quit()

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

What follows is my attempt at using QWebEngineView. First, the installation and setup of PyQt5 v5.6 on Ubuntu:

# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5

# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/

Now for the Python... The following results in a segmentation fault:

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            # what's going on here? how can I get the HTML from toHtml?
            self.page().toHtml(self.callable)
            self.app.quit()

        def callable(self, data):
            self.html = data

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

The trouble appears to lie in the call to asynchronous toHtml() . It seems like it should be fairly simple, but I'm at a loss with what to do with it. I see it's been discussed in the context of C++, but I'm not sure how to translate this to Python. How can I get the HTML out?

Quite a bit of discussion on the topic was made in the following thread: https://riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html

The new QWebEngine interface takes account of the fact that the underlying Chromium engine is asynchronous. As such we have to turn an asynchronous API into a synchronous one.

Here's how that looks:

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtCore import QEventLoop
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            while self.html is None:
                self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
            self.app.quit()

        def _callable(self, data):
            self.html = data

        def _loadFinished(self, result):
            self.page().toHtml(self._callable)

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

The answer by Six & Veehmot is great, but I found out that for my purpose it was not sufficient, as it did not expand the dropdown elements of the page that I wanted to scrape. A slight modification fixed this:

def render(url):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtCore import QEventLoop,QUrl
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, url):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.load(QUrl(url))
            while self.html is None:
                self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
            self.app.quit()

        def _callable(self, data):
            self.html = data

        def _loadFinished(self, result):
            self.page().toHtml(self._callable)

    return Render(url).html


print(render(dummy_url))

As you pointed out, Qt5.4 relies on async calls. It's not necessary to use the Loop (as seen on your answer), since your only mistake was to call quit before the toHtml call finishes.

def render(source_html):
    """Fully render HTML, JavaScript and all."""

    import sys
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtWebEngineWidgets import QWebEngineView

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            self.app = QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            self.app.exec_()

        def _loadFinished(self, result):
            # This is an async call, you need to wait for this
            # to be called before closing the app
            self.page().toHtml(self.callable)

        def callable(self, data):
            self.html = data
            # Data has been stored, it's safe to quit the app
            self.app.quit()

    return Render(source_html).html

import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))

The above codes cause 'segmentation fault (core dumped)' error when running render() secondly. A slight modification fixed this:

import sys

from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


def render(source_html):
    app = QApplication(sys.argv)

    class Render(QWebEngineView):
        def __init__(self, html):
            self.html = None
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            app.exec_()

        def _loadFinished(self, result):
            self.page().toPlainText(self._callable)

        def _callable(self, data):
            self.html = data
            app.quit()

    return Render(source_html).html

It's not entirely clear to me what you mean by "render". I understand it to mean, "display the HTML accordingly on the screen." The following does just that.

# main.py
import sys
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets


class Browser(QtWebEngineWidgets.QWebEngineView):

    def __init__(self):
        super().__init__()

        html = """
        <!DOCTYPE html>
        <html>
            <head>
                <title>Example</title>
                <meta charset="utf-8" />
            </head>
            <body>
                <script>alert('Running some Javascript');</script>
                <h1>Hello world!</h1>
                <p>Goodbye, cruel world...</p>
            </body>
        </html>

        """

        # With QWebEnginePage.setHtml, the html is loaded immediately.
        # baseUrl is used to resolve relative URLs in the document.
        # For whatever reason, it seems like the baseUrl resolves to
        # the parent of the path, not the baseUrl itself.  As a
        # workaround, either append a dummy directory to the base url
        # or start all relative paths in the html with the current
        # directory.
        # https://doc-snapshots.qt.io/qtforpython-5.15/PySide2/QtWebEngineWidgets/QWebEnginePage.html#PySide2.QtWebEngineWidgets.PySide2.QtWebEngineWidgets.QWebEnginePage.setHtml
        here = os.path.dirname(os.path.abspath(__file__)).replace('\\', '/')
        base_path = os.path.join(os.path.dirname(here), 'dummy').replace('\\', '/')
        self.url = QtCore.QUrl('file:///' + base_path)
        self.page().setHtml(html, baseUrl=self.url)


class MainWindow(QtWidgets.QMainWindow):

    def __init__(self):
        super().__init__()

        self.init_widgets()
        self.init_layout()

    def init_widgets(self):
        self.browser = Browser()
        self.browser.loadFinished.connect(self.load_finished)

    def init_layout(self):
        layout = QtWidgets.QVBoxLayout()
        layout.addWidget(self.browser)

        centralWidget = QtWidgets.QWidget()
        centralWidget.setLayout(layout)
        self.setCentralWidget(centralWidget)

    def load_finished(self, status):
        self.msg = QtWidgets.QMessageBox()
        self.msg.setIcon(QtWidgets.QMessageBox.Information)
        self.msg.setWindowTitle('Load Status')
        self.msg.setText(f"It is {str(status)} that the page loaded.")
        self.msg.show()


if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    main_window = MainWindow()
    main_window.show()
    sys.exit(app.exec_())

The setHtml method takes a string so it must be read in first when using an HTML file.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM