[英]How to “render” HTML with PyQt5's QWebEngineView
如何使用 PyQt5 v5.6 QWebEngineView“渲染”HTML?
我之前使用 PyQt5 v5.4.1 QWebPage 執行過任務,但建議嘗試較新的 QWebEngineView。
這是該實現(它通常按預期工作,但在某些站點和情況下有無限期掛起的趨勢):
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebKitWidgets import QWebPage
class Render(QWebPage):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
self.html = self.mainFrame().toHtml()
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
下面是我使用 QWebEngineView 的嘗試。 一、 PyQt5 v5.6在Ubuntu上的安裝設置:
# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5
# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/
現在對於 Python... 以下導致分段錯誤:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# what's going on here? how can I get the HTML from toHtml?
self.page().toHtml(self.callable)
self.app.quit()
def callable(self, data):
self.html = data
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
問題似乎在於對異步toHtml()
的調用。 看起來它應該相當簡單,但我不知道該怎么做。 我看到它已經在 C++ 的上下文中進行了討論,但我不確定如何將其轉換為 Python。 我怎樣才能得到 HTML?
在以下線程中對該主題進行了相當多的討論: https : //riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html
新的 QWebEngine 接口考慮了底層 Chromium 引擎是異步的這一事實。 因此,我們必須將異步 API 轉換為同步 API。
這是它的外觀:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
Six & Veehmot 的回答很好,但我發現這對我來說是不夠的,因為它沒有擴展我想要抓取的頁面的下拉元素。 一個輕微的修改修復了這個:
def render(url):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop,QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, url):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(url).html
print(render(dummy_url))
正如您所指出的,Qt5.4 依賴於異步調用。 沒有必要使用循環(如您的答案所示),因為您唯一的錯誤是在toHtml
調用完成之前調用quit
。
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# This is an async call, you need to wait for this
# to be called before closing the app
self.page().toHtml(self.callable)
def callable(self, data):
self.html = data
# Data has been stored, it's safe to quit the app
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
當再次運行render()時,以上代碼會導致“分段錯誤(核心轉儲)”錯誤。 稍作修改即可解決此問題:
import sys
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def render(source_html):
app = QApplication(sys.argv)
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
app.exec_()
def _loadFinished(self, result):
self.page().toPlainText(self._callable)
def _callable(self, data):
self.html = data
app.quit()
return Render(source_html).html
我並不完全清楚你所說的“渲染”是什么意思。 我理解它的意思是“在屏幕上相應地顯示 HTML”。 下面就是這樣做的。
# main.py
import sys
import os
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class Browser(QtWebEngineWidgets.QWebEngineView):
def __init__(self):
super().__init__()
html = """
<!DOCTYPE html>
<html>
<head>
<title>Example</title>
<meta charset="utf-8" />
</head>
<body>
<script>alert('Running some Javascript');</script>
<h1>Hello world!</h1>
<p>Goodbye, cruel world...</p>
</body>
</html>
"""
# With QWebEnginePage.setHtml, the html is loaded immediately.
# baseUrl is used to resolve relative URLs in the document.
# For whatever reason, it seems like the baseUrl resolves to
# the parent of the path, not the baseUrl itself. As a
# workaround, either append a dummy directory to the base url
# or start all relative paths in the html with the current
# directory.
# https://doc-snapshots.qt.io/qtforpython-5.15/PySide2/QtWebEngineWidgets/QWebEnginePage.html#PySide2.QtWebEngineWidgets.PySide2.QtWebEngineWidgets.QWebEnginePage.setHtml
here = os.path.dirname(os.path.abspath(__file__)).replace('\\', '/')
base_path = os.path.join(os.path.dirname(here), 'dummy').replace('\\', '/')
self.url = QtCore.QUrl('file:///' + base_path)
self.page().setHtml(html, baseUrl=self.url)
class MainWindow(QtWidgets.QMainWindow):
def __init__(self):
super().__init__()
self.init_widgets()
self.init_layout()
def init_widgets(self):
self.browser = Browser()
self.browser.loadFinished.connect(self.load_finished)
def init_layout(self):
layout = QtWidgets.QVBoxLayout()
layout.addWidget(self.browser)
centralWidget = QtWidgets.QWidget()
centralWidget.setLayout(layout)
self.setCentralWidget(centralWidget)
def load_finished(self, status):
self.msg = QtWidgets.QMessageBox()
self.msg.setIcon(QtWidgets.QMessageBox.Information)
self.msg.setWindowTitle('Load Status')
self.msg.setText(f"It is {str(status)} that the page loaded.")
self.msg.show()
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
main_window = MainWindow()
main_window.show()
sys.exit(app.exec_())
setHtml
方法接受一個字符串,因此在使用 HTML 文件時必須首先讀取它。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.