How can I download from links on a JavaScript rendered webpage? Python is the preferred language.
So far, I've tried using the Python bindings for Selenium on a headless server. This approach is terribly slow, fraught with error, and is incapable of reliably determining download progress or success. Additionally, the headless server interferes with my clipboard (which is a problem). I used Firefox as it can be configured to download to a default directory, but I don't think the Chrome situation is any better.
Alternatively, I've tried using WebKit.
def render(url):
    """Fully render a webpage (JavaScript and all) and return the HTML.

    The rendering is done by a small PyQt4/WebKit program that is piped into
    a separate ``python2`` interpreter, so PyQt4 is only needed by that
    child process and not by the caller.

    Args:
        url: Address of the page to load; handed to the child as ``argv[1]``.

    Returns:
        Whatever the child process wrote to stdout, decoded as latin1.
        The child's stderr is captured but not inspected.
    """
    import subprocess
    from textwrap import dedent

    # Program executed by the child interpreter. It is Python 2 code (note
    # the print statement); dedent() strips the indentation used here.
    script = dedent("""\
        import sys
        from PyQt4.QtCore import QUrl
        from PyQt4.QtGui import QApplication
        from PyQt4.QtWebKit import QWebPage
        class Render(QWebPage):
            def __init__(self, url):
                self.app = QApplication(sys.argv)
                QWebPage.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.mainFrame().load(QUrl(url))
                self.app.exec_()
            def _loadFinished(self, result):
                self.frame = self.mainFrame()
                self.app.quit()
        render = Render(sys.argv[1])
        print render.frame.toHtml().toAscii()""").encode()

    # '-' makes the interpreter read its program from stdin.
    process = subprocess.Popen(['python2', '-', url],
                               stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    # pipe script into Python's stdin
    return process.communicate(script)[0].decode('latin1')
This would be great if not for the fact that I need the download to be in the same session. Is there any way to preserve the session used to render the page? PyQt4 and WebKit are just a bunch of shared libraries. I'm not sure how to tear up the guts of them, or whether such a thing is even possible.
Right now I'm just doing the following:
# Fetch the page and download the extracted link through the same
# requests session; the session is closed when the block exits.
with requests.Session() as session:
    html = session.get(url).text
    link = get_url(html)
    download(link, session=session)
Without getting into the details, get_url(html, url) simply extracts the JavaScript from the page, hacks away any calls to the DOM, then executes it in node. Really nasty stuff...
Any way I can safely render a webpage and keep the session?
I'm also open to doing it completely in node if Python is not appropriate or the JavaScript alternative is much more elegant. It looks like perhaps node-dom might suffice? I'm not familiar enough with it to tell, but I'm interested in any suggestions.
If a direct command-line option is suitable for you instead of going through Python and/or Selenium, Google Chrome can be run in headless mode. It will do all the javascript rendering before dumping the DOM.
# Run Google Chrome in headless mode against a single URL and redirect the
# dumped DOM (--dump-dom) from stdout into rendered.html.
# --user-agent sets the user-agent string passed to Chrome for the request.
# NOTE(review): --virtual-time-budget and --timeout are presumably given in
# milliseconds (10000 = 10 s) — confirm against the Chrome headless docs.
/usr/local/bin/google-chrome \
--headless \
--virtual-time-budget=10000 \
--timeout=10000 \
--run-all-compositor-stages-before-draw \
--user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36" \
--disable-gpu \
--dump-dom "https://example.com/index.html" > rendered.html
PyQt5 in Python 2 or 3 does the trick in this case. Note the function is overly complex so as to support earlier versions of PyQt5 that use WebKit as well as later versions that use WebEngine.
import sys
def render(source_html):
    """Return rendered HTML.

    Loads ``source_html`` into a PyQt5 web page, waits for loading to
    finish and returns the page's markup. The QtWebEngine classes are tried
    first; if importing them fails, the QtWebKit classes are used instead.

    Args:
        source_html: HTML source to load into the page.

    Returns:
        The markup reported by the page's ``toHtml``.

    Raises:
        ImportError: If the WebKit fallback cannot be imported either.
    """
    try:
        from PyQt5.QtCore import QEventLoop
        from PyQt5.QtWebEngineWidgets import QWebEngineView
        from PyQt5.QtWidgets import QApplication

        class Render(QWebEngineView):
            """Render HTML with PyQt5 WebEngine."""

            def __init__(self, html):
                self.html = None
                # Reuse an existing application object if there is one:
                # constructing a second QApplication in a process fails.
                self.app = QApplication.instance() or QApplication(sys.argv)
                QWebEngineView.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.setHtml(html)
                # toHtml() hands its result to a callback, so keep
                # processing events until _callable has stored it.
                while self.html is None:
                    self.app.processEvents(
                        QEventLoop.ExcludeUserInputEvents |
                        QEventLoop.ExcludeSocketNotifiers |
                        QEventLoop.WaitForMoreEvents)
                self.app.quit()

            def _callable(self, data):
                """Store the markup delivered by ``toHtml``."""
                self.html = data

            def _loadFinished(self, result):
                """Request the page markup once loading has finished."""
                self.page().toHtml(self._callable)

    except ImportError:
        from PyQt5.QtWebKitWidgets import QWebPage
        from PyQt5.QtWidgets import QApplication

        class Render(QWebPage):
            """Render HTML with PyQt5 WebKit."""

            def __init__(self, html):
                self.html = None
                # Reuse an existing application object if there is one:
                # constructing a second QApplication in a process fails.
                self.app = QApplication.instance() or QApplication(sys.argv)
                QWebPage.__init__(self)
                self.loadFinished.connect(self._loadFinished)
                self.mainFrame().setHtml(html)
                # Blocks until _loadFinished calls quit().
                self.app.exec_()

            def _loadFinished(self, result):
                """Capture the markup and leave the event loop."""
                self.html = self.mainFrame().toHtml()
                self.app.quit()

    return Render(source_html).html
Or PyQt4 in Python 2.
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
    """Fully render HTML, JavaScript and all.

    Constructing an instance loads ``html`` into the page's main frame and
    runs the Qt event loop until ``loadFinished`` fires. Afterwards the
    main frame is available as ``self.frame``.
    """

    def __init__(self, html):
        # Reuse an existing application object if there is one:
        # constructing a second QApplication in a process fails.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().setHtml(html)
        # Blocks until _loadFinished calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Keep a handle on the main frame and leave the event loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render the markup held in `html`, then convert the main frame's HTML to
# ASCII bytes and wrap it in a plain str.
render = Render(html)
ascii_markup = render.frame.toHtml().toAscii()
result = str(ascii_markup)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.