使用 Python Selenium 在內存中下載文件，而不是在磁盤中

Question

我有一堆腳本可以進行網頁抓取、下載文件並用 Pandas 讀取它們。 這個過程必須部署在一個新的架構中，在這種架構中，將文件下載到磁盤上是不合適的，而是最好將文件保存在內存中並從那里用 Pandas 讀取它。 出於演示目的，我在這里留下了一個從隨機網站下載 excel 文件的網絡抓取腳本：

import time
import pandas as pd
from io import StringIO, BytesIO
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from datetime import date, timedelta
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# Filesystem location of the chromedriver binary. This is a placeholder:
# replace it with the real path on your machine before running.
pathDriver = '/path/to/chromedriver'  # Path to chromedriver

driver = webdriver.Chrome(executable_path=pathDriver)

url = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'

driver.get(url)

# Wait (up to 10 s) until the first download link in the table is clickable,
# instead of sleeping for a fixed second and hoping the page has rendered.
file_link = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="table-files"]/tbody/tr[1]/td[5]/a[1]')
    )
)

# The click hands the download over to the browser itself; as described in
# the surrounding text, the file ends up in the downloads folder on disk.
file_link.click()

該腳本確實會將文件下載到我的下載文件夾中。 我所嘗試的是在 click() 方法之前和之后放置一個 StringIO() 或 BytesIO() 流，並讀取該流對象：

# A StringIO constructed without arguments is an empty in-memory text buffer;
# nothing in this snippet connects it to the download started by the browser.
file_object = StringIO()
# read() on the empty buffer returns '', so that is all pandas receives here.
df = pd.read_excel(file_object.read())

但是 file_object 沒有捕獲文件，甚至文件仍然下載到我的磁盤中。

有什么建議嗎？

Answer 1

您的問題可以通過使用 selenium 的 add_experimental_option 來解決。 我還重新設計了您的代碼以遍歷表以提取 href 以將它們傳遞給 StringIO。 使用此代碼不會將任何文件下載到我的本地系統。

如果我錯過了什么，請告訴我。

import pandas as pd
from time import sleep
from io import StringIO
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Browser start-up flags for the Chrome session.
chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

# NOTE(review): these preferences are meant to keep Chrome from saving
# downloads automatically -- confirm the meaning of the value 0 against
# Chrome's preference documentation.
prefs = {
    'profile.default_content_setting_values': {
        'automatic_downloads': 0,
    },
    'profile.content_settings.exceptions': {
        'automatic_downloads': 0,
    },
}

chrome_options.add_experimental_option('prefs', prefs)

driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

url_main = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'

driver.get(url_main)

# Collect every anchor inside the files table and keep only the .xls links.
elements = driver.find_elements_by_xpath('//*[@id="table-files"]//td/a')
for element in elements:
    # Read the attribute once; str() turns a missing href (None) into 'None',
    # which simply fails the suffix test below.
    href = str(element.get_attribute("href"))
    if href.endswith('.xls'):
        # The StringIO buffer only round-trips the URL text: read() returns
        # the same string, and pandas itself fetches the workbook from it.
        file_object = StringIO(href)
        xls_file = file_object.read()
        df = pd.read_excel(xls_file)
        print(df.to_string(index=False))

        # Output reported for the first file:
        #
        #   First Name  Last Name  Gender        Country  Age        Date    Id
        # 1      Dulce      Abril  Female  United States   32  15/10/2017  1562
        # 2       Mara  Hashimoto  Female  Great Britain   25  16/08/2016  1582
        # 3     Philip       Gent    Male         France   36  21/05/2015  2587
        # 4   Kathleen     Hanner  Female  United States   25  15/10/2017  3549
        # 5    Nereida    Magwood  Female  United States   58  16/08/2016  2468
        # 6     Gaston      Brumm    Male  United States   24  21/05/2015  2554
        # 7       Etta       Hurn  Female  Great Britain   56  15/10/2017  3598
        # 8    Earlean     Melgar  Female  United States   27  16/08/2016  2456
        # 9   Vincenza    Weiland  Female  United States   40  21/05/2015  6548

        # Pause for six minutes after each file, keeping the browser session
        # alive (e.g. to check that nothing was saved locally).
        sleep(360)

這是一個使用注釋中提到的 RAMDISK 的示例。 此選項不使用selenium add_experimental_option或 StringIO。

import fs
import pandas as pd
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Browser start-up flags for the Chrome session.
chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

url_main = 'https://file-examples.com/index.php/sample-documents-download/sample-xls-download/'

driver.get(url_main)

# Every anchor inside the files table; filtered down to .xls links below.
elements = driver.find_elements_by_xpath('//*[@id="table-files"]//td/a')

# Create RAMDISK
mem_fs = fs.open_fs('mem://')
mem_fs.makedir('hidden_dir')

for element in elements:
    # Read the attribute once; str() turns a missing href (None) into 'None',
    # which simply fails the suffix test below.
    href = str(element.get_attribute("href"))
    if href.endswith('.xls'):
        # What gets stored in the in-memory file is the link text itself,
        # not the spreadsheet bytes. The `with` block closes the handle, so
        # no explicit close() is needed.
        with mem_fs.open('hidden_dir/file1.csv', 'w') as in_file:
            in_file.write(href)
        with mem_fs.open('hidden_dir/file1.csv', 'r') as out_file:
            # pandas receives the URL string read back from the in-memory
            # file and fetches the workbook from that URL itself.
            df = pd.read_excel(out_file.read())
            print(df.to_string(index=False))
            # same output as above
            sleep(360)

Answer 2

IMO，使用 selenium 顯然是不必要的。只使用requests + beautifulsoup + pandas是可以的。（這比使用 selenium 快得多，並且需要更少的代碼）。

代碼如下：

from io import BytesIO
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the listing page. The timeout keeps a stalled connection from hanging
# the script, and raise_for_status() turns an HTTP error into an exception
# instead of letting an error page be parsed as if it were the listing.
response = requests.get(
    "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/",
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# get the download link
link_tag = soup.select_one(".file-link > a")
if link_tag is None:
    # select_one() returns None when nothing matches; fail with a clear
    # message rather than an AttributeError on the next line.
    raise RuntimeError("download link not found on the page")
file_link = link_tag.get("href")

# download it in memory
file_response = requests.get(file_link, timeout=30)
file_response.raise_for_status()
bytes_of_file = file_response.content
# BytesIO wraps the downloaded bytes in a file-like object, so pandas parses
# the workbook straight from memory.
df = pd.read_excel(BytesIO(bytes_of_file))
print(df)

結果：

   0 First Name  Last Name  Gender        Country  Age        Date    Id
0  1      Dulce      Abril  Female  United States   32  15/10/2017  1562
1  2       Mara  Hashimoto  Female  Great Britain   25  16/08/2016  1582
2  3     Philip       Gent    Male         France   36  21/05/2015  2587
3  4   Kathleen     Hanner  Female  United States   25  15/10/2017  3549
4  5    Nereida    Magwood  Female  United States   58  16/08/2016  2468
5  6     Gaston      Brumm    Male  United States   24  21/05/2015  2554
6  7       Etta       Hurn  Female  Great Britain   56  15/10/2017  3598
7  8    Earlean     Melgar  Female  United States   27  16/08/2016  2456
8  9   Vincenza    Weiland  Female  United States   40  21/05/2015  6548

這不會將任何 excel 文件下載到磁盤，文件內容只保留在內存中。

使用 Python Selenium 在內存中下載文件，而不是在磁盤中

問題描述

2 個解決方案

解決方案1
4 2020-11-03 19:58:00

解決方案2
2 2020-11-05 10:29:04

使用 Python Selenium 在內存中下載文件，而不是在磁盤中

問題描述

2 個解決方案

解決方案1 4 2020-11-03 19:58:00

解決方案2 2 2020-11-05 10:29:04

解決方案1
4 2020-11-03 19:58:00

解決方案2
2 2020-11-05 10:29:04