[英]Downloading a file from webpage using python, getting a html file instead of the actual zip file
[英]Downloading Zip file from a webpage
我正在嘗試下載 zip 檔案——也可以透過點擊該網頁上的「SCARICA I DATI CSV」手動下載。我想用 Beautiful Soup 為 7000 多個意大利城市自動完成這件事。
現在,我有一個城市/直轄市的以下代碼:
# Fetch the municipality's budget page and look for the CSV download link.
city_name = "vandoies-vintl"
prov_name = "bz"
page_url = 'http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name
soup = BeautifulSoup(urllib.request.urlopen(page_url).read(), 'lxml')
# NOTE(review): this is where the code breaks — the served HTML contains no
# mention of "csv" at all, so the lookup below finds nothing.
csv_anchors = soup.find_all('a', attrs={'class': 'pull-right csv'})
csvlink = csv_anchors[0]['href']
urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name + ".zip")
使用 print(soup) 檢查輸出時，我找不到任何關於 csv 的內容，這很奇怪。有人可以幫忙嗎？謝謝！
以下代碼有效。
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import urllib.request
import re
import os
import urllib
import zipfile
import re
# Destination folder for the extracted CSV directories.
output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"
# Seed table of (municipality, province-abbreviation) pairs to scrape.
munis = pd.DataFrame(
    [("monale", "at"), ("portacomaro", "at")],
    columns=['municipality_clean', 'prov_abb'],
)
def remove_paren(string):
    """Remove every parenthesized segment from *string* and return the result.

    The argument is coerced with ``str()`` first, so non-string values
    (e.g. NaN) are handled without raising.

    Uses a non-greedy pattern: the original greedy ``\(.*\)`` matched from
    the first "(" to the LAST ")", so for input like "a (x) b (y) c" it
    also deleted the legitimate text between the two parentheticals.
    """
    return re.sub(r'\(.*?\)', '', str(string))
# Normalise municipality names: drop parentheticals, trim whitespace,
# replace spaces with hyphens, and lowercase every column.
munis['municipality_clean'] = munis['municipality_clean'].apply(remove_paren).str.strip()
munis = munis.replace(' ', '-', regex=True).apply(lambda col: col.str.lower())
# Track which municipalities succeeded / failed. The original referenced
# these names without ever initialising them, so the very first append
# raised a NameError.
scrapesuccess = pd.DataFrame(columns=munis.columns)
scrapefail = pd.DataFrame(columns=munis.columns)

for i in range(len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']
    try:
        page_url = 'http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name
        r = urllib.request.urlopen(page_url).read()
        soup = BeautifulSoup(r, 'lxml')
        csv_anchors = soup.find_all('a', attrs={'class': 'pull-right csv'})
        csvlink = csv_anchors[0]['href']
        # Download straight into output_path: the original saved the zip in
        # the current working directory but then tried to open it from
        # output_path, so extraction always failed.
        zip_path = output_path + "/" + city_name + ".zip"
        urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, zip_path)
        print(str(i) + ". " + city_name + ": success")
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        scrapesuccess = pd.concat([scrapesuccess, munis.iloc[[i]]], ignore_index=True)
        newfolder = output_path + "/" + city_name.capitalize()
        os.makedirs(newfolder, exist_ok=True)
        # Context manager guarantees the zip handle is closed even on error.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(newfolder)
    except Exception:
        # Narrowed from a bare except: (which also swallowed KeyboardInterrupt).
        scrapefail = pd.concat([scrapefail, munis.iloc[[i]]], ignore_index=True)
        print(str(i) + ". " + city_name + ": fail")
這是一個在 memory 中下載 zip 並寫入包含所有 csv 文件的城市目錄的示例。
import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
from bs4 import BeautifulSoup
class Scraper:
    """Download and unpack the budget-CSV zip bundle for one municipality
    from storico.openbilanci.it.
    """

    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")
        # Backward-compatible generalisation: the output root used to be
        # hard-coded inside write_files; it can now be overridden.
        self.output_root = kwargs.get("output_root", "/path/to/files")

    def main(self) -> None:
        """Run the full pipeline: locate link, download, unzip, write CSVs."""
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        """Return the href of the first 'pull-right csv' anchor on the
        municipality page (raises IndexError if none is present)."""
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"
        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")
        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        """Download the zip to a temporary file and return its local path."""
        url = f"{self.url_root}{zip_link}"
        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        """Read every member of the zip archive into a DataFrame.

        Returns a mapping of archive member path -> DataFrame. The archive
        members are ';'-separated CSV files encoded as UTF-8.
        """
        # Context manager: the original leaked the open ZipFile handle.
        with ZipFile(file_handle, "r") as zip_file_object:
            return {
                member: pd.read_csv(
                    StringIO(zip_file_object.open(member).read().decode("utf-8")),
                    sep=";",
                )
                for member in zip_file_object.namelist()
            }

    def write_files(self, file_mapping: dict) -> None:
        """Write each DataFrame under <output_root>/<city>/<archive subpath>."""
        for member, df in file_mapping.items():
            # A root-level member ("x.csv") has no "/", so rsplit yields one
            # part; the original unconditional 2-tuple unpack crashed on it.
            parts = member.rsplit("/", 1)
            file_path, file_name = parts if len(parts) == 2 else ("", parts[0])
            path = Path(f"{self.output_root}/{self.city_name}/{file_path}")
            path.mkdir(parents=True, exist_ok=True)
            df.to_csv(f"{path}/{file_name}")
# Scrape a single municipality (Vandoies/Vintl, province BZ).
city_name = "vandoies-vintl"
prov_name = "bz"
scraper = Scraper(city_name=city_name, prov_name=prov_name)
scraper.main()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.