
Downloading Zip file from a webpage

I am trying to download a zip file, which can also be downloaded by clicking "SCARICA I DATI CSV" on this webpage. I want to do this for 7000+ Italian cities using Beautiful Soup.

For now, I have the following code for one city/municipality:

import urllib.request
from bs4 import BeautifulSoup

city_name = "vandoies-vintl"
prov_name = "bz"

r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
soup = BeautifulSoup(r, 'lxml')

# This is where the code breaks, because the HTML body does not have any mention of "csv" whatsoever, which is weird.

csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
csvlink = csv[0]['href']
urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name + ".zip")

When I inspect the page with print(soup), I cannot find any mention of csv. Can someone help? Thanks!
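
One way to debug this is to dump every anchor the parser actually sees; if the link is injected by JavaScript, it will be missing from the static HTML that urlopen returns. A minimal sketch, reusing the URL pieces from the question:

import urllib.request
from bs4 import BeautifulSoup

url = "http://storico.openbilanci.it/bilanci/vandoies-vintl-comune-bz"
soup = BeautifulSoup(urllib.request.urlopen(url).read(), "lxml")

# Print every link and its classes to see what the static HTML really contains.
for a in soup.find_all("a", href=True):
    print(a.get("class"), a["href"])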

The following code works.

import os
import re
import urllib.request
import zipfile

import pandas as pd
from bs4 import BeautifulSoup

output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"

munis = [("monale", "at"), ("portacomaro", "at")]

munis = pd.DataFrame(munis)

munis.columns = ['municipality_clean','prov_abb']

def remove_paren(string):
    # Strip any parenthesized suffix, e.g. "name (xyz)" -> "name ".
    return re.sub(r'\(.*\)', '', str(string))

munis['municipality_clean'] = munis['municipality_clean'].apply(remove_paren)
munis['municipality_clean'] = munis['municipality_clean'].str.strip()
munis = munis.replace(' ', '-', regex=True)
munis = munis.apply(lambda x: x.str.lower())

# Collect successful and failed municipalities; wrap with pd.DataFrame later if needed.
scrapesuccess = []
scrapefail = []

for i in range(len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']

    try:
        r = urllib.request.urlopen('http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name).read()
        soup = BeautifulSoup(r, 'lxml')
        csv = soup.find_all('a', attrs={'class': 'pull-right csv'})
        try:
            csvlink = csv[0]['href']
            # Save the zip inside output_path so the extraction step below finds it.
            zip_path = output_path + "/" + city_name + ".zip"
            urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, zip_path)
            print(str(i) + ". " + city_name + ": success")
            scrapesuccess.append(munis.iloc[i])

            newfolder = output_path + "/" + city_name.capitalize()
            if not os.path.exists(newfolder):
                os.makedirs(newfolder)

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(newfolder)

        except Exception:
            scrapefail.append(munis.iloc[i])
            print(str(i) + ". " + city_name + ": fail")

    except Exception:
        scrapefail.append(munis.iloc[i])
        print(str(i) + ". " + city_name + ": fail")
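
Since this loop issues one request per municipality (7000+ in total), it may be worth pausing between iterations to avoid hammering the server. A minimal sketch; the 1-second delay is an assumption, not something the original answer uses:

import time

# At the end of each loop iteration:
time.sleep(1)  # assumed delay; tune to the site's tolerance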

Here is an example that downloads the zip in memory and writes a directory per city containing all the csv files.

import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")

    def main(self) -> None:
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"

        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")

        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        url = f"{self.url_root}{zip_link}"

        # With no filename argument, urlretrieve downloads to a temporary
        # file and returns its path.
        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        zip_file_object = ZipFile(file_handle, "r")
        files = zip_file_object.namelist()

        # Parse each semicolon-separated CSV in the archive into a DataFrame,
        # keyed by its path inside the zip.
        return {
            file: pd.read_csv(StringIO(zip_file_object.open(file).read().decode("utf-8")), sep=";")
            for file in files
        }

    def write_files(self, file_mapping: dict) -> None:
        for file, df in file_mapping.items():
            file_path, file_name = file.rsplit("/", 1)
            # "/path/to/files" is a placeholder; point it at your output directory.
            path = Path(f"/path/to/files/{self.city_name}/{file_path}")
            path.mkdir(parents=True, exist_ok=True)
            df.to_csv(f"{path}/{file_name}")


city_name = "vandoies-vintl"
prov_name = "bz"
Scraper(city_name=city_name, prov_name=prov_name).main()
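
To run the scraper for more than one municipality, the same class can be driven from a list of (city, province) pairs; a sketch, assuming the names are already cleaned the same way as in the previous answer:

munis = [("monale", "at"), ("portacomaro", "at"), ("vandoies-vintl", "bz")]

for city, prov in munis:
    try:
        Scraper(city_name=city, prov_name=prov).main()
        print(f"{city}: success")
    except Exception as exc:  # the page may be missing or lack a CSV link
        print(f"{city}: fail ({exc})")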
