简体   繁体   中英

download a pdf from a website and change title - python and curl

I have a python script to download pdf's from a ASP-site. I would like to save the pdf file using the name it is displayed on the website. So from this line of html, get the link to download the pdf and get the name how it is displayed. So for the following html line:

<a href="https://www.ib3.nl/curriculum/engels\100 TB 3 Ch 3.pdf">Chapter 3 - Weird science</a></li>

get the link https://www.ib3.nl/curriculum/engels\100 TB 3 Ch 3.pdf and save this pdf as Chapter 3 - Weird science.pdf

below is the script to get all the pdf's

from bs4 import BeautifulSoup as BeautifulSoup
import urllib.request as requests
from urllib import parse as urlparse
import requests
import os


klassen = ['1e klas']
vakken = ['Wiskunde']
'''['Engels','Aardrijkskunde','Economie', 'Filosofie','Frans', 'Geschiedenis', \
          'Nask', 'Natuurkunde', 'Nederlands', 'Scheikunde', 'Spaans', 'Wiskunde'\
          'Biologie', 'Duits', 'Grieks','Latijn','Leesmateriaal', \
          'Loopbaanorientatie','NLT']'''
links = []
for klas in klassen: 
    for vak in vakken: 
        url = "https://www.svpo.nl/curriculum.asp"
        payload = 'vak='+ vak + '&klas_en_schoolsoort='+klas
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
        }

    response = requests.post(url, data=payload, headers=headers)
    
    path_out = 'c:\books\\'
    path = (path_out + klas + "\\" + vak + "\\")
    if not(os.path.exists(path)): os.makedirs(path)
    
    links = BeautifulSoup(response.text, "lxml")#.find_all('a')
    a=BeautifulSoup(response.text, "lxml").find_all('a')
    
    for link in BeautifulSoup(response.text, "lxml").find_all('a'):
        current_link = link.get('href')
        if str(link.get('href')) != 'None':
            if current_link.endswith('pdf'):
                print(current_link)
                links.append(current_link)
                filename = current_link[current_link.find('\\')+1:]
                filename_url = urlparse.quote(filename) 
                path_url = current_link[:current_link.find('\\')] + '/' + filename_url
                os.system('Curl -o "' + path + filename + '" ' + path_url)                    

Simply:

filename = link.text + '.pdf'

That's all.


My version with changes from comments:

import os
import requests
from bs4 import BeautifulSoup
from urllib import parse as urlparse


klassen = ['1e klas']
vakken = ['Wiskunde']
'''['Engels','Aardrijkskunde','Economie', 'Filosofie','Frans', 'Geschiedenis', \
          'Nask', 'Natuurkunde', 'Nederlands', 'Scheikunde', 'Spaans', 'Wiskunde'\
          'Biologie', 'Duits', 'Grieks','Latijn','Leesmateriaal', \
          'Loopbaanorientatie','NLT']'''

links = []

url = "https://www.svpo.nl/curriculum.asp"
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

path_out = r'c:\books'

for klas in klassen: 
    for vak in vakken: 
        path = os.path.join(path_out, klas, vak)
        os.makedirs(path, exist_ok=True)

        payload = {'vak': vak, 'klas_en_schoolsoort': klas}
        
        response = requests.post(url, data=payload, headers=headers)
    
        all_links = BeautifulSoup(response.text, "lxml").find_all('a', {'href': True})
    
        for link in all_links:
            url = link.get('href')

            if url.lower().endswith('.pdf'):
                url = url.replace('\\', '/')
                links.append(url)
                print('url:', url)
                
                #filename = url.split('\\')[-1]
                filename = link.text + '.pdf'
                print('filename:', filename)
                
                full_path = os.path.join(path, filename)
                print('full_path:', full_path)
                
                response = requests.get(url)
                with open(full_path, 'wb') as fh:
                    fh.write(response.content)
                    
                print('---')

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM