使用 Python 从网站下载所有 pdf 文件

Question

我遵循了几个在线指南，试图构建一个脚本，该脚本可以识别并从网站下载所有 pdf，从而避免我手动执行此操作。 到目前为止，这是我的代码：

from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")     
links = soup.find_all('a', href=re.compile(r'(.pdf)'))


# clean the pdf link names
url_list = []
for el in links:
    url_list.append(("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href']))
#print(url_list)


# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", "").replace(".pdf",""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)

该代码似乎可以找到所有 pdf（取消注释print(url_list)以查看）。 但是，它在下载阶段失败。 特别是我收到此错误，我无法理解出了什么问题：

E:\webscraping>python get_pdfs.py
http://www.gatsby.ucl.ac.uk/teaching/courses/http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf
E:\webscraping\http://www.gatsby.ucl.ac.uk/teaching/courses/cribsheet
Traceback (most recent call last):
  File "get_pdfs.py", line 26, in <module>
    request.urlretrieve(url, fullfilename)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 248, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found

有人可以帮我吗？

Answer 1

查看以下实现。 我使用requests模块而不是urllib来进行下载。 此外，我使用.select()方法而不是.find_all()来避免使用re 。

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"

#If there is no such folder, the script will create one automatically
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):os.mkdir(folder_location)

response = requests.get(url)
soup= BeautifulSoup(response.text, "html.parser")     
for link in soup.select("a[href$='.pdf']"):
    #Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location,link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url,link['href'])).content)

Answer 2

一般来说，上面的答案应该有效。 但是，您应该评估您尝试使用的网页的 html 源代码。 例如，有些可能在元标记中有 og_url 属性，而其他可能没有。 如果您使用的是安全网站（比如您大学的课程网页），则这是可能的。 在这种情况下，您将不得不以不同的方式提取 pdf 链接。

你可以在这里找到一个很好的解释和解决方案：

https://medium.com/@dementorwriter/notesdownloader-use-web-scraping-to-download-all-pdfs-with-python-511ea9f55e48

Answer 3

几个链接已经包含导致 404 未找到的服务器地址。 此外，您不应从文件名中删除.pdf ，因为它会在没有扩展名的情况下保存它。

from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")     
links = soup.find_all('a', href=re.compile(r'(.pdf)'))


# clean the pdf link names
url_list = []
for el in links:
if(el['href'].startswith('http')):
    url_list.append(el['href'])
else:
    url_list.append("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href'])

print(url_list)


# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", ""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)

Answer 4

我根据argparse的回答和附加的argparse编写了一个新颖的脚本。 我的完整代码如下：

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse

#%% Example
# one pdf
#   python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
#   python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html

#%% Functions
def all_pdf_download(args):
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        if not os.path.exists(args.folder_path):os.mkdir(args.folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    #response = requests.get(base_url)
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup= BeautifulSoup(response.text, "html.parser")
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        #Name the pdf files using the last portion of each link which are unique in this case
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path,link['href'].split('/')[-1])
        if args.print_all:
            print("[{}/{}] {}".format(counter+1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            f.write(requests.get(urljoin(base_url,link['href'])).content)
    print("====== 4. Finished!!! ======")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test argparse')
    ####################################
    ############ ALL OPTION ############
    ## Main option
    # -l/--link
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='write down site name')
    # --print-all
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print all filename")
    parser.set_defaults(print_all=True)
    # --save-here
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files here")
    parser.set_defaults(save_here=False)
    # --save--folder
    # default setting -> Downloads/ in user’s home directory obtained by (os.path.expanduser('~'))
    parser.add_argument('-f', '--folder_path', default=r""+os.path.join(os.path.expanduser('~'), "Downloads"), 
                        type=str, help='save files in the given folder')

    ########################################
    ############ PARSING OPTION ############
    args = parser.parse_args()
    all_pdf_download(args)

更多细节和更新可以参考我的gist- hibetterheyj/all_pdf_dl.py

最好的事物！

Answer 5

@SIM 对我的需求的回答的变化：

from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# connect to website and get list of all pdfs
url="http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=Compilers&doc=docs/slides.html"
pdfPath = "http://openclassroom.stanford.edu/MainFolder/courses/Compilers/docs/"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")     
links = soup.find_all('a', href=re.compile(r'(.pdf)'))


# clean the pdf link names
url_list = []
for el in links:
    if(el['href'].startswith('http')):
        url_list.append(el['href'])
    else:
        url_list.append(pdfPath + el['href'])

print(f'url_list: {url_list}\n')


# download the pdfs to a specified location
for url in url_list:
    print(f'urL: {url}\n')
    fullfilename = os.path.join(r'standfordPdfs', url.replace(pdfPath, ""))
    print(f'fullfilename: {fullfilename}')
    request.urlretrieve(url, fullfilename)

使用 Python 从网站下载所有 pdf 文件

问题描述

5 个解决方案

解决方案1
22 已采纳 2019-02-10 16:10:38

解决方案2
4 2020-06-20 23:29:52

解决方案3
3 2019-02-10 13:18:39

解决方案4
0 2021-01-20 17:01:49

解决方案5
0 2022-12-21 02:11:38

使用 Python 从网站下载所有 pdf 文件

问题描述

5 个解决方案

解决方案1 22 已采纳 2019-02-10 16:10:38

解决方案2 4 2020-06-20 23:29:52

解决方案3 3 2019-02-10 13:18:39

解决方案4 0 2021-01-20 17:01:49

解决方案5 0 2022-12-21 02:11:38

解决方案1
22 已采纳 2019-02-10 16:10:38

解决方案2
4 2020-06-20 23:29:52

解决方案3
3 2019-02-10 13:18:39

解决方案4
0 2021-01-20 17:01:49

解决方案5
0 2022-12-21 02:11:38