I'm trying to download all the images from 'https://www.nytimes.com/section/todayspaper' with this code:
import requests
from io import open as iopen
from urlparse import urlsplit
file_url= 'https://www.nytimes.com/section/todayspaper'
def requests_image(file_url):
    """Download *file_url* into the current working directory if it is an image.

    The file is saved under its basename (the last path segment of the URL).
    Returns True on success, False when the URL does not end in a known
    image extension or the server does not answer 200 OK.
    """
    # Python 3 location of urlsplit; the original
    # `from urlparse import urlsplit` only works on Python 2.
    from urllib.parse import urlsplit

    suffix_list = ['jpg', 'gif', 'png', 'tif', 'svg']
    # Basename of the URL path, e.g. '/a/b/pic.jpg' -> 'pic.jpg'.
    file_name = urlsplit(file_url)[2].split('/')[-1]
    # Use the LAST dot so names like 'photo.small.jpg' yield 'jpg';
    # the original split('.')[1] would have picked 'small'.
    file_suffix = file_name.split('.')[-1]
    response = requests.get(file_url)
    if file_suffix in suffix_list and response.status_code == requests.codes.ok:
        # Built-in open() handles binary mode fine on Python 3; the original
        # `from io import open as iopen` is unnecessary, and naming the
        # handle `file` shadowed the builtin.
        with open(file_name, 'wb') as fh:
            fh.write(response.content)
        return True  # the original fell through and returned None here
    return False
No error occurs when I run it:
>>>
>>>
But I don't know where the images were downloaded on my PC.
I checked my Downloads folder and they aren't there.
If you want to download all the images on the page, you should parse the page HTML, find all the <img> tags, and download the content of each tag's src attribute:
import os
import hashlib
import requests
from bs4 import BeautifulSoup
page_url = 'https://www.nytimes.com/section/todayspaper'

# Download the page HTML.
page_data = requests.get(page_url).text

# Collect the src attribute of every <img> tag, dropping tags whose src is
# missing or empty (<img src="" />, <img>, etc.).  A single comprehension
# replaces the original build-then-filter pair; the truthiness test
# `if src` already covers both None and the empty string, so the redundant
# `len(...) > 0` check is gone.
images_urls = [
    src
    for src in (
        image.attrs.get('src')
        for image in BeautifulSoup(page_data, 'lxml').find_all('img')
    )
    if src
]
# Download files
def download_image(source_url, dest_dir):
    """Download *source_url* and save it inside *dest_dir*.

    The file name is the MD5 hex digest of the URL (stable across runs,
    so re-running does not duplicate files), plus the extension taken from
    the URL path when one is present — this resolves the original TODO.
    Raises requests.HTTPError when the server answers with an error status.
    """
    # Python 3 stdlib URL parsing, imported locally so this function is
    # self-contained.
    from urllib.parse import urlsplit

    # Make sure the destination directory exists; open() below would
    # otherwise fail with FileNotFoundError on a fresh checkout.
    os.makedirs(dest_dir, exist_ok=True)
    # Keep the extension, if any, so image viewers recognise the file.
    extension = os.path.splitext(urlsplit(source_url).path)[1]
    image_name = hashlib.md5(source_url.encode()).hexdigest() + extension
    response = requests.get(source_url)
    # Don't silently save an HTML error page as an "image".
    response.raise_for_status()
    with open(os.path.join(dest_dir, image_name), 'wb') as f:
        f.write(response.content)
# Fetch every discovered image into the local ./tmp directory.
for url in images_urls:
    download_image(url, './tmp')
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.