[英]How to extract and download all images from a website using beautifulSoup?
我正在嘗試從 url 中提取和下載所有圖像。我寫了一個腳本
import urllib2
import re
from os.path import basename
from urlparse import urlsplit
url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()
# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)
# download all images
for imgUrl in imgUrls:
try:
imgData = urllib2.urlopen(imgUrl).read()
fileName = basename(urlsplit(imgUrl)[2])
output = open(fileName,'wb')
output.write(imgData)
output.close()
except:
pass
我不想提取此頁面的圖像看到此圖像http://i.share.pho.to/1c9884b1_l.jpeg我只想獲取所有圖像而不單擊“下一步”按鈕我不知道怎么能我得到了“下一個”class 中的所有圖片。?我應該在 findall 中做哪些更改?
以下內容應從給定頁面中提取所有圖像並將其寫入正在運行腳本的目錄。
import re
import requests
from bs4 import BeautifulSoup
site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
if not filename:
print("Regex didn't match with the url: {}".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)
對 Jonathan 的回答稍作修改(因為我無法發表評論):向網站添加“www”將修復大多數“不支持文件類型”的錯誤。
import re
import requests
from bs4 import BeautifulSoup
site = 'http://www.google.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
if not filename:
print("Regex didn't match with the url: {}".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)
from bs4 import *
import requests
import os
def folder_create(images):
try:
folder_name = input("Enter Folder Name:- ")
# folder creation
os.mkdir(folder_name)
except:
print("Folder Exist with that name!")
folder_create()
download_images(images, folder_name)
def download_images(images, folder_name):
count = 0
print(f"Total {len(images)} Image Found!")
if len(images) != 0:
for i, image in enumerate(images):
try:
image_link = image["data-srcset"]
except:
try:
image_link = image["data-src"]
except:
try:
image_link = image["data-fallback-src"]
except:
try:
image_link = image["src"]
except:
pass
try:
r = requests.get(image_link).content
try:
# possibility of decode
r = str(r, 'utf-8')
except UnicodeDecodeError:
with open(f"{folder_name}/images{i+1}.jpg", "wb+") as f:
f.write(r)
count += 1
except:
pass
if count == len(images):
print("All Images Downloaded!")
else:
print(f"Total {count} Images Downloaded Out of {len(images)}")
def main(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.findAll('img')
folder_create(images)
url = input("Enter URL:- ")
main(url)`
如果您只想要圖片,那么您可以直接下載它們,甚至無需刪除網頁。 都具有相同的 URL:
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute1.jpg
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute2.jpg
...
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute10.jpg
如此簡單的代碼將為您提供所有圖像:
import os
import urllib
import urllib2
baseUrl = "http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-"\
"cutest-pics-gallery/cute%s.jpg"
for i in range(1,11):
url = baseUrl % i
urllib.urlretrieve(url, os.path.basename(url))
使用 Beautifulsoup,您必須單擊或轉到下一頁才能刪除圖像。 如果您想單獨廢棄每個頁面,請嘗試使用該類對它們進行刮擦,該類是shutterset_katrina-kaifs-top-10-cutest-pics-gallery
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.