I'm not very familiar with data scraping and I'm unable to download images using Beautiful Soup.
I need to download all images from a website. I'm using the code below:
import re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Download every <img> on `site` whose src ends in .jpg/.gif/.png into the
# current working directory, using the last path component as the file name.
site = 'http://someurl.org/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')

# Only keep <img> tags that actually carry a src attribute.
img_tags = soup.find_all('img', src=True)
print('img_tags: ')
print(img_tags)

urls = [img['src'] for img in img_tags]
print('urls: ')
print(urls)

# The stem may itself contain dots (e.g. "abfc-2345345234.thumb.png"), and a
# bare relative src such as "picture.jpg" has no leading slash at all.
filename_pattern = re.compile(r'(?:^|/)([\w.-]+[.](?:jpg|gif|png))$')

for url in urls:
    filename = filename_pattern.search(url)
    if filename is None:
        # Not a recognised image name: skip it instead of crashing on
        # filename.group(1).
        print('skipping: {}'.format(url))
        continue

    # An image source can be relative ("img.png", "/a/img.png", "../img.png")
    # or protocol-relative ("//host/img.png"); urljoin resolves all of these
    # against the page URL and leaves absolute URLs untouched.
    url = urljoin(site, url)

    # Download first, so a failed request does not leave an empty file behind.
    response = requests.get(url)
    with open(filename.group(1), 'wb') as f:
        f.write(response.content)
However, this ignores all images on the page whose HTML looks similar to this:
<img data-bind="attr: { src: thumbURL() }" src="/assets/images/submissions/abfc-2345345234.thumb.png">
I assume this is because the data-bind attribute also contains the string "src", but I can't seem to figure it out.
You need to use Selenium or some other tool that can run JavaScript. The code below loads the page and waits until the image elements are found:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load the page in a real browser so that JavaScript-generated <img> tags
# exist in the DOM before the HTML is handed to BeautifulSoup.
site = 'http://phylopic.org/'
dr = webdriver.Chrome()
try:
    dr.get(site)
    try:
        # Poll every 0.5 s, for up to 20 s, until an element with class
        # "span1" becomes visible.
        WebDriverWait(dr, 20, 0.5).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "span1"))
        )
    except TimeoutException:
        # Only a wait timeout is tolerated here; any other error propagates.
        print("Wait a bit more")
        time.sleep(5)
    text = dr.page_source
finally:
    # Always release the browser, even if loading or waiting failed.
    dr.quit()

soup = BeautifulSoup(text, "lxml")
imgs = soup.find_all('img')
print(imgs)
The second problem is how to convert a relative path to an absolute path. There are a few types of relative paths in HTML.
When the url is http://someurl.org/somefd/somefd2
<img src="picture.jpg"> http://someurl.org/somefd/somefd2/picture.jpg
<img src="images/picture.jpg"> http://someurl.org/somefd/somefd2/images/picture.jpg
<img src="/images/picture.jpg"> http://someurl.org/images/picture.jpg
<img src="../picture.jpg"> http://someurl.org/somefd/picture.jpg
This is my code to convert a relative path (rp) to an absolute path (ap).
import re
site = 'https://en.wikipedia.org/wiki/IMAGE'


def r2a(path, site=site):
    """Convert a (possibly relative) image path to an absolute URL.

    ``site`` is treated as a directory: its last path component is a folder,
    not a file, so ``"a.jpg"`` resolves to ``<site>/a.jpg``.

    Args:
        path: The value of an ``src`` attribute. May be an absolute URL,
            protocol-relative (``//host/x``), root-relative (``/x``), or
            relative to the current folder (``x``, ``./x``, ``../x``).
        site: URL of the folder the path is relative to. Defaults to the
            module-level ``site``.

    Returns:
        The absolute URL as a string. ``..`` segments that would climb above
        the site root are dropped.
    """
    from urllib.parse import urljoin

    if path.startswith(("http://", "https://")):
        # Already a full http(s) URL: return it untouched.
        return path
    if path.startswith("//"):
        # Protocol-relative URL. The scheme is deliberately fixed to "http:"
        # here (not inherited from `site`) to keep the existing behavior.
        return "http:" + path

    # Exactly one trailing slash makes urljoin treat the last component of
    # `site` as a directory and avoids "//" when `site` already ends in "/".
    base = site.rstrip("/") + "/"
    return urljoin(base, path)
# Sanity checks for r2a against the default site: (input path, expected URL).
_r2a_cases = [
    # relative to the current folder
    ("a.jpg", "https://en.wikipedia.org/wiki/IMAGE/a.jpg"),
    ("unknow/a.jpg", "https://en.wikipedia.org/wiki/IMAGE/unknow/a.jpg"),
    # relative to the site root
    ("/unknow/a.jpg", "https://en.wikipedia.org/unknow/a.jpg"),
    # one and two levels up
    ("../a.jpg", "https://en.wikipedia.org/wiki/a.jpg"),
    ("../../a.jpg", "https://en.wikipedia.org/a.jpg"),
    # already absolute
    ("https://en.wikipedia.org/wiki/IMAGE/a.jpg", "https://en.wikipedia.org/wiki/IMAGE/a.jpg"),
    # protocol-relative
    ("//en.wikipedia.org/", "http://en.wikipedia.org/"),
]
for _given, _expected in _r2a_cases:
    assert _expected == r2a(_given)
The technical post webpages of this site follow the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.