[英]BoxOfficeMojo refusing connections from Selenium. Works fine with beautiful soup so it's not the actual connection
Anyone know what's going on?有谁知道发生了什么? I'm about to throw my computer out the window.
我即将把我的电脑扔出 window。 Beautiful soup is working fine with the same page, so I know it's not the connection.
美丽的汤在同一页面上运行良好,所以我知道这不是连接。 And I've tried putting
WebDriverWait
and time.sleep(10)
in between every line - after the delay, the same error comes up.而且我尝试将
WebDriverWait
和time.sleep(10)
放在每一行之间 - 在延迟之后,出现同样的错误。
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
chromedriver = "/usr/bin/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
i = 2
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1')
wait = WebDriverWait(driver, 10)
wait
while i < 269:
wait
driver.find_element_by_xpath('//*[@id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
wait
get_movie_dict(driver.current_url)
wait
i += 1
wait
driver.back()
I receive the followed errors:我收到以下错误:
ConnectionRefusedError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
156 conn = connection.create_connection(
--> 157 (self._dns_host, self.port), self.timeout, **extra_kw
158 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
83 if err is not None:
---> 84 raise err
85
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
671 headers=headers,
--> 672 chunked=chunked,
673 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
375 try:
--> 376 self._validate_conn(conn)
377 except (SocketTimeout, BaseSSLError) as e:
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
993 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 994 conn.connect()
995
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in connect(self)
299 # Add certificate verification
--> 300 conn = self._new_conn()
301 hostname = self.host
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
168 raise NewConnectionError(
--> 169 self, "Failed to establish a new connection: %s" % e
170 )
NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
719 retries = retries.increment(
--> 720 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
721 )
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
435 if new_retry.is_exhausted():
--> 436 raise MaxRetryError(_pool, url, error or ResponseError(cause))
437
MaxRetryError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-55-0dd26218976b> in <module>
9 driver.find_element_by_xpath('//*[@id="table"]/div/table[2]/tbody/tr[{}]/td[2]/a'.format(i)).click()
10 wait
---> 11 get_movie_dict(driver.current_url)
12 wait
13 i += 1
<ipython-input-45-2533561becb9> in get_movie_dict(link)
19 wait = WebDriverWait(driver, 10)
20 wait
---> 21 response = requests.get(url)
22 wait
23 page = response.text
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.comhttps', port=443): Max retries exceeded with url: //www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f42d8fec710>: Failed to establish a new connection: [Errno 111] Connection refused'))
Edit: Added the function get_movie_dict
:编辑:添加了 function
get_movie_dict
:
def get_movie_dict(link):
'''
From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
collect
- title
- domestic gross
- runtime
- MPAA rating
- full release date
Return information as a dictionary.
'''
base_url = 'https://www.boxofficemojo.com'
#Create full url to scrape
url = base_url + link
#Request HTML and parse
wait = WebDriverWait(driver, 10)
wait
response = requests.get(url)
wait
page = response.text
soup = BeautifulSoup(page,"lxml")
headers = ['movie_title', 'domestic_total_gross',
'runtime_minutes', 'rating', 'release_date', 'budget']
#Get title
title_string = soup.find('title').text
title = title_string.split('-')[0].strip()
#Get domestic gross
try:
raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
.find_all('span', class_='money')[0]
.text
)
except:
raw_domestic_total_gross = float("NaN")
if type(raw_domestic_total_gross) == float or type(raw_domestic_total_gross) == 'NoneType':
print('This is NaN')
domestic_total_gross = float("NaN")
else:
domestic_total_gross = money_to_int(raw_domestic_total_gross)
#Get runtime
raw_runtime = get_movie_value(soup,'Running')
if type(raw_runtime) != float and type(raw_runtime) != 'NoneType':
runtime = runtime_to_minutes(raw_runtime)
#Get rating
rating = get_movie_value(soup,'MPAA')
#Get release date
if '-' in get_movie_value(soup, 'Release Date'):
raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
elif '(' in get_movie_value(soup, 'Release Date'):
raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
else:
raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
release_date = to_date(raw_release_date)
# Get budget alt
raw_budget = get_movie_value(soup,'Budget')
budget = money_to_int(raw_budget)
#Create movie dictionary and return
movie_dict = dict(zip(headers,[title,
domestic_total_gross,
runtime,
rating,
release_date,
budget]))
return movie_dict
The link
that you are extracting from the page is "absolute" (it includes the scheme and hostname), when you add this to base_url
you are getting a string that looks like https://www.boxofficemojo.comhttps://www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1
您从页面中提取的
link
是“绝对的”(它包括方案和主机名),当您将其添加到base_url
时,您将获得一个类似于https://www.boxofficemojo.comhttps://www.boxofficemojo.com/release/rl1182631425/?ref_=bo_yld_table_1
You should use urljoin
to join the base url with the extracted url as it will handle both relative and absolute urls.您应该使用
urljoin
将基础 url 与提取的 url 连接起来,因为它将同时处理相对和绝对 url。
from urllib.parse import urljoin
url = urljoin(base_url, link)
I found out what the issue was.我发现了问题所在。 After removing this block from
get_movie_dict
the function worked properly:从
get_movie_dict
中删除此块后,function 工作正常:
#Request HTML and parse
wait = WebDriverWait(driver, 10)
wait
response = requests.get(url)
wait
page = response.text
soup = BeautifulSoup(page,"lxml")
get_movie_dict
is a helper function. get_movie_dict
是一个助手 function。 The line with response = requests.get(url)
was attempting to send another, unrelated GET request inside the helper function, which was unnecessary because one had already been sent outside of it - this is what was causing the problem.带有
response = requests.get(url)
的行试图在帮助程序 function 中发送另一个不相关的 GET 请求,这是不必要的,因为已经在它之外发送了一个 - 这就是导致问题的原因。
This is an example of why it's important to understand what each line of code is doing, before copying and pasting it into your own code.这是一个示例,说明为什么在将每行代码复制并粘贴到您自己的代码中之前了解每行代码在做什么很重要。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.