Some times I can effectively handle the socket.timeout, although some other times I get that socket timeout error and my script stops abruptly... Is there something I'm missing in my exception handling? how come it goes right trough it?
Happens randomly in either one of the following pieces of code:
First snippet:
for _ in range(max_retries):
try:
req = Request(url, headers={'User-Agent' :'Mozilla/5.0'})
response = urlopen(req,timeout=5)
break
except error.URLError as err:
print("URL that generated the error code: ", url)
print("Error description:",err.reason)
except error.HTTPError as err:
print("URL that generated the error code: ", url)
print("Error code:", err.code)
print("Error description:", err.reason)
except socket.timeout:
print("URL that generated the error code: ", url)
print("Error description: No response.")
except socket.error:
print("URL that generated the error code: ", url)
print("Error description: Socket error.")
if response.getheader('Content-Type').startswith('text/html'):
htmlBytes = response.read()
htmlString = htmlBytes.decode("utf-8")
self.feed(htmlString)
Second snippet
for _ in range(max_retries):
try:
req = Request(i, headers={'User-Agent' :'Mozilla/5.0'})
with urlopen(req,timeout=5) as response, open(aux, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
with open(path, fname), 'a') as f:
f.write(("link" + str(intaux) + "-" + auxstr + str(index) + i[-4:] + " --- " + metadata[index%batch] + '\n'))
break
except error.URLError as err:
print("URL that generated the error code: ", i)
print("Error description:",err.reason)
except error.HTTPError as err:
print("URL that generated the error code: ", i)
print("Error code:", err.code)
print("Error description:", err.reason)
except socket.timeout:
print("URL that generated the error code: ", i)
print("Error description: No response.")
except socket.error:
print("URL that generated the error code: ", i)
print("Error description: Socket error.")
The error:
Traceback (most recent call last):
File "/mydir/crawler.py", line 202, in <module>
spider("urls.txt", maxPages=0, debug=1, dailyRequests=9600)
File "/mydir/crawler.py", line 142, in spider
parser.getLinks(url + "?start=" + str(currbot) + "&tab=" + auxstr,auxstr)
File "/mydir/crawler.py", line 81, in getLinks
htmlBytes = response.read()
File "/usr/lib/python3.5/http/client.py", line 455, in read
return self._readall_chunked()
File "/usr/lib/python3.5/http/client.py", line 561, in _readall_chunked
value.append(self._safe_read(chunk_left))
File "/usr/lib/python3.5/http/client.py", line 607, in _safe_read
chunk = self.fp.read(min(amt, MAXAMOUNT))
File "/usr/lib/python3.5/socket.py", line 575, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.5/ssl.py", line 929, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.5/ssl.py", line 791, in read
return self._sslobj.read(len, buffer)
File "/usr/lib/python3.5/ssl.py", line 575, in read
v = self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
EDIT:
I noticed I missed a few lines of code thanks to @tdelaney I added them to the code above and I'm posting the solution I wrote if you post the solution or if you have a better approach to solve it I will mark the answer as correct
Solution:
for _ in range(max_retries):
try:
req = Request(url, headers={'User-Agent' :'Mozilla/5.0'})
response = urlopen(req,timeout=5)
break
except error.URLError as err:
print("URL that generated the error code: ", url)
print("Error description:",err.reason)
except error.HTTPError as err:
print("URL that generated the error code: ", url)
print("Error code:", err.code)
print("Error description:", err.reason)
except socket.timeout:
print("URL that generated the error code: ", url)
print("Error description: No response.")
except socket.error:
print("URL that generated the error code: ", url)
print("Error description: Socket error.")
if response.getheader('Content-Type').startswith('text/html'):
for _ in range(max_retries):
try:
htmlBytes = response.read()
htmlString = htmlBytes.decode("utf-8")
self.feed(htmlString)
break
except error.URLError as err:
print("URL that generated the error code: ", url)
print("Error description:",err.reason)
except error.HTTPError as err:
print("URL that generated the error code: ", url)
print("Error code:", err.code)
print("Error description:", err.reason)
except socket.timeout:
print("URL that generated the error code: ", url)
print("Error description: No response.")
except socket.error:
print("URL that generated the error code: ", url)
print("Error description: Socket error.")
The python "Requests" library uses its own set of exceptions to handle errors pertaining to the HTTP protocol as well as the socket. It automatically maps exceptions returned from it's embedded socket() functions to custom ones defined in requests.exceptions.
So the exceptions raised from this...
import Requests
try:
req = Request("http://stackoverflow.com", headers={'User-Agent' :'Mozilla/5.0'})
urlopen(req,timeout=5)
except Timeout:
print "Session Timed Out!"
Are equivalent to the exceptions raised by this...
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
s.connect(("127.0.0.1", 80))
except socket.timeout:
print "Session Timed Out"
Your fixed code...
for _ in range(max_retries):
try:
req = Request(url, headers={'User-Agent' :'Mozilla/5.0'})
response = urlopen(req,timeout=5)
break
except error.URLError as err:
print("URL that generated the error code: ", url)
print("Error description:",err.reason)
except error.HTTPError as err:
print("URL that generated the error code: ", url)
print("Error code:", err.code)
print("Error description:", err.reason)
except Timeout:
print("URL that generated the error code: ", url)
print("Error description: Session timed out.")
except ConnectionError:
print("URL that generated the error code: ", url)
print("Error description: Socket error timed out.")
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.