[英]How to deal with the 'ascii' codec can't encode character '\xe9' error?
我正在尝试从网站下载 Excel 文件。 我的代码如下:
import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
import urllib
# Scrape the Elections Ontario results page and download every linked .xlsx
# file into the current working directory.
#
# This is the script exactly as posted in the question; only the block
# indentation (lost when the page was extracted) has been restored so that
# the loop and the `if` body are valid Python again.
headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# NOTE(review): `headers` is defined above but is not passed to requests.get.
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge")
soup = BeautifulSoup(resp.text,"html.parser")
for link in soup.find_all('a', href=True):
    # print(link)
    if 'xlsx' in link['href']:
        print(link['href'])
        # The href is appended as-is.  An href containing a non-ASCII
        # character (e.g. the "é" in "Orléans") is what makes urlretrieve
        # raise the UnicodeEncodeError reported in the question.
        url="https://www.elections.on.ca/"+link['href']
        # print(url)
        # Local file name: last path segment up to its first ".", plus ".xlsx".
        file= url.split("/")[-1].split(".")[0]+".xlsx"
        # print(file)
        urllib.request.urlretrieve(url, file)
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
8 file= url.split("/")[-1].split(".")[0]+".xlsx"
9 # print(file)
---> 10 urllib.request.urlretrieve(url, file)
...
UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).
编辑:我尝试了来自问题 “UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128)” 的 safeStr 解决方案,但它不起作用。请看下面:
def safeStr(obj):
    """Return ``str(obj)`` with every non-ASCII character removed.

    Characters that cannot be encoded as ASCII are dropped, not replaced,
    so ``"Orléans"`` becomes ``"Orlans"``.  If converting ``obj`` to a string
    raises an ordinary exception, an empty string is returned instead.
    """
    try:
        return str(obj).encode('ascii', 'ignore').decode('ascii')
    except Exception:
        # Best-effort fallback for objects whose __str__ fails.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being silently turned into "".
        return ""
# Second attempt from the question: strip the non-ASCII characters from the
# URL with safeStr before downloading.
url="https://www.elections.on.ca/"+'/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
# print(url)
print(url)
# The local file name is derived from the URL before safeStr modifies it.
file= url.split("/")[-1].split(".")[0]+".xlsx"
# safeStr drops the "é" outright, so the URL now points at "...Orlans%20076.xlsx",
# which is a different path from the one the page linked to.
url = safeStr(url)
print(url)
# print(file)
# The request for the altered URL is what produces the HTTP 404 shown below.
urllib.request.urlretrieve(url, file)
我得到的错误是:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orlans%20076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
6 print(url)
7 # print(file)
----> 8 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
我从urlretrieve cannot get image from url contains unicode string 的问题尝试了另一种解决方案,但它也不起作用:
# Third attempt from the question: percent-encode the path with quote().
# The path already contains %20 escapes, and quote() encodes their "%" again,
# so every space ends up double-encoded as %2520 (see the printed URL below).
url = "https://www.elections.on.ca/"+urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx')
#url = safeStr(url)
print(url)
# NOTE(review): `file` is not assigned in this snippet; presumably it is still
# set from the previous attempt - confirm.
urllib.request.urlretrieve(url, file)
我得到的错误是:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%2520Totals%2520From%2520Official%2520Tabulation%2520-%2520Orl%C3%A9ans%2520076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
2 #url = safeStr(url)
3 print(url)
----> 4 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
我认为这是一个解决方案...
问题在于您一开始使用的网址:
"https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx"
已经经过了 URL 编码(percent-encoding,例如空格已被替换为 %20),
但其中的 Orléans
仍然包含非 ASCII 字符(é)。
因此,上面提到的那个问题中的解决方案(使用 urllib.parse.quote)对我们是有帮助的,但如果直接对这个网址应用 urllib.parse.quote(...),
已有的 %20 中的 “%” 会被再编码一次,空格就变成了两次编码后的 %2520。
这就是请求处理后的 URL 时会收到 404 的原因。
所以我们需要先对 URL 进行解码(unquote,即 %20 -> " "),
然后再重新编码(quote)。这一次重音字符也会被编码,这样就应该可以正常工作了。
尝试这个:
# Normalise the href before building the download URL: decode the escapes
# that are already present (%20 -> " "), then percent-encode the whole path
# once, so that non-ASCII characters are escaped too and nothing is
# double-encoded.
raw_href = link['href']
path = urllib.parse.quote(urllib.parse.unquote(raw_href))
url = f"https://www.elections.on.ca{path}"
我们得到的结果是:
https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx
...现在应该可以工作了!
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.