如何处理“ascii”编解码器无法编码字符“\xe9”错误？

Question

我正在尝试从网站下载 Excel 文件。 我的代码如下：

import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
import urllib

# Scrape the Elections Ontario results page and download every linked .xlsx file.
# NOTE(review): `headers` is defined here but is never passed to requests.get().
headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge")
soup = BeautifulSoup(resp.text,"html.parser")

for link in soup.find_all('a', href=True):
#    print(link)
    # Plain substring test on the href: any link mentioning 'xlsx' is fetched.
    if 'xlsx' in link['href']:
        print(link['href'])
        # The href is appended unchanged, so any non-ASCII character it
        # contains ends up in `url` without being percent-encoded.
        url="https://www.elections.on.ca/"+link['href']
#        print(url)
        # Local file name: last path segment up to its first '.', plus ".xlsx".
        file= url.split("/")[-1].split(".")[0]+".xlsx"
#        print(file)
        # This is the call that raises UnicodeEncodeError when `url` contains
        # non-ASCII characters.
        urllib.request.urlretrieve(url, file)

但是，当代码尝试打开 https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20OrlÃ©ans%20076.xlsx 时，出现了以下错误：

UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
      8         file= url.split("/")[-1].split(".")[0]+".xlsx"
      9 #        print(file)
---> 10         urllib.request.urlretrieve(url, file)
...

UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).

编辑：我尝试了问题“UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128)”中给出的 safeStr 解决方案，但它不起作用。请看下面：

def safeStr(obj):
    """Return ``str(obj)`` with every non-ASCII character removed.

    Characters that cannot be encoded as ASCII are dropped, not replaced or
    percent-encoded, so ``"Orléans"`` becomes ``"Orlans"``.  Applied to a
    URL, the result therefore names a different resource than the original.

    Args:
        obj: Any object; it is converted with ``str()`` first.

    Returns:
        The ASCII-only text, or ``""`` if the conversion raises.
    """
    try:
        return str(obj).encode('ascii', 'ignore').decode('ascii')
    except Exception:
        # Best-effort fallback for objects whose __str__ fails.  Catching
        # Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""

# Second attempt: strip the non-ASCII characters from the URL before downloading.
# The path already has its spaces encoded as %20, but the "é" is still a raw
# non-ASCII character.
url="https://www.elections.on.ca/"+'/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx'
#        print(url)
print(url)
file= url.split("/")[-1].split(".")[0]+".xlsx"
# safeStr() drops the "é" altogether, so the URL now says "Orlans" instead of
# "Orléans" and no longer matches the file on the server.
url = safeStr(url)
print(url)
#        print(file)
urllib.request.urlretrieve(url, file)

我得到的错误是：

https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orlans%20076.xlsx

HTTPError                                 Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
      6 print(url)
      7 #        print(file)
----> 8 urllib.request.urlretrieve(url, file)

~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    245     url_type, path = _splittype(url)
    246 
--> 247     with contextlib.closing(urlopen(url, data)) as fp:
    248         headers = fp.info()
    249 

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found

我还尝试了问题“urlretrieve cannot get image from url contains unicode string”中的另一种解决方案，但它也不起作用：

# Third attempt: percent-encode the path with quote().
# The path already contains %20 escapes, so quote() encodes each "%" again as
# "%25" (giving %2520), while "é" becomes %C3%A9.
url = "https://www.elections.on.ca/"+urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx')
#url = safeStr(url)
print(url)
urllib.request.urlretrieve(url, file)

我得到的错误是：

https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%2520Totals%2520From%2520Official%2520Tabulation%2520-%2520Orl%C3%A9ans%2520076.xlsx

HTTPError                                 Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
      2 #url = safeStr(url)
      3 print(url)
----> 4 urllib.request.urlretrieve(url, file)

~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    245     url_type, path = _splittype(url)
    246 
--> 247     with contextlib.closing(urlopen(url, data)) as fp:
    248         headers = fp.info()
    249 

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found

Answer 1

我认为这是一个解决方案...

问题在于您一开始使用的网址：

"https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx"

已经经过 URL 编码（例如空格已被替换为 %20），但其中的 Orléans 仍然包含非 ASCII 字符。

因此，您提到的那个问题中的解决方案对我们有帮助，但直接应用 urllib.parse.quote(...) 会把已有的 %20 中的“%”再次编码为 %25，使空格被编码两次而变成 %2520。这就是请求处理后的 URL 时会收到 404 的原因。

所以，我们首先需要对 URL 进行解码（即 %20 -> " "），然后再重新编码——这一次重音字符也会被编码，这样应该就可以正常工作了。

尝试这个：

# Decode the existing escapes first, then encode the whole path once, so that
# %20 is not double-encoded and "é" becomes %C3%A9.
path = urllib.parse.quote(urllib.parse.unquote(link['href']))
# The base URL has no trailing slash here; presumably the href already starts with "/".
url = "https://www.elections.on.ca" + path

我们得到的结果是：

https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx

...现在应该可以工作了！

如何处理“ascii”编解码器无法编码字符“\xe9”错误？

问题描述

1 个解决方案

解决方案1
0 已采纳 2022-06-14 18:02:35

如何处理“ascii”编解码器无法编码字符“\xe9”错误？

问题描述

1 个解决方案

解决方案1 0 已采纳 2022-06-14 18:02:35

解决方案1
0 已采纳 2022-06-14 18:02:35