[英]UTF-8 Trouble encoding UTF-8 characters in my python code. They show up as literal UTF-8
我有一份清单
[[“由于本周末的暴风雨,我们将布鲁门菲尔德自行车骑行的时间重新安排在 2 月 26 日。希望能在那里见到你。\\xe2\\x80\\xa6 '”], ['本周末阳光充足,好好利用海滩只需 $\\xe2\\x80\\xa6 '] 即可让您从 Woodland Hills 到海滩的巴士,[“RT @LHansenLA:昨天在@LAPPL @EagleandBadge 内窥视了观看结束纪念墙的新装备。向堕落者致敬@LAPD w/\\xe2\\x80\\xa6'"], ["很高兴加入 Art Sherman 和 Wings Over @Wendys 以纪念退伍军人以及由 Ron 和 \\xe2\\x80\\xa6 主持的 15 年每周会议'"],[ “和我一起参加第 4 届年度 Blumenfield 自行车骑行。在两个轮子上享受西谷。回复:'”]]
如您所见,不幸的是,列表显示的是文字 UTF-8 而不是字符本身。 在我的代码中的某个时候,我编码为 UTF-8
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
为了删除 b 前缀,以上代码都是必要的。 这些不能出现在我的推文中,因为我正在做机器学习分析并且让 bs 影响它。
如何用实际字符替换 UTF-8 脚本?
我需要以某种方式对其进行编码,因为我从(3 个城市)x(50 名官员)x(每个城市 12 个月的推文)中提取推文,因此尝试手动替换它们的效率是不可能的。
import tweepy #https://github.com/tweepy/tweepy
#Twitter API credentials
consumer_key = "insert key here"
consumer_secret = "insert key here"
access_key = "insert key here"
access_secret = "insert key here"
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import json
import csv
import datetime
from datetime import datetime
import os.path
failed_accounts = []
def get_all_tweets(screen_name,mode):
#try:
#Twitter only allows access to a users most recent 3240 tweets with this method
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = screen_name,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
i = 0
num_req = 0
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print ("...%s tweets downloaded so far" % (len(alltweets)))
num_req = num_req + 1
# makes further requests only if batch doesn't contain tweets beyond oldest limit
oldest_limit = datetime(2016, 1, 20,0,0,0)
x = 0
for tweet in new_tweets:
raw_date = tweet.created_at
if raw_date < oldest_limit:
x = 1
else:
continue
if x == 1:
break
#BSP this script is designed to just keep going. I want it to stop.
#i = i + 1
#if i == 10:
# break
print("Number of Tweet Request Rounds: %s" %num_req)
correct_date_tweet = []
for tweet in alltweets:
raw_date = tweet.created_at
#date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
newest_limit = datetime(2017, 1, 20,0,0,0)
oldest_limit = datetime(2016, 1, 20,0,0,0)
if raw_date > oldest_limit and raw_date < newest_limit:
correct_date_tweet.append(tweet)
else:
continue
#transform the tweepy tweets into a 2D array that will populate the csv
if mode == "tweets only" or "instance file":
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
outtweets = [["1 ",stuff.replace('"', "")] for sublist in outtweets for stuff in sublist]
#outtweets = [["1 ",stuff] for sublist in outtweets for stuff in sublist]
else:
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"),tweet.retweet_count,tweet.favorite_count,len(tweet.entities.get("hashtags")),len(tweet.entities.get("urls")),len(tweet.entities.get("user_mentions"))] for tweet in correct_date_tweet]
#write the csv
if mode == "instance file":
with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
writer = csv.writer(f)
writer.writerows(outtweets)
else:
with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
writer = csv.writer(f)
if mode != "tweets only":
writer.writerow(["id","created_at","text","retweets","favorites","hashtags","urls"])
writer.writerows(outtweets)
pass
print("Done with %s" % screen_name)
get_all_tweets("BobBlumenfield","instance file")
根据答案,我尝试将其中一行更改为outtweets = [[tweet.text] for tweet in correct_date_tweet]
但这不起作用,因为它产生了
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-12-a864b5efe8af> in <module>()
----> 1 get_all_tweets("BobBlumenfield","instance file")
<ipython-input-9-d0b9b37c7261> in get_all_tweets(screen_name, mode)
104 with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
105 writer = csv.writer(f)
--> 106 writer.writerows(outtweets)
107 else:
108 with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
C:\Users\Stan Shunpike\Anaconda3\lib\encodings\cp1252.py in encode(self, input, final)
17 class IncrementalEncoder(codecs.IncrementalEncoder):
18 def encode(self, input, final=False):
---> 19 return codecs.charmap_encode(input,self.errors,encoding_table)[0]
20
21 class IncrementalDecoder(codecs.IncrementalDecoder):
UnicodeEncodeError: 'charmap' codec can't encode characters in position 64-65: character maps to <undefined>
删除以下行:
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
原因如下:
b
。str
。 在这种模式下,您将获得数组的表示,包括类型,因此b
和 UTF-8 转义。open()
的内置编码器,您很少需要自己调用.encode()
。 在文本模式下使用open()
时,请始终指定编码,因为每个平台都不同。
从您的代码中删除.encode()
所有其他用途。
您现在可以删除试图更正错误的其他行。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.