I want to use TwitterSearch to import tweets into a CSV file. However, the script doesn't handle special characters (for example, accents in French). I've tried several things, like adding .encode('utf-8'), without any success.
If I try to write:
tweet_text = tweet['text'].strip().encode('utf-8', 'ignore')
Then I get
Traceback (most recent call last):
  File "/Users/usr/Documents/Python/twitter_search2.py", line 56, in <module>
    get_tweets(query, max_tweets)
  File "/Users/usr/Documents/Python/twitter_search2.py", line 44, in get_tweets
    print('@%s: %s' % (user, tweet_text))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 32: ordinal not in range(128)
Does anybody have an idea?
I'm on Python 2.7. The code is:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from TwitterSearch import *
import csv
def get_tweets(query, max = 10):
    """Search Twitter for `query` and save the results to `<query>.csv`.

    A header row is written first, then one row per tweet with the columns
    user, time, tweet, latitude and longitude. Each tweet is also echoed to
    stdout, prefixed by a running counter and its timestamp.

    query -- search keyword; also used as the stem of the CSV file name.
    max   -- tweet limit (note that the name shadows the builtin `max`).
             The counter is compared with `>` after it has been incremented,
             so max + 1 rows are written before the function returns.

    A TwitterSearchException raised while searching is caught and printed
    instead of being propagated to the caller.
    """
    i = 0
    search = query
    # 'wb': the Python 2 csv module expects the file to be opened in binary mode.
    with open(search+'.csv', 'wb') as outf:
        writer = csv.writer(outf)
        writer.writerow(['user','time','tweet','latitude','longitude'])
        try:
            # Describe the search: a single keyword, with entities included.
            tso = TwitterSearchOrder()
            tso.set_keywords([search])
            tso.set_include_entities(True)
            # tso.set_language('fr')
            # Placeholder credentials; replace them with real API keys.
            ts = TwitterSearch(
                consumer_key = 'YOUR CONSUMER KEY',
                consumer_secret = 'YOUR CONSUMER SECRET',
                access_token = 'YOUR ACCESS TOKEN',
                access_token_secret = 'YOUR ACCESS TOKEN SECRET'
            )
            for tweet in ts.search_tweets_iterable(tso):
                # Coordinates stay None unless the tweet carries usable geo data.
                lat = None
                long = None
                time = tweet['created_at']
                user = tweet['user']['screen_name']
                # Encoding to ASCII with 'ignore' silently drops every
                # non-ASCII character, so accented letters vanish here.
                tweet_text = tweet['text'].strip().encode('ascii', 'ignore')
                # Remove line breaks so the tweet is a single line of text.
                tweet_text = ''.join(tweet_text.splitlines())
                # The trailing comma keeps the next print on the same line.
                print i,time,
                if tweet['geo'] != None and tweet['geo']['coordinates'][0] != 0.0: # avoiding bad values
                    lat = tweet['geo']['coordinates'][0]
                    long = tweet['geo']['coordinates'][1]
                    # Python 2 print statement with three items: the formatted
                    # text, then lat and long, separated by spaces.
                    print('@%s: %s' % (user, tweet_text)), lat, long
                else:
                    print('@%s: %s' % (user, tweet_text))
                writer.writerow([user, time, tweet_text, lat, long])
                i += 1
                if i > max:
                    # `return()` returns an empty tuple and ends the search.
                    return()
        except TwitterSearchException as e:
            print(e)
# Prompt for the search term ("Recherche" is French for "Search").
query = raw_input ("Recherche : ")
# Tweet limit handed to get_tweets as its `max` argument.
max_tweets = 10
get_tweets(query, max_tweets)
Thank you very much for your help!
You are interpolating the encoded tweet together with the username:
print('@%s: %s' % (user, tweet_text))
If the user object is a Unicode string, this will fail:
>>> user = u'Héllo'
>>> tweet_text = u'Héllo'.encode('utf8')
>>> '@%s: %s' % (user, tweet_text)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: ordinal not in range(128)
because you are mixing types. Python tries to decode the tweet_text value (implicitly, using the default ASCII codec) to make it a unicode object again.
Stick to one type: either encode everything, or leave everything Unicode and encode at the last possible moment.
You'll have to encode your user value for the CSV file anyway, so leave encoding of the tweet until then:
# Keep the tweet text as a unicode object while it is being processed.
tweet_text = tweet['text'].strip()
# Join with a unicode literal so the result stays unicode.
tweet_text = u''.join(tweet_text.splitlines())
print i, time,
# Truthiness test: skips a missing geo entry and a zero first coordinate.
if tweet['geo'] and tweet['geo']['coordinates'][0]:
    lat, long = tweet['geo']['coordinates'][:2]
    # Unicode format string, so the values are interpolated as unicode.
    print u'@%s: %s' % (user, tweet_text), lat, long
else:
    print u'@%s: %s' % (user, tweet_text)
# Encode to UTF-8 only here, at the point where the row is written out.
writer.writerow([user.encode('utf8'), time.encode('utf8'),
                 tweet_text.encode('utf8'), lat, long])
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.