I was trying to translate tweet text using a deep translator but I found some issues. Before translating the texts, I did some text preprocessing such as cleaning, removing emoji, etc. This is the ddefined functions of pre-processing :
def deEmojify(text):
regrex_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002500-\U00002BEF" # chinese char
u"\U00002702-\U000027B0"
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
u"\U0001f926-\U0001f937"
u"\U00010000-\U0010ffff"
u"\u2640-\u2642"
u"\u2600-\u2B55"
u"\u200d"
u"\u23cf"
u"\u23e9"
u"\u231a"
u"\ufe0f" # dingbats
u"\u3030"
"]+", re.UNICODE)
return regrex_pattern.sub(r'',text)
def cleaningText(text):
text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove mentions
text = re.sub(r'#[A-Za-z0-9]+', '', text) # remove hashtag
text = re.sub(r'RT[\s]', '', text) # remove RT
text = re.sub(r"http\S+", '', text) # remove link
text = re.sub(r"[!@#$]", '', text) # remove link
text = re.sub(r'[0-9]+', '', text) # remove numbers
text = text.replace('\n', ' ') # replace new line into space
text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
text = text.strip(' ') # remove characters space from both left and right text
return text
def casefoldingText(text): # Converting all the characters in a text into lower case
text = text.lower()
return text
def tokenizingText(text): # Tokenizing or splitting a string, text into a list of tokens
text = word_tokenize(text)
return text
def filteringText(text): # Remove stopwors in a text
listStopwords = set(stopwords.words('indonesian'))
filtered = []
for txt in text:
if txt not in listStopwords:
filtered.append(txt)
text = filtered
return text
def stemmingText(text): # Reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words
factory = StemmerFactory()
stemmer = factory.create_stemmer()
text = [stemmer.stem(word) for word in text]
return text
def convert_eng(text):
text = GoogleTranslator(source='auto', target='en').translate_batch(text)
return text
And here's the translate function :
def convert_eng(text):
text = GoogleTranslator(source='auto', target='en').translate(text)
return text
this is an example of the expected result ( text in Indonesian)
text = '@jshuahaee Ketemu agnes mo lagii😍😍'
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result :
After cleaning ==> Ketemu agnes mo lagii😍😍
After emoji ==> Ketemu agnes mo lagii
After case folding ==> ketemu agnes mo lagii
After token ==> ['ketemu', 'agnes', 'mo', 'lagii']
After filter ==> ['ketemu', 'agnes', 'mo', 'lagii']
After Stem ==> ['ketemu', 'agnes', 'mo', 'lagi']
After translate ==> ['meet', 'agnes', 'mo', 'again']
But, I found issues when the sentences contain some dots, the error happened when after the stem process the text contain of [''] ( I don't know how to call this)
text = 'News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada 1… '
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result
After cleaning ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After emoji ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After case folding ==> news update meski kurang diaspirasi shoppee yg korea minded dalam waktu indonesa belaja di bulan november lazada …
After token ==> ['news', 'update', 'meski', 'kurang', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'dalam', 'waktu', 'indonesa', 'belaja', 'di', 'bulan', 'november', 'lazada', '…']
After filter ==> ['news', 'update', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'belaja', 'november', 'lazada', '…']
After Stem ==> ['news', 'update', 'aspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'baja', 'november', 'lazada', '']
This is the error message
NotValidPayload Traceback (most recent call last)
<ipython-input-40-cb9390422d3c> in <module>
14 print('After Stem ==> ', stem)
15
---> 16 en = convert_eng(stem)
17 print('After translate ==> ', en)
<ipython-input-28-28bc36c96914> in convert_eng(text)
8 return text
9 def convert_eng(text):
---> 10 text = GoogleTranslator(source='auto', target='en').translate_batch(text)
11 return text
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate_batch(self, batch, **kwargs)
195 for i, text in enumerate(batch):
196
--> 197 translated = self.translate(text, **kwargs)
198 arr.append(translated)
199 return arr
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate(self, text, **kwargs)
108 """
109
--> 110 if self._validate_payload(text):
111 text = text.strip()
112
C:\Python\lib\site-packages\deep_translator\parent.py in _validate_payload(payload, min_chars, max_chars)
44
45 if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit():
---> 46 raise NotValidPayload(payload)
47
48 # check if payload contains only symbols
NotValidPayload: --> text must be a valid text with maximum 5000 character, otherwise it cannot be translated
My idea is to remove the '' , i think that was the problem, but I have no idea how to do that. Anyone, please kindly help me
You need to introduce a bit of error checking into your code, and only process an expected data type. Your convert_eng
function requires a non-empty string as an argument (see if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit():
part), and your stem
contains an empty string as the last item in the list.
Besides, it is possible that filteringText(text)
can return []
because all words can turn out to be stopwords. Also, do not use filter
as a name of a variable, it is a built-in.
So, change
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
to
filter1 = filteringText(token)
print('After filter ==> ', filter1)
if filter1:
stem = stemmingText(filter1)
print('After Stem ==> ', stem)
en = [] # as stem_cleaned as a list
for i in stem_cleaned:
if len(i.strip()) > 0: # If the item length is greater than 0
en.append(convert_eng(i)) # Translate and append to 'en'
print('After translate ==> ', en)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.