Text Preprocessing Translation Error Python

Question

I was trying to translate tweet text using a deep translator but I found some issues. Before translating the texts, I did some text preprocessing such as cleaning, removing emoji, etc. This is the ddefined functions of pre-processing :

def deEmojify(text):
    regrex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return regrex_pattern.sub(r'',text)
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # remove hashtag
    text = re.sub(r'RT[\s]', '', text) # remove RT
    text = re.sub(r"http\S+", '', text) # remove link
    text = re.sub(r"[!@#$]", '', text) # remove link
    text = re.sub(r'[0-9]+', '', text) # remove numbers

    text = text.replace('\n', ' ') # replace new line into space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
    text = text.strip(' ') # remove characters space from both left and right text
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case
    text = text.lower() 
    return text

def tokenizingText(text): # Tokenizing or splitting a string, text into a list of tokens
    text = word_tokenize(text) 
    return text

def filteringText(text): # Remove stopwors in a text
    listStopwords = set(stopwords.words('indonesian'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered 
    return text
def stemmingText(text): # Reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = [stemmer.stem(word) for word in text]
    return text
def convert_eng(text):
    text = GoogleTranslator(source='auto', target='en').translate_batch(text)
    return text

And here's the translate function :

def convert_eng(text):
    text = GoogleTranslator(source='auto', target='en').translate(text)
    return text

this is an example of the expected result ( text in Indonesian)

text = '@jshuahaee Ketemu agnes mo lagii😍😍'
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)

stem = stemmingText(filter)
print('After Stem ==> ', stem)

en = convert_eng(stem)
print('After translate ==> ', en)

Result :

After cleaning ==>  Ketemu agnes mo lagii😍😍
After emoji ==>  Ketemu agnes mo lagii
After case folding ==>  ketemu agnes mo lagii
After token ==>  ['ketemu', 'agnes', 'mo', 'lagii']
After filter ==>  ['ketemu', 'agnes', 'mo', 'lagii']
After Stem ==>  ['ketemu', 'agnes', 'mo', 'lagi']
After translate ==>  ['meet', 'agnes', 'mo', 'again']

But, I found issues when the sentences contain some dots, the error happened when after the stem process the text contain of [''] ( I don't know how to call this)

text = 'News update Meski kurang diaspirasi Shoppee yg korea minded  dalam waktu indonesa belaja di bulan November Lazada 1… '
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)

stem = stemmingText(filter)
print('After Stem ==> ', stem)

en = convert_eng(stem)
print('After translate ==> ', en)

Result

After cleaning ==>  News update Meski kurang diaspirasi Shoppee yg korea minded  dalam waktu indonesa belaja di bulan November Lazada …
After emoji ==>  News update Meski kurang diaspirasi Shoppee yg korea minded  dalam waktu indonesa belaja di bulan November Lazada …
After case folding ==>  news update meski kurang diaspirasi shoppee yg korea minded  dalam waktu indonesa belaja di bulan november lazada …
After token ==>  ['news', 'update', 'meski', 'kurang', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'dalam', 'waktu', 'indonesa', 'belaja', 'di', 'bulan', 'november', 'lazada', '…']
After filter ==>  ['news', 'update', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'belaja', 'november', 'lazada', '…']
After Stem ==>  ['news', 'update', 'aspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'baja', 'november', 'lazada', '']

This is the error message

NotValidPayload                           Traceback (most recent call last)
<ipython-input-40-cb9390422d3c> in <module>
     14 print('After Stem ==> ', stem)
     15 
---> 16 en = convert_eng(stem)
     17 print('After translate ==> ', en)

<ipython-input-28-28bc36c96914> in convert_eng(text)
      8     return text
      9 def convert_eng(text):
---> 10     text = GoogleTranslator(source='auto', target='en').translate_batch(text)
     11     return text

C:\Python\lib\site-packages\deep_translator\google_trans.py in translate_batch(self, batch, **kwargs)
    195         for i, text in enumerate(batch):
    196 
--> 197             translated = self.translate(text, **kwargs)
    198             arr.append(translated)
    199         return arr

C:\Python\lib\site-packages\deep_translator\google_trans.py in translate(self, text, **kwargs)
    108         """
    109 
--> 110         if self._validate_payload(text):
    111             text = text.strip()
    112 

C:\Python\lib\site-packages\deep_translator\parent.py in _validate_payload(payload, min_chars, max_chars)
     44 
     45         if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit():
---> 46             raise NotValidPayload(payload)
     47 
     48         # check if payload contains only symbols

NotValidPayload:  --> text must be a valid text with maximum 5000 character, otherwise it cannot be translated

My idea is to remove the '' , i think that was the problem, but I have no idea how to do that. Anyone, please kindly help me

Answer 1

You need to introduce a bit of error checking into your code, and only process an expected data type. Your convert_eng function requires a non-empty string as an argument (see if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit(): part), and your stem contains an empty string as the last item in the list.

Besides, it is possible that filteringText(text) can return [] because all words can turn out to be stopwords. Also, do not use filter as a name of a variable, it is a built-in.

So, change

filter= filteringText(token)
print('After filter ==> ', filter)

stem = stemmingText(filter)
print('After Stem ==> ', stem)

to

filter1 = filteringText(token)
print('After filter ==> ', filter1)

if filter1:
    stem = stemmingText(filter1)
    print('After Stem ==> ', stem)
    en = [] # as stem_cleaned as a list
    for i in stem_cleaned:
        if len(i.strip()) > 0:        # If the item length is greater than 0
            en.append(convert_eng(i)) # Translate and append to 'en'

    print('After translate ==> ', en)

Text Preprocessing Translation Error Python

Question

1 answers

solution1
0 2021-11-26 21:18:34

Text Preprocessing Translation Error Python

Question

1 answers

solution1 0 2021-11-26 21:18:34

solution1
0 2021-11-26 21:18:34