[英]Preprocessing data: removing Italian stopwords for text analysis
! pip install stop-words
from stop_words import get_stop_words
stop = get_stop_words('italian')
import re
# helper function to clean tweets
def processTweet(tweet, stopwords=None):
    """Clean a raw tweet so that only meaningful lowercase words remain.

    The cleaning steps are applied in this order: strip HTML entities,
    @mentions and $tickers; lowercase; strip hyperlinks and hashtags; replace
    leftover URL-like tokens, tokens containing digits and basic punctuation
    with spaces; optionally strip stop words; strip words of three or fewer
    characters; strip characters outside the Basic Multilingual Plane; and
    finally collapse all whitespace.

    Args:
        tweet: The tweet text as a string.
        stopwords: Optional iterable of words to delete. Only whole words are
            removed and matching ignores case. When None (the default) no
            stop-word filtering is performed.

    Returns:
        The cleaned tweet, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks. The match stops at the first whitespace so that text
    # following the link is kept even when it contains a '/' later on.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Replace leftover mentions, '#', URL-like tokens, tokens containing a
    # digit and basic punctuation with a space
    tweet = re.sub(
        r'(@[A-Za-z0-9]+)|(#)|(\w+:\/\/\S+)|(\S*\d\S*)|([,;.?!:])',
        ' ',
        tweet,
    )
    # Remove stop words. Each word is escaped so it is matched literally, and
    # the word boundaries ensure only whole words are deleted.
    if stopwords is not None:
        alternation = '|'.join(re.escape(word) for word in stopwords if word)
        if alternation:
            tweet = re.sub(
                r'\b(?:' + alternation + r')\b',
                '',
                tweet,
                flags=re.IGNORECASE,
            )
    # Remove words with 3 or fewer characters
    tweet = re.sub(r'\b\w{1,3}\b', '', tweet)
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # Done before whitespace normalization so no double spaces are left over.
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    # Collapse runs of whitespace (including new lines) and trim both ends
    return ' '.join(tweet.split())
# Run processTweet on every value of df['text'] and store the cleaned result
# back into the same column (df is defined elsewhere; presumably a pandas
# DataFrame -- confirm against the caller).
df['text'] = df['text'].apply(processTweet)
只需使用你一直在使用的 re.sub() :
# Build a single alternation of all stop words. re.escape keeps any regex
# metacharacters in a word literal, and the \b anchors restrict the match to
# whole words so a stop word is never cut out of the middle of a longer word.
exclusions = r'\b(?:' + '|'.join(re.escape(word) for word in stop) + r')\b'
tweet = re.sub(exclusions, '', tweet)
考虑以下示例
import re
# Words that should be stripped out of the text.
stops = ["and", "or", "not"]
# The "and" inside "Band" and the "not" inside "nothing" must survive.
text = "Band and nothing else!"
# Escape every word, join them as alternatives, and anchor the whole
# non-capturing group with word boundaries on both sides.
alternation = "|".join(map(re.escape, stops))
pattern = r"\b(?:" + alternation + r")\b"
clean = re.compile(pattern).sub("", text)
print(clean)
输出
Band  nothing else!
(注意:删除 "and" 之后,Band 与 nothing 之间会留下两个连续的空格。)
说明:re.escape 会处理在正则表达式模式中具有特殊含义的字符(例如 . ),并把它们转换成字面量形式(因此 re.escape(".") 匹配的是字面上的点号,而不是任意字符)。 | 表示"或"(多选分支),整个模式就是用 | 把所有单词连接起来构成的。 (?: ... ) 是非捕获组,有了它,只需在分组的开头和结尾各写一个 \b ,而不必为每个单词单独添加。 \b 是单词边界,在这里用于确保只删除完整的单词,避免出现例如 Band 变成 B 的情况。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.