文本预处理
过滤标点、符号和数字，去掉变音符号（上标），将英文字母转换为小写，非英文（非 ASCII）字符两侧用空格隔开，同一字符连续重复 3 次及以上时只保留 1 个，并去掉指定单词中的空格。
import regex as re
import unicodedata
def _spaced_word_pattern(word):
    """Build a regex matching *word* with stretched or spaced-out letters.

    Every character of *word* may occur one or more times, and consecutive
    characters may be separated by optional whitespace. Characters are
    escaped so that regex metacharacters in *word* are matched literally.
    """
    return r"\s*".join(re.escape(ch) + "+" for ch in word)


def process(text):
    """Normalise a piece of text for downstream processing.

    Steps, in order:
      1. Replace runs of punctuation, separators, symbols and digits with a
         single space.
      2. Apply NFKD normalisation and drop the combining marks it exposes.
      3. Delete any remaining punctuation, symbols, digits, surrogates,
         format characters and private-use characters.
      4. Lower-case ASCII letters.
      5. Surround every non-ASCII character with spaces.
      6. Collapse a word character repeated three or more times to one.
      7. Collapse whitespace runs to a single space.
      8. For every entry of ``fword_list``, rewrite stretched or spaced-out
         spellings of that word as the word itself surrounded by spaces.

    ``re`` is expected to be the ``regex`` package (the ``\\p{..}`` property
    classes are not available in the standard ``re`` module), and
    ``fword_list`` is read from module scope; it is not defined in this
    function.

    Args:
        text: The text to normalise.

    Returns:
        The normalised text. Processing is best-effort: if any step raises,
        the value of ``text`` at that point is returned instead (for example
        a non-string input fails at the first substitution and is therefore
        returned unchanged).
    """
    try:
        # Patterns that must be unicode are written as u"..." with doubled
        # backslashes so the module parses on both Python 2 and Python 3.
        text = re.sub(u"\\p{P}+|\\p{Z}+|\\p{S}+|\\p{N}+", u" ", text)
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(u"\\p{M}+", u"", text)
        text = re.sub(
            u"\\p{P}+|\\p{S}+|\\p{N}+|\\p{Cs}+|\\p{Cf}+|\\p{Co}+", u"", text
        )
        # Only ASCII letters are lower-cased; other scripts are left as-is.
        text = re.sub(r"([A-Za-z]+)", lambda m: m.group(1).lower(), text)
        text = re.sub(
            u"([^\\x00-\\x7f])", lambda m: u" " + m.group(1) + u" ", text
        )
        text = re.sub(u"(\\w)\\1{2,}", lambda m: m.group(1), text)
        text = re.sub(r"(\s+)", u" ", text)

        # NOTE(review): the per-word pattern is unanchored, so it also
        # matches inside longer words; confirm this is intended.
        for fword in fword_list:
            if not fword:
                # An empty pattern would match between every character.
                continue
            replacement = u" " + fword + u" "
            # A callable keeps the replacement literal (no template escapes).
            text = re.sub(
                _spaced_word_pattern(fword),
                lambda _m, _r=replacement: _r,
                text,
            )

        # The insertions above can leave doubled spaces; collapse them once.
        text = re.sub(r"(\s+)", u" ", text)
        return text
    except Exception:
        # Deliberate best-effort: never let one bad row abort the caller.
        return text
# Replace the 'text' column with its normalised form. `process` is passed
# directly: wrapping it in a lambda adds a call layer and nothing else.
df['text'] = df['text'].apply(process)