cpuwdl 发布的文章

使用keras.preprocessing进行文本序列化

使用keras.preprocessing进行文本序列化。

from keras.preprocessing import text, sequence
max_features = 200000
maxlen = 500
list_sentences_train = train["text"].fillna("[na]").values
list_sentences_test = test["text"].fillna("[na]").values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
train_sequence = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
test_sequence = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


多个句子词频统计

使用collections.Counter计算词频,顺便去除只出现1次的单词。

from collections import Counter
cc = Counter()
def get_count(text):
    try:
        text_split = text.strip().split(' ')
        count = Counter(text_split)
        cc.update(count)
        return text_split
    except: 
        return text
def remove_one(text):
    try:
        text = u" ".join([x for x in [y for y in text.strip().split(u" ")] if cc[x] > 1])
        return text
    except: 
        return text
df['text_split'] = df['text'].apply(lambda x: get_count(x))
df['text'] = df['text'].apply(lambda x: remove_one(x))


文本预处理

过滤符号,去掉上标,转换为小写,非英文字符用空格隔开,连续重复字母数大于等于3的只保留1个,去掉指定单词中的空格。

import regex as re
import unicodedata
def process(text):
    try:
        text = re.sub(ur"\p{P}+|\p{Z}+|\p{S}+|\p{N}+", u' ', text)
        text = unicodedata.normalize('NFKD',text)#.encode('ascii','ignore')
        text = re.sub(ur"\p{M}+", u'', text)
        text = re.sub(ur"\p{P}+|\p{S}+|\p{N}+|\p{Cs}+|\p{Cf}+|\p{Co}+", u'', text)
        text = re.sub("([A-Za-z]+)", lambda m:m.group(1).lower(),text)
        text = re.sub(ur'([^\x00-\x7f])', lambda m:u' '+m.group(1)+u' ', text)
        text = re.sub(ur"(\w)\1{2,}",lambda m:m.group(1), text)
        text = re.sub("(\s+)", u' ',text)
        for fword in fword_list:
            f_re = ''
            for i in xrange(len(fword)):
                w = fword[i]
                f_re += w + "+\s*" if i < (len(fword)-1) else w + "+" 
            text = re.sub(f_re, u' '+fword+u' ',text)
        text = re.sub("(\s+)", u' ',text)
        return text
    except: 
        return text
df['text'] = df['text'].apply(lambda x: process(x))


回归LightGBM Cross Validation

使用LightGBM,Cross Validation进行回归,RMSE loss。

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import random
import os
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import gc
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified = False, debug= False):
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=17)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=17)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = train.columns.tolist()
    test_df = test_df[feats]
    #test_df = csr_matrix(test_df)
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y)):
        print('FOLD {}'.format(n_fold))
        train_x, train_y = train_df.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], y.iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        lgb_params =  {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            #"n_estimators":10000,
            "learning_rate": 0.01,
            
            'num_leaves': 60,
            'subsample': 0.6143,
            'colsample_bytree': 0.6453,
            'min_split_gain': np.power(10, -2.5988),
            'reg_alpha': np.power(10, -2.2887),
            'reg_lambda': np.power(10, 1.7570),
            'min_child_weight': np.power(10, -0.1477),
            'max_depth': -1,
            #'zero_as_missing':True
        }
        '''
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 16,
            'max_depth': -1,
            'min_child_samples': 1,
            'max_bin': 300,
            'subsample': 1.0,
            'subsample_freq': 1,
            'colsample_bytree': 0.5,
            'min_child_weight': 10,
            'reg_lambda': 0.1,
            'reg_alpha': 0.0,
            'scale_pos_weight': 1,
            'zero_as_missing': True,
            'num_threads': -1,
        }
        '''
        #train_x = csr_matrix(train_x)
        #valid_x = csr_matrix(valid_x)
        lgtrain = lgb.Dataset(train_x, train_y,
                        feature_name=feats,
                        categorical_feature = 'auto')
        lgvalid = lgb.Dataset(valid_x, valid_y,
                        feature_name=feats,
                        categorical_feature = 'auto')
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train','valid'],
            early_stopping_rounds=200,
            verbose_eval=100
        )
        
       # clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            #eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        #oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        #sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df, num_iteration=clf.best_iteration)/ folds.n_splits
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        #fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        #print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        print('Fold %2d RMSE : %.6f' % (n_fold, np.sqrt(metrics.mean_squared_error(valid_y, oof_preds[valid_idx]))))      
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    rmse = np.sqrt(metrics.mean_squared_error(y, oof_preds))
    #print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))
    print('Full RMSE score %.6f' % rmse)
    
    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')

多分类LightGBM Cross Validation

使用LightGBM,Cross Validation进行多分类,multi log loss。

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import random
import os
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import gc
def multi_log_loss(y_true, y_pred, num_classes):  # score function for CV
    esp = 1e-12
    y_pred += esp
    y_true = y_true.astype('int')
    # Handle all zeroes
    all_zeros = np.all(y_pred == 0, axis=1)
    y_pred[all_zeros] = 1/num_classes
    # Normalise sum of row probabilities to one
    row_sums = np.sum(y_pred, axis=1)
    y_pred /= row_sums.reshape((-1, 1))
    # Calculate score
    n_rows = y_true.size
    score_sum = 0
    for i in range(y_true.size):
        score_sum -= np.log(y_pred[i, y_true[i]])
    score = score_sum / n_rows
    return score
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified = False, debug= False):
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=SEED)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=SEED)
    num_classes = 6
    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0],num_classes))
    sub_preds = np.zeros((test_df.shape[0],num_classes))
    feature_importance_df = pd.DataFrame()
    feats = train_df.columns.tolist()
    cat_feats = 'auto'
    #feats = select
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], y)):
        train_x, train_y = train_df[feats].iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], y.iloc[valid_idx]
        
        lgtrain = lgb.Dataset(train_x, train_y,
                        feature_name=feats,
                        categorical_feature = cat_feats)
        lgvalid = lgb.Dataset(valid_x, valid_y,
                        feature_name=feats,
                        categorical_feature = cat_feats)

        print('get lgb train valid dataset end')
        lgb_params =  {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class':num_classes,
            'metric': 'multi_logloss',
            #"n_estimators":10000,
            
            "learning_rate": 0.02,
            #"num_leaves": 200,
            #"feature_fraction": 0.50,
            #"bagging_fraction": 0.50,
            #'bagging_freq': 4,
            #"max_depth": -1,
            
            'num_leaves': 32,
            'max_depth': 8,
            'bagging_fraction': 0.7,
            'bagging_freq': 5,
            'feature_fraction': 0.7,
            
            "reg_alpha": 0.3,
            "reg_lambda": 0.1,
            'min_child_samples': 100,
            #'max_bin': 100,
            "min_split_gain":0.2,
            'nthread': 4,
            "min_child_weight":10,
        }
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train','valid'],
            early_stopping_rounds=100,
            verbose_eval=100
        )

        #clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
        #    eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        #oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        #sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df, num_iteration=clf.best_iteration)/ folds.n_splits
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        #fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d Multi Log Loss : %.6f' % (n_fold + 1, multi_log_loss(valid_y.values, oof_preds[valid_idx], num_classes)))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full Multi Log Loss %.6f' % multi_log_loss(y.values, oof_preds, num_classes))
    display_importances(feature_importance_df)
    return feature_importance_df,sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')