在Keras模型中使用预训练的词向量
注:训练词向量的时候很可能需要设置VOCAB_MIN_COUNT=1,否则数据集中可能出现不在词典中的词。
pad_sequences的操作数据为list of list。而CountVectorizer的操作数据为list of str。
0. 数据预处理
0.1 添加新数据
file_handle = open('new_data.txt', 'r') new_data = [] api_list = pd.read_pickle(r'api_list.pkl') api_list = [i.lower() for i in api_list] old_data_set = set(api_list) for line in file_handle.readlines(): new_line = line.strip().split(' ') new_data_set = set(new_line) if len(new_data_set - old_data_set) == 0: new_data.append(new_line) 0.2 转换原数据
import zipfile import pandas as pd import numpy as np from functools import partial #提取压缩文件中的csv的内容 def get_data(path): with zipfile.ZipFile(path, 'r') as z: #传入训练集或测试集数据路径 if len(z.filelist) == 1: filename = z.filelist[0].filename if filename.endswith('.csv'): f = z.open(filename) data = pd.read_csv(f) return data def keep_elements(api_list, seq): api_set = set(api_list) seq_set = set(seq) if len(seq_set - api_set) != 0: return None list_value = [] for index, element in enumerate(seq): list_value.append(element.lower()) return list_value #由于同一个文件包括了多个API序列,将所有API序列合并到一起 def get_sequence(df, period_idx): seq_list = [] #list of list #第一个到倒数第二个文件的API for _id,begin in enumerate(period_idx[:-1]): seq_list.append(df.iloc[begin: period_idx[_id+1]]['api'].values) #最后一个文件的API seq_list.append(df.iloc[period_idx[-1]:]['api'].values) return seq_list unique_api = pd.read_pickle('api_list.pkl') train = get_data('security_train.zip') test = get_data('security_test.zip') keep_all_elements= partial(keep_elements, unique_api) #api2index = {item:(i+1) for i,item in enumerate(unique_api)} #留0进行padding #index2api = {(i+1):item for i,item in enumerate(unique_api)} #train['api_idx'] = train['api'].map(api2index) train_period_idx = train.file_id.drop_duplicates(keep='first').index.values #train_peroid_idx表示的是每个file最开始的index train_df = train[['file_id','label']].drop_duplicates(keep='first') train_df['seq'] = get_sequence(train, train_period_idx) train_df['seq'] = train_df['seq'].apply(keep_all_elements) train_seq = train_df[train_df['seq'].notnull()]['seq'].values train_seq = train_seq.tolist() #test['api_idx'] = test['api'].map(api2index) test_period_idx = test.file_id.drop_duplicates(keep='first').index.values #test_peroid_idx表示的是每个file最开始的index test_df = test[['file_id']].drop_duplicates(keep='first') test_df['seq'] = get_sequence(test, test_period_idx) test_df['seq'] = test_df['seq'].apply(keep_all_elements) test_seq = test_df[test_df['seq'].notnull()]['seq'].values test_seq = 
test_seq.tolist() 1. 训练词向量
1.1 Word2Vec词向量
训练词向量的输入有两种方式,一种是list of list,如下所示。也可以使用文件输入,如
model = gensim.models.Word2Vec(corpus_file=r'/home/learn/code/glove/glove.txt', size=100, window=5, min_count=1, workers=8, sg=0, iter=5) import gensim import pickle vector_size = 100 #词向量维度 sentences = train_seq + test_seq + new_data model = gensim.models.Word2Vec(sentences=sentences, size=vector_size, window=5, min_count=1, workers=8, sg=0, iter=5) #window=5效果更好 wv = model.wv vocab_list = wv.index2word word_idx_dict = {} for idx, word in enumerate(vocab_list): word_idx_dict[word] = idx + 1 vectors_arr = wv.vectors vectors_arr = np.concatenate((np.zeros(vector_size)[np.newaxis, :], vectors_arr), axis=0)#此处0位置的向量指代的是padding f_vectors = open('./word_seg_vectors_arr.pkl', 'wb') pickle.dump(vectors_arr, f_vectors) f_vectors.close() import json with open(r'word2idx_vec.json', 'w') as f: json.dump(word_idx_dict, f) 1.2 训练glove词向量
- git clone https://github.com/stanfordnlp/GloVe
- 生成文本
def keep_all_elements(seq): list_value = [] for index, element in enumerate(seq): list_value.append(element) return list_value train_df['seq'] = train_df['seq'].apply(keep_all_elements) train_df['seq'] = train_df['seq'].apply(lambda x:' '.join(x)) train_array = train_df['seq'].tolist() with open('train_seq.txt', 'w') as f: for item in train_array: f.write("%s\n" % item) test_df['seq'] = test_df['seq'].apply(keep_all_elements) test_df['seq'] = test_df['seq'].apply(lambda x:' '.join(x)) test_array = test_df['seq'].tolist() with open('test_seq.txt', 'w') as f: for item in test_array: f.write("%s\n" % item) - make进行编译。
- 修改参数(如VECTOR_SIZE设置成100),然后sh demo.sh即可产生glove词向量。(其中VOCAB_MIN_COUNT大概率需要设置成1)。
1.2.2 训练bigram word2vec
def get_bigram(text): pieces = text.split() return ' '.join('_'.join(pieces[i:i+2]) for i in range(0, len(pieces), 2)) for index, file in enumerate(files): files[index] = get_bigram(file) with open('train_bigram.txt', 'w') as f: for item in files: f.write("%s\n" % item) for index, file in enumerate(outfiles): outfiles[index] = get_bigram(file) with open('test_bigram.txt', 'w') as f: for item in outfiles: f.write("%s\n" % item) cat train_bigram.txt test_bigram.txt > glove.txt 1.3 合并word2vec和glove词向量
此步骤是根据需要进行。
import pandas as pd
import numpy as np

# word2vec vectors: skip the gensim header row; the index column holds the
# API name, the remaining columns the vector dimensions.
# NOTE(review): `error_bad_lines` is deprecated (removed in pandas 2.0 in
# favour of `on_bad_lines`) -- confirm the installed pandas version.
w2v = pd.read_csv(r'E:\work\competition\security\word_embedding\word2vec.vec',
                  sep=' ', header=None, index_col=0,
                  error_bad_lines=False, skiprows=[0])
w2v = w2v.reset_index()
w2v.columns = w2v.columns - 1
w2v = w2v.rename(columns={-1: 'API'})

glove = pd.read_csv(r'E:\work\competition\security\word_embedding\glove.txt',
                    sep=' ', header=None, index_col=0, error_bad_lines=False)
glove = glove.reset_index()
glove = glove.iloc[:-1, :]  # last row dropped (presumably <unk>) -- TODO confirm
# Offset the glove column labels so they do not collide with the word2vec ones.
glove.columns = glove.columns + 100
glove = glove.rename(columns={100: 'API'})

# Concatenate both vector sets side by side, keyed on the API name.
total_embeddings = pd.merge(w2v, glove, on='API')
total_embeddings_array = total_embeddings.iloc[:, 1:].values  # drop the API-name column
# Prepend an all-zero row for the padding index 0.
total_embeddings_array = np.concatenate(
    (np.zeros(150)[np.newaxis, :], total_embeddings_array), axis=0)
pd.to_pickle(total_embeddings_array,
             r'E:\work\competition\security\word_embedding\word2vec\windows5\word_seg_vectors_arr_add_glove.pkl')
from keras.preprocessing.sequence import pad_sequences import json def get_dict_data(file_path): with open(file_path, 'r') as f: dict_data = json.load(f) return dict_data def keep_all_elements_word2vec(seq): ret_value = [] dict_data = get_dict_data(r'./new_data/word2idx_vec.json') #此路径需要修改 ret_value = [dict_data[i] for i in seq] return ret_value train_df['seq'] = train_df['seq'].apply(keep_all_elements_word2vec) train_seq = pad_sequences(train_df['seq'], maxlen=50000, padding='post', truncating='post') test_df['seq'] = test_df['seq'].apply(keep_all_elements_word2vec) test_seq = pad_sequences(test_df['seq'], maxlen=50000, padding='post', truncating='post') pd.to_pickle(train_seq, "train_word2vec_w10_seq.pkl") pd.to_pickle(test_seq, "test_word2vec_w10_seq.pkl") 2. 使用预训练的词向量
embedding_matrix_path = 'word_seg_vectors_arr.pkl'
# `with` closes the pickle file; the original `pickle.load(open(...))`
# leaked the handle.
with open(os.path.join(data_folder_path, embedding_matrix_path), 'rb') as f:
    embedding_matrix = pickle.load(f)
# Frozen pre-trained embeddings; row 0 of the matrix is the all-zero padding
# vector, which pairs with mask_zero.
_embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero,
                   weights=[embedding_matrix], trainable=False)(_input)