Parameters:

    --train_file /home/li***/workspace/nplm/example/work/train.ngrams
    --validation_file /home/li***/workspace/nplm/example/work/validation.ngrams
    --num_epochs 10
    --words_file /home/li***/workspace/nplm/example/work/words
    --model_prefix /home/li***/workspace/nplm/example/work/inferno.nnlm
    --learning_rate 1
    --minibatch_size 8
1) Read and parse the command-line parameters
2) Read the training data
    // Read training data
    vector<int> training_data_flat;
    vec * training_data_flat_mmap;
    data_size_t training_data_size; //num_tokens;
    ip::managed_mapped_file mmap_file;
    if (use_mmap_file == false) {
        cerr << "Reading data from regular text file " << endl;
        readDataFile(myParam.train_file, myParam.ngram_size,
                     training_data_flat, myParam.minibatch_size);
        training_data_size = training_data_flat.size() / myParam.ngram_size;
    }

readDataFile() reads all of the data in train.ngrams into the vector training_data_flat, so training_data_flat[0..2] holds the first n-gram (here a 3-gram), training_data_flat[3..5] the second, and so on.
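To make the flat layout concrete, here is a minimal sketch of indexing the k-th n-gram inside the flat vector (the values are made up for illustration, not taken from train.ngrams):

    #include <iostream>
    #include <vector>

    int main() {
        const int ngram_size = 3;
        // Two 3-grams stored back to back, the way readDataFile lays them out.
        std::vector<int> training_data_flat = {4, 17, 9,   17, 9, 2};

        // The k-th n-gram occupies indices [k*ngram_size, (k+1)*ngram_size).
        for (std::size_t k = 0; k < training_data_flat.size() / ngram_size; ++k) {
            std::cout << "ngram " << k << ":";
            for (int j = 0; j < ngram_size; ++j)
                std::cout << " " << training_data_flat[k * ngram_size + j];
            std::cout << "\n";
        }
        return 0;
    }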
    Matrix<int, Dynamic, Dynamic> training_data;
    //(training_data_flat.data(), myParam.ngram_size, training_data_size);
    #ifdef MAP
    cerr << "Setting up eigen map" << endl;
    if (use_mmap_file == false) {
        training_data = Map< Matrix<int,Dynamic,Dynamic> >(
            training_data_flat.data(), myParam.ngram_size, training_data_size);
    } else {
        training_data = Map< Matrix<int,Dynamic,Dynamic> >(
            training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size);
    }
    cerr << "Created eigen map" << endl;
    #else
    if (use_mmap_file == false) {
        training_data = Map<Matrix<int, Dynamic, Dynamic> >(
            training_data_flat.data(), myParam.ngram_size, training_data_size);
    }
    #endif

Since MAP is not defined, the #else branch runs: training_data = Map<Matrix<int, Dynamic, Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size); produces an int matrix with myParam.ngram_size rows (3 here) and training_data_size columns (the number of n-gram instances). Assigning the Map copies the flat vector into the matrix, and because Eigen is column-major by default, each column holds exactly one n-gram.
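A small self-contained sketch of the same Map pattern (toy data, not nplm code), showing that column j of the mapped matrix is the j-th n-gram:

    #include <iostream>
    #include <vector>
    #include <Eigen/Dense>

    using Eigen::Matrix;
    using Eigen::Map;
    using Eigen::Dynamic;

    int main() {
        const int ngram_size = 3, data_size = 2;
        std::vector<int> flat = {4, 17, 9,   17, 9, 2};

        // Same pattern as the training code: assigning the Map copies the
        // flat data into an ngram_size x data_size column-major matrix.
        Matrix<int, Dynamic, Dynamic> training_data =
            Map<Matrix<int, Dynamic, Dynamic> >(flat.data(), ngram_size, data_size);

        std::cout << "second n-gram (column 1):\n"
                  << training_data.col(1) << "\n";  // prints 17 9 2
        return 0;
    }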
    if (use_mmap_file == false && randomize == true) {
        cerr << "Randomly shuffling data..." << endl;
        // Randomly shuffle training data to improve learning
        for (data_size_t i = training_data_size - 1; i > 0; i--) {
            data_size_t j = uniform_int_distribution<data_size_t>(0, i - 1)(rng);
            training_data.col(i).swap(training_data.col(j));
        }
    }

This swaps columns of training_data at random, i.e., it reshuffles the training examples in place.
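The loop is a Fisher-Yates-style pass over the columns (note it draws j from [0, i-1] rather than [0, i], so column i never remains in place). A standalone sketch of the same column-swap shuffle on a toy matrix with a fixed seed:

    #include <iostream>
    #include <random>
    #include <Eigen/Dense>

    using Eigen::MatrixXi;

    int main() {
        std::mt19937 rng(1234);
        MatrixXi training_data(3, 5);
        // Fill with recognizable values: column j is all j's.
        for (int j = 0; j < 5; ++j) training_data.col(j).setConstant(j);

        // Same swap pattern as the training code: walk from the last
        // column down, swapping column i with a random earlier column j.
        for (int i = training_data.cols() - 1; i > 0; --i) {
            int j = std::uniform_int_distribution<int>(0, i - 1)(rng);
            training_data.col(i).swap(training_data.col(j));
        }
        std::cout << training_data << "\n";  // columns now in shuffled order
        return 0;
    }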
3) Read the validation data
    // Read validation data
    vector<int> validation_data_flat;
    int validation_data_size = 0;
    if (myParam.validation_file != "") {
        readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat);
        validation_data_size = validation_data_flat.size() / myParam.ngram_size;
        cerr << "Number of validation instances: " << validation_data_size << endl;
    }
    Map<Matrix<int, Dynamic, Dynamic> > validation_data(
        validation_data_flat.data(), myParam.ngram_size, validation_data_size);

This is handled like the training data and yields the validation matrix validation_data, one n-gram per column; unlike training_data, validation_data is an Eigen Map, i.e., a view over validation_data_flat rather than a copy.
4) Read the input and output word files
    vector<string> input_words;
    if (myParam.input_words_file != "") {
        readWordsFile(myParam.input_words_file, input_words);
        if (myParam.input_vocab_size == 0)
            myParam.input_vocab_size = input_words.size();
    }
    vector<string> output_words;
    if (myParam.output_words_file != "") {
        readWordsFile(myParam.output_words_file, output_words);
        if (myParam.output_vocab_size == 0)
            myParam.output_vocab_size = output_words.size();
    }

For each n-gram, the first n-1 indices can be mapped back to their original words via the input words file, while the n-th (predicted) index is mapped back via the output words file.
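A minimal sketch of decoding one n-gram back to words (the toy vocabularies here are hypothetical stand-ins, not the contents of the actual words files):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        const int ngram_size = 3;
        // Stand-ins for what readWordsFile loads: line i of the words
        // file becomes entry i, so a word's index is its line number.
        std::vector<std::string> input_words  = {"<s>", "the", "cat", "sat"};
        std::vector<std::string> output_words = {"</s>", "the", "cat", "sat"};

        int ngram[ngram_size] = {1, 2, 3};  // e.g. one column of training_data

        // First n-1 positions are context: decode with the input vocabulary.
        for (int j = 0; j < ngram_size - 1; ++j)
            std::cout << input_words[ngram[j]] << " ";
        // Last position is the predicted word: decode with the output vocabulary.
        std::cout << "-> " << output_words[ngram[ngram_size - 1]] << "\n";
        return 0;
    }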
5) Count output-word unigram frequencies
    vector<data_size_t> unigram_counts(myParam.output_vocab_size);
    for (data_size_t train_id = 0; train_id < training_data_size; train_id++) {
        int output_word;
        if (use_mmap_file == false) {
            output_word = training_data(myParam.ngram_size - 1, train_id);
        } else {
            output_word = training_data_flat_mmap->at(
                (train_id + 1) * myParam.ngram_size - 1);
        }
        unigram_counts[output_word] += 1;
    }
    multinomial<data_size_t> unigram(unigram_counts);

unigram_counts tallies how often each output word occurs: unigram_counts[i] is the number of times the word with index i appears as the last element of an n-gram. unigram is a multinomial distribution whose per-word probabilities are initialized from unigram_counts.
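nplm's multinomial class is not shown in this excerpt; as a sketch of the same idea, std::discrete_distribution from the standard library likewise normalizes raw counts into a sampling distribution (toy counts, hypothetical seed):

    #include <iostream>
    #include <map>
    #include <random>
    #include <vector>

    int main() {
        std::mt19937 rng(1234);
        // Toy stand-in for unigram_counts: word 2 is by far the most frequent.
        std::vector<int> unigram_counts = {1, 3, 10, 2};

        // Normalizes counts into probabilities, analogous to
        // nplm's multinomial<data_size_t> above.
        std::discrete_distribution<int> unigram(unigram_counts.begin(),
                                                unigram_counts.end());

        std::map<int, int> draws;
        for (int s = 0; s < 16000; ++s) draws[unigram(rng)]++;
        for (const auto& kv : draws)
            std::cout << "word " << kv.first << ": " << kv.second << " draws\n";
        return 0;
    }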
6) Model training

    model nn;
    // IF THE MODEL FILE HAS BEEN DEFINED, THEN
    // LOAD THE NEURAL NETWORK MODEL
    if (myParam.model_file != "") {
        nn.read(myParam.model_file);
        cerr << "reading the model" << endl;
    } else {
        nn.resize(myParam.ngram_size,
                  myParam.input_vocab_size,
                  myParam.output_vocab_size,
                  myParam.input_embedding_dimension,
                  myParam.num_hidden,
                  myParam.output_embedding_dimension);
        nn.initialize(rng,
                      myParam.init_normal,
                      myParam.init_range,
                      -log(myParam.output_vocab_size),
                      myParam.parameter_update,
                      myParam.adagrad_epsilon);
        nn.set_activation_function(
            string_to_activation_function(myParam.activation_function));
    }
    loss_function_type loss_function = string_to_loss_function(myParam.loss_function);

If a model file is given, the network is loaded from it; otherwise a new network is sized and initialized. The model class (model.h) defines:
    Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> output_embedding_matrix,
                                                   input_embedding_matrix,
                                                   input_and_output_embedding_matrix;
    Input_word_embeddings input_layer;
    Output_word_embeddings output_layer;

where input_embedding_matrix and output_embedding_matrix are dynamically sized, row-major double matrices.
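As an illustration only: assuming each row of input_embedding_matrix holds one word's embedding vector (an assumption about model.h's layout, not shown in this excerpt), looking up a word's embedding is just a row read, which the row-major storage keeps contiguous in memory:

    #include <iostream>
    #include <Eigen/Dense>

    using Eigen::Matrix;
    using Eigen::Dynamic;
    using Eigen::RowMajor;

    int main() {
        const int vocab_size = 5, embedding_dim = 4;  // toy sizes

        // Same storage declaration as model.h; "row i = embedding of
        // word i" is an assumption made for this illustration.
        Matrix<double, Dynamic, Dynamic, RowMajor> input_embedding_matrix(
            vocab_size, embedding_dim);
        input_embedding_matrix.setRandom();

        int word_id = 2;
        std::cout << "embedding of word " << word_id << ":\n"
                  << input_embedding_matrix.row(word_id) << "\n";
        return 0;
    }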