Proofreading quantifier-noun collocations with maximum entropy (maxent)
I. This article describes an experiment in proofreading quantifier-noun collocations with a maximum entropy model, drawing on Professor Zhang Yangsen's paper 《最大熵方法中特征选择算法的改进与纠错排歧》 (on improving the feature selection algorithm in the maximum entropy method for error correction and disambiguation).
1. Steps of quantifier-noun error correction:
(1) Mine a quantifier-noun collocation library.
(2) Using the collocation library, locate possible quantifier-noun collocation errors in a sentence (a minimal detection sketch follows this list).
(3) Feed the sentences containing collocation errors into the maxent correction model and output the quantifier with the highest predicted probability as the correction (then prune the candidates against the collocation library).
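A minimal sketch of the detection step (2), assuming the collocation library has already been loaded into a dict mapping each quantifier to the set of nouns it is known to modify; find_suspects and collocations are illustrative names, not part of the program in section II.

def find_suspects(tagged_words, collocations):
    '''Return (index, quantifier, noun) triples whose pairing is unseen.'''
    suspects = []
    for i, (word, pos) in enumerate(tagged_words):
        # a quantifier directly followed by a noun forms a candidate pair
        if pos == 'q' and i + 1 < len(tagged_words):
            next_word, next_pos = tagged_words[i + 1]
            if next_pos == 'n' and next_word not in collocations.get(word, set()):
                suspects.append((i, word, next_word))
    return suspects

# e.g. find_suspects([('一', 'm'), ('匹', 'q'), ('牛', 'n')], {'匹': {'马'}})
# returns [(1, '匹', '牛')], since "匹" is not known to modify "牛"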
2. This article mainly explains how to train a quantifier-noun correction model with maximum entropy. The steps are:
(1) Segment the corpus and tag it with parts of speech.
(2) Use the two words on each side of the quantifier, together with their POS tags, as the feature template.
(3) For feature mining, standard text-mining methods such as information gain, chi-square statistics, and word frequency can be used (see the frequency-based sketch after this list).
(4) Train the model.
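As a concrete example of step (3), here is a minimal sketch of frequency-based feature selection over the (features_dict, label) samples produced by generate_features_events in the code below; select_by_frequency and the min_count threshold are illustrative assumptions, not part of the original program.

from collections import Counter

def select_by_frequency(features_events, min_count=5):
    '''Keep only (feature name, value) pairs seen at least min_count times.'''
    counts = Counter()
    for features_dict, _label in features_events:
        # count each (feature name, value) pair across the whole data set
        counts.update(features_dict.items())
    frequent = {fv for fv, c in counts.items() if c >= min_count}
    pruned = []
    for features_dict, label in features_events:
        kept = {k: v for k, v in features_dict.items() if (k, v) in frequent}
        pruned.append((kept, label))
    return pruned

Information gain or chi-square selection would follow the same shape, replacing the raw count with the corresponding score for each (feature, label) pair.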
II. Code
import nltk
import os
import pickle

'''
How should the maxent feature template be designed?
1. Use the context words around the target quantifier and their POS tags as constraints.
2. Select feature words by word frequency, mutual information, etc.
3. Could more informative constraints be added?
4. Form the feature template from the above and apply it in train and predict.
'''


def generate_features_events(word_li):
    '''
    generate features events
    :param word_li: list of (word, pos) tuples for one sentence
    :return: list of (features_dict, label) samples
    '''
    features_events = []
    # pad two dummy (word, pos) tuples on each side so the +/-2 context
    # window is always defined
    word_li = [('pre2', 'pre2_pos')] + [('pre1', 'pre1_pos')] + word_li \
              + [('post1', 'post1_pos')] + [('post2', 'post2_pos')]
    for i in range(2, len(word_li) - 2):
        word = word_li[i][0]
        pos = word_li[i][1]
        if pos in ['q', 'm']:
            # a numeral token such as "三个" may end with a quantifier;
            # greedily strip off the longest known-quantifier suffix
            if pos == 'm' and len(word) >= 2:
                m_len = len(word)
                m_str = ''
                for m_index in range(-m_len, 0)[::-1]:
                    if (word[m_index] + m_str) in quantifier_list:
                        m_str = word[m_index] + m_str
                if len(m_str) != 0:
                    word = m_str
            if word in quantifier_list:
                # feature dictionary
                features_dict = dict()
                # 1. first word on the left, pos
                features_dict['pre1_word'] = word_li[i - 1][0]
                features_dict['pre1_pos'] = word_li[i - 1][1]
                # 2. second word on the left, pos
                features_dict['pre2_word'] = word_li[i - 2][0]
                features_dict['pre2_pos'] = word_li[i - 2][1]
                # 3. first word on the right, pos
                features_dict['post1_word'] = word_li[i + 1][0]
                features_dict['post1_pos'] = word_li[i + 1][1]
                # 4. second word on the right, pos
                features_dict['post2_word'] = word_li[i + 2][0]
                features_dict['post2_pos'] = word_li[i + 2][1]
                # label: the quantifier itself
                label = word
                # feature sample
                features_events.append((features_dict, label))
    return features_events


def load_data(file_name):
    '''
    load data and generate train set
    :param file_name: path of a segmented, POS-tagged corpus; each line
                      holds space-separated word/pos tokens
    :return: list of (features_dict, label) samples
    '''
    print('load data and generate features events...')
    data_set = []
    with open(file_name, 'r', encoding='utf-8') as file_reader:
        for line in file_reader:
            line = line.strip()
            if line:
                word_li = line.split()
                word_li = [tuple(w.split('/')) for w in word_li if len(w.split('/')) == 2]
                # generate features events
                data_set.extend(generate_features_events(word_li))
    print("generate features events %d" % len(data_set))
    return data_set


def predict(classifier, features):
    '''
    predict
    :param classifier: trained MaxentClassifier
    :param features: features_dict for one quantifier position
    :return: probability distribution over quantifier labels
    '''
    predict_res = classifier.prob_classify(features)
    # rank candidate quantifiers by predicted probability, highest first
    ranked = sorted(predict_res.samples(), key=predict_res.prob, reverse=True)
    print('predict: {}'.format([(label, predict_res.prob(label)) for label in ranked]))
    return predict_res


def train(data_set, model_save_path):
    '''
    train model
    :param data_set:
    :param model_save_path:
    :return:
    '''
    print('maxent training...')
    train_size = int(len(data_set) * 0.95)  # 95% for train
    test_size = int(len(data_set) * 0.05)   # 5% for test
    train_set = data_set[:train_size]
    test_set = data_set[-test_size:]
    print("train set size = ", len(train_set))
    print("test set size = ", len(test_set))
    # train with Improved Iterative Scaling
    classifier_iis = nltk.classify.maxent.MaxentClassifier.train(
        train_set, trace=3, algorithm='iis', max_iter=100)
    print("accuracy = ", nltk.classify.accuracy(classifier_iis, test_set))
    # model save
    pickle_save(classifier_iis, model_save_path)


def pickle_save(data, filename):
    with open(filename, 'wb') as fw:
        pickle.dump(data, fw)


def pickle_load(filename):
    with open(filename, 'rb') as fr:
        model = pickle.load(fr)
    return model


def read_quantifier(quantifier_path):
    '''
    read quantifier from file
    :param quantifier_path: path of the collocation library; each line
                            starts with a quantifier followed by ":"
    :return: list of quantifiers
    '''
    quantifier_list = []
    with open(quantifier_path, 'r', encoding='utf-8') as r_file:
        for line in r_file:
            line = line.strip().split(":")
            quantifier_list.append(line[0])
    print('quantifier list size: {}'.format(len(quantifier_list)))
    return quantifier_list


if __name__ == "__main__":
    # 1. train
    quantifier_list = read_quantifier(os.path.join(os.getcwd(), '量名搭配库.txt'))
    model_save_path = os.path.join(os.getcwd(), 'maxent_model.bin')
    data_set = load_data(os.path.join(os.getcwd(), '训练集.txt'))
    train(data_set, model_save_path)
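A hedged usage sketch, not part of the original script: load the saved model, score one suspicious quantifier position, and keep only candidates the library allows, as in step (3) of section I. The sentence "买/v 一/m 匹/q 马/n" and all feature values are illustrative, and filtering against the full quantifier_list here stands in for the finer noun-specific pruning the article describes.

classifier = pickle_load(model_save_path)
features = {
    'pre2_word': '买', 'pre2_pos': 'v',    # second word on the left
    'pre1_word': '一', 'pre1_pos': 'm',    # first word on the left
    'post1_word': '马', 'post1_pos': 'n',  # first word on the right
    'post2_word': 'post1', 'post2_pos': 'post1_pos',  # right-side padding
}
dist = predict(classifier, features)
ranked = sorted(dist.samples(), key=dist.prob, reverse=True)
# keep only quantifiers sanctioned by the collocation library
ranked = [q for q in ranked if q in quantifier_list]
print('suggested quantifier:', ranked[0] if ranked else None)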