Text proofreading: checking quantifier-noun collocations with maximum entropy (maxent)
I. This article experiments with using maximum entropy to proofread quantifier-noun collocations in text, drawing on Professor Zhang Yangsen's paper 《最大熵方法中特征选择算法的改进与纠错排歧》 (on improved feature selection in the maximum entropy method for error correction and disambiguation).
1. Quantifier-noun error correction steps:
(1). Mine a quantifier-noun collocation library
(2). Use the collocation library to locate possible quantifier-noun collocation errors in a sentence
(3). Feed the sentences containing collocation errors into the maxent correction model, and output the quantifier with the highest predicted probability as the correction (then filter the candidates against the collocation library; a minimal sketch of this step appears at the end of the program)
2. This article mainly explains how to train a quantifier-noun correction model with maximum entropy. The steps are as follows:
(1). Segment the corpus and tag parts of speech
(2). Use the two words on each side of the quantifier, together with their POS tags, as the feature template
(3). Mine features with standard text-mining methods such as information gain, chi-square statistics, and word frequency (a minimal sketch follows this list)
(4). Train
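As an illustration of step (3), here is a minimal frequency-based filter over the feature events produced by the program below; the helper name select_feature_words and the min_count threshold are my own illustrative choices, and chi-square or information-gain scores could be substituted for the raw counts:

from collections import Counter

def select_feature_words(data_set, min_count=5):
    '''Hypothetical helper: keep context words seen at least min_count times
    across all (features_dict, label) events; the threshold is arbitrary.'''
    counter = Counter()
    for features_dict, _label in data_set:
        for key, value in features_dict.items():
            if key.endswith('_word'):  # count only word features, not POS features
                counter[value] += 1
    return {w for w, c in counter.items() if c >= min_count}

Context words that fall below the threshold could then be replaced by a placeholder token before training, which shrinks the feature space.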
II. Program
import nltk
import os
import pickle
'''
How should the maxent feature template be designed?
1. Use the context of the center word, together with its POS tags, as constraint conditions
2. Select feature words by word frequency, mutual information, etc.
3. Could more meaningful constraint conditions be added?
4. Build the feature template from the above and apply it in train and predict
'''
def generate_features_events(word_li):
    '''
    generate feature events: one (features_dict, label) pair for each
    quantifier found in the sentence
    :param word_li: list of (word, pos) tuples for one sentence
    :return: list of (features_dict, label) tuples
    '''
features_events = []
    # pad both ends with pseudo tokens so every position has two words of left/right context
    word_li = [('pre2', 'pre2_pos')] + [('pre1', 'pre1_pos')] + word_li + [('post1', 'post1_pos')] + [('post2', 'post2_pos')]
    for i in range(2, len(word_li) - 2):
        word = word_li[i][0]
        pos = word_li[i][1]
        if pos in ['q', 'm']:
            # a numeral token may fuse number and quantifier (e.g. '三个'):
            # peel off the longest suffix that appears in the quantifier list
            if pos == 'm' and len(word) >= 2:
                m_len = len(word)
                m_str = ''
                for m_index in range(-1, -m_len - 1, -1):  # walk from the last character leftwards
                    if (word[m_index] + m_str) in quantifier_list:
                        m_str = word[m_index] + m_str
                    else:
                        break  # stop once the suffix is no longer a known quantifier
                if len(m_str) != 0:
                    word = m_str
if word in quantifier_list:
# feature dictionary
features_dict = dict()
# 1. first word on the left, pos
features_dict['pre1_word'] = word_li[i-1][0]
features_dict['pre1_pos'] = word_li[i-1][1]
# 2. second word on the left, pos
features_dict['pre2_word'] = word_li[i-2][0]
features_dict['pre2_pos'] = word_li[i-2][1]
# 3. first word on the right, pos
features_dict['post1_word'] = word_li[i+1][0]
features_dict['post1_pos'] = word_li[i+1][1]
# 4. second word on the right, pos
features_dict['post2_word'] = word_li[i+2][0]
features_dict['post2_pos'] = word_li[i+2][1]
# label
label = word
# feature sample
features_events.append((features_dict, label))
return features_events
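# For intuition: on a made-up tagged fragment (assuming '个' is in
# quantifier_list), generate_features_events yields one event per quantifier:
#   generate_features_events([('这', 'r'), ('一', 'm'), ('个', 'q'), ('苹果', 'n'), ('很', 'd'), ('甜', 'a')])
#   -> [({'pre1_word': '一', 'pre1_pos': 'm', 'pre2_word': '这', 'pre2_pos': 'r',
#         'post1_word': '苹果', 'post1_pos': 'n', 'post2_word': '很', 'post2_pos': 'd'}, '个')]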
def load_data(file_name):
    '''
    load a pre-segmented, POS-tagged corpus (space-separated word/pos tokens,
    one tagged sentence per line) and generate the training events
    :param file_name: path of the tagged corpus file
    :return: list of (features_dict, label) tuples
    '''
print('load data and generate features events...')
data_set = []
with open(file_name, 'r', encoding='utf-8') as file_reader:
for line in file_reader:
line = line.strip()
if line:
word_li = line.split()
word_li = [tuple(w.split('/')) for w in word_li if len(w.split('/')) == 2]
# generate features events
data_set.extend(generate_features_events(word_li))
print("generate features events %d" % len(data_set))
return data_set
def predict(classifier, features):
    '''
    predict the quantifier distribution for one feature dictionary
    :param classifier: a trained MaxentClassifier
    :param features: feature dictionary built by generate_features_events
    :return: probability distribution over quantifier labels
    '''
    predict_res = classifier.prob_classify(features)
    # rank candidates via the public ProbDist API instead of the private _prob_dict attribute
    ranked = sorted(((s, predict_res.prob(s)) for s in predict_res.samples()),
                    key=lambda kv: kv[1], reverse=True)
    print('predict: {}'.format(ranked))
    return predict_res
def train(data_set, model_save_path):
    '''
    train the maxent model and evaluate it on a held-out 5% of the events
    :param data_set: list of (features_dict, label) tuples
    :param model_save_path: file path for the pickled model
    :return:
    '''
    print('maxent training...')
    train_size = int(len(data_set) * 0.95)  # 95% for train
    train_set = data_set[:train_size]
    # remaining ~5% for test; slicing with [-test_size:] would return the whole
    # list whenever test_size rounds down to 0, overlapping the training data
    test_set = data_set[train_size:]
print("train set size = ", len(train_set))
print("test set size = ", len(test_set))
# train
classifier_iis = nltk.classify.maxent.MaxentClassifier.train(train_set, trace=3, algorithm='iis', max_iter=100)
print("accuracy = ", nltk.classify.accuracy(classifier_iis, test_set))
# model save
pickle_save(classifier_iis, model_save_path)
def pickle_save(data, filename):
with open(filename, 'wb') as fw:
pickle.dump(data, fw)
def pickle_load(filename):
with open(filename, 'rb') as fr:
model = pickle.load(fr)
return model
def read_quantifier(quantifier_path):
    '''
    read the quantifier list from the collocation library file; each line is
    expected to begin with a quantifier followed by ':'
    :param quantifier_path: path of the collocation library file
    :return: list of quantifiers
    '''
    quantifier_list = []
    with open(quantifier_path, 'r', encoding='utf-8') as r_file:
        for line in r_file:
            line = line.strip().split(":")
            quantifier_list.append(line[0])  # the quantifier is the field before the colon
print('quantifier list size: {}'.format(len(quantifier_list)))
return quantifier_list
if __name__ == "__main__":
# 1.train
quantifier_list = read_quantifier(os.path.join(os.getcwd(), '量名搭配库.txt'))
model_save_path = os.path.join(os.getcwd(), 'maxent_model.bin')
data_set = load_data(os.path.join(os.getcwd(), '训练集.txt'))
train(data_set, model_save_path)
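    # 2.predict: a minimal sketch of step (3) from section I (not part of the
    # original program); the tagged sample sentence is hypothetical, and the
    # top candidate is checked against the quantifier list before suggesting it
    classifier = pickle_load(model_save_path)
    sample = '这/r 一/m 个/q 苹果/n 很/d 甜/a'
    word_li = [tuple(w.split('/')) for w in sample.split() if len(w.split('/')) == 2]
    for features_dict, label in generate_features_events(word_li):
        predict_res = predict(classifier, features_dict)
        best = predict_res.max()  # quantifier with the highest predicted probability
        if best != label and best in quantifier_list:
            print('suggest replacing {} with {}'.format(label, best))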