Named Entity Recognition Experiment
Introduction
Named Entity Recognition (NER) is one of the most classic tasks in NLP: it extracts proper-noun entities such as person names, place names, organization names, company names, and drug names, and is widely used in search, dialogue, question answering, knowledge-base construction, and similar scenarios. Compared with recurrent neural networks (RNN), long short-term memory networks (LSTM), and the traditional hidden Markov model (HMM) and conditional random field (CRF), a Transformer-based pre-trained BERT model captures contextual semantics better and thus improves recognition performance.
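As a quick illustration of the task format, NER is usually cast as sequence labeling with one tag per token. The sentence and BIO tags below are hypothetical, for intuition only; the actual tag scheme of this experiment comes from the CLUENER label2id file loaded later:
tokens = ["马", "云", "创", "立", "了", "阿", "里", "巴", "巴"]
tags = ["B-name", "I-name", "O", "O", "O", "B-company", "I-company", "I-company", "I-company"]
for token, tag in zip(tokens, tags):
    print(token, tag)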
This experiment implements a BERT+CRF named entity recognition model with MindSpore 1.3 on the Huawei Cloud ModelArts platform.
Code Analysis
Sync the data and source code to the local container
import moxing as mox
# moxing copies files in parallel between OBS (s3://) and the local container file system
mox.file.copy_parallel(src_url="s3://ner-ccc/ner/src/", dst_url='./src/')
mox.file.copy_parallel(src_url="s3://ner-ccc/ner/data/", dst_url='./data/')
mox.file.copy_parallel(src_url="s3://ner-ccc/ner/pre_model/", dst_url='./pre_model/')
Import dependencies
import os
import argparse
import numpy as np
import json
from sklearn.metrics import classification_report # needs to be imported early (before MindSpore)
import mindspore.nn as nn
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from mindspore import context
from mindspore import log as logger
from mindspore.common.tensor import Tensor
import mindspore.dataset as de
from mindspore.ops import operations as P
import mindspore.dataset.transforms.c_transforms as C
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.nn.optim import AdamWeightDecay
from mindspore.train.model import Model
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor, LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.common.initializer import TruncatedNormal
from src import tokenization
from src.CRF import CRF
from src.CRF import postprocess
from src.cluener_evaluation import process_one_example_p, label_generation
from src.utils import BertLearningRate, CrossEntropyCalculation # CrossEntropyCalculation is used by BertNER when use_crf=False
from src.bert_for_finetune import BertFinetuneCell
from src.config import optimizer_cfg
from src.bert_model import BertConfig, BertModel
Set up the runtime environment
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
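This experiment targets an Ascend NPU. If you adapt it to another backend, MindSpore also accepts "GPU" or "CPU" as the device_target (assuming a matching MindSpore build is installed):
# context.set_context(mode=context.GRAPH_MODE, device_target="GPU")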
Define the parameter configuration
cfg = edict({
    'is_train': True,
    'num_labels': 41,
    'schema_file': r'./data/clue_ner/schema.json',
    'ckpt_prefix': 'bert-ner-crf',  # 'bert-ner' 'bert-ner-crf'
    'train_file': r'./data/clue_ner/train.tf_record',
    'eval_file': r'./data/clue_ner/dev.tf_record',
    'use_crf': True,
    'epoch_num': 5,
    'batch_size': 16,
    'ckpt_dir': 'ckpt',
    'pre_training_ckpt': './pre_model/bert_base.ckpt',
    'finetune_ckpt': './ckpt/bert-ner-crf-5_671.ckpt',
    'label2id_file': './data/clue_ner/label2id.json',
    'vocab_file': './data/vocab.txt',
    'eval_out_file': 'ner_crf_result.txt'  # ner_result.txt ner_crf_result.txt
})
bert_net_cfg = BertConfig(
    seq_length=128,
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float16
)
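The vocab_size above must match the vocabulary file shipped with the pre-trained checkpoint. A quick sanity check (assuming vocab.txt holds one token per line, as in standard BERT vocabularies):
with open(cfg.vocab_file, encoding='utf-8') as f:
    print(sum(1 for _ in f))  # expected to print 21128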
Define the dataset loading function
def get_dataset(data_file, schema_file, batch_size):
    '''
    get dataset
    '''
    ds = de.TFRecordDataset([data_file], schema_file,
                            columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
    # cast all four columns to int32
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    ds = ds.map(input_columns="label_ids", operations=type_cast_op)
    # apply shuffle operation
    buffer_size = 960
    ds = ds.shuffle(buffer_size=buffer_size)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Test the dataset
next(get_dataset(cfg.train_file, cfg.schema_file, batch_size=1).create_dict_iterator())['input_ids'][0]
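Each column of a batch should come back with shape (batch_size, seq_length); a quick sketch to confirm:
batch = next(get_dataset(cfg.train_file, cfg.schema_file, batch_size=1).create_dict_iterator())
for name, value in batch.items():
    print(name, value.shape)  # expect (1, 128) for every column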
Define the BertNER model
class BertNER(nn.Cell):
    """
    Train interface for sequence labeling finetuning task.
    """
    def __init__(self, config, batch_size, is_training, num_labels=11, use_crf=False,
                 tag_to_index=None, dropout_prob=0.0, use_one_hot_embeddings=False):
        super(BertNER, self).__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        self.dense_1 = nn.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
                                has_bias=True).to_float(config.compute_type)
        # MindSpore's nn.Dropout takes the keep probability, hence 1 - dropout_prob
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.reshape = P.Reshape()
        self.shape = (-1, config.hidden_size)
        self.use_crf = use_crf
        self.origin_shape = (batch_size, config.seq_length, self.num_labels)
        if use_crf:
            if not tag_to_index:
                raise Exception("The dict for tag-index mapping should be provided for CRF.")
            self.loss = CRF(tag_to_index, batch_size, config.seq_length, is_training)
        else:
            self.loss = CrossEntropyCalculation(is_training)

    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        sequence_output, _, _ = \
            self.bert(input_ids, token_type_id, input_mask)
        seq = self.dropout(sequence_output)
        seq = self.reshape(seq, self.shape)
        logits = self.dense_1(seq)
        logits = self.cast(logits, self.dtype)
        if self.use_crf:
            return_value = self.reshape(logits, self.origin_shape)
            loss = self.loss(return_value, label_ids)
        else:
            return_value = self.log_softmax(logits)
            loss = self.loss(return_value, label_ids, self.num_labels)
        return loss
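A brief note on why the CRF sits on top: instead of classifying each token independently, a linear-chain CRF scores whole tag sequences, combining the per-token emission logits from dense_1 with a learned transition matrix between adjacent tags, so that implausible transitions (for example an I- tag directly after O) are penalized. In standard linear-chain CRF notation the training loss is the negative log-likelihood
loss = -(s(x, y) - log Σ_{y'} exp s(x, y')), with s(x, y) = Σ_t (emission[t, y_t] + transition[y_{t-1}, y_t])
where emission comes from the dense layer and transition is learned by the CRF layer.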
Load the label-to-ID mapping
with open(cfg.label2id_file) as f:
    tag_to_index = json.load(f)
if cfg.use_crf:
    print(tag_to_index)
    # the CRF implementation needs two extra tags marking sequence start and stop
    max_val = len(tag_to_index)
    tag_to_index["<START>"] = max_val
    tag_to_index["<STOP>"] = max_val + 1
    number_labels = len(tag_to_index)
else:
    number_labels = cfg.num_labels
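Because <START> and <STOP> never appear in the data, they only participate in the transition scores. With use_crf=True, number_labels therefore becomes 41 + 2 = 43; with use_crf=False it stays at cfg.num_labels = 41.
print(number_labels)  # 43 when use_crf=True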
Define the training function
def train():
    '''
    finetune function
    '''
    # BertNER train for sequence labeling
    netwithloss = BertNER(bert_net_cfg, cfg.batch_size, True, num_labels=number_labels,
                          use_crf=cfg.use_crf,
                          tag_to_index=tag_to_index, dropout_prob=0.1)
    dataset = get_dataset(data_file=cfg.train_file, schema_file=cfg.schema_file, batch_size=cfg.batch_size)
    steps_per_epoch = dataset.get_dataset_size()
    print('steps_per_epoch:', steps_per_epoch)
    # optimizer with linear warmup over the first 10% of steps and polynomial decay
    lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
                                   end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
                                   warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
                                   decay_steps=steps_per_epoch * cfg.epoch_num,
                                   power=optimizer_cfg.AdamWeightDecay.power)
    params = netwithloss.trainable_params()
    decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
    other_params = list(filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
                    {'params': other_params, 'weight_decay': 0.0}]
    optimizer = AdamWeightDecay(group_params, lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps)
    # checkpoint saving: one checkpoint per epoch, keep only the latest
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix, directory=cfg.ckpt_dir, config=ckpt_config)
    # load the pre-trained BERT weights into the network
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    load_param_into_net(netwithloss, param_dict)
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossMonitor(), ckpoint_cb]
    model.train(cfg.epoch_num, dataset, callbacks=callbacks, dataset_sink_mode=True)
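To make the schedule concrete: if the CLUENER training split yields 671 steps per epoch at batch_size=16 (consistent with the checkpoint name bert-ner-crf-5_671.ckpt configured above), then decay_steps = 5 × 671 = 3355 and warmup_steps = int(3355 × 0.1) = 335.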
Start training
train()
Load the fine-tuned model
# rebuild the network with batch_size=1 and is_training=False for evaluation and inference
netwithloss = BertNER(bert_net_cfg, 1, False, num_labels=number_labels,
                      use_crf=cfg.use_crf,
                      tag_to_index=tag_to_index)
netwithloss.set_train(False)
param_dict = load_checkpoint(cfg.finetune_ckpt)
load_param_into_net(netwithloss, param_dict)
model = Model(netwithloss)
tokenizer_ = tokenization.FullTokenizer(vocab_file=cfg.vocab_file)
Define the evaluation function
def eval():
    '''
    evaluation function
    '''
    dataset = get_dataset(cfg.eval_file, cfg.schema_file, 1)
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    y_true, y_pred = [], []
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, token_type_id, label_ids = input_data
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        if cfg.use_crf:
            # the CRF head returns Viterbi backpointers; decode the best tag path
            backpointers, best_tag_id = logits
            best_path = postprocess(backpointers, best_tag_id)
            logit_ids = []
            for ele in best_path:
                logit_ids.append(ele)
        else:
            logits = logits.asnumpy()
            logit_ids = np.argmax(logits, axis=-1)
        for ids in label_ids.asnumpy():
            y_true.extend(ids)
        for ids in logit_ids:
            y_pred.extend(ids)
    print(classification_report(y_true, y_pred, labels=range(1, 41),
                                target_names=list(tag_to_index.keys())[1:41]))
Run the evaluation
eval()
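Note that classification_report measures token-level tag accuracy here; the official CLUENER benchmark reports entity-level F1 instead, so the two numbers are not directly comparable.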
Define the online inference function
def inference(text):
    """
    online inference
    """
    feature = process_one_example_p(tokenizer_, cfg.vocab_file, text, max_seq_len=bert_net_cfg.seq_length)
    input_ids, input_mask, token_type_id = feature
    input_ids = Tensor(np.array(input_ids), mstype.int32)
    input_mask = Tensor(np.array(input_mask), mstype.int32)
    token_type_id = Tensor(np.array(token_type_id), mstype.int32)
    if cfg.use_crf:
        # Tensor(1) is a placeholder for the label_ids argument, which is unused at predict time
        backpointers, best_tag_id = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        best_path = postprocess(backpointers, best_tag_id)
        logits = []
        for ele in best_path:
            logits.extend(ele)
        ids = logits
    else:
        logits = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        ids = logits.asnumpy()
        ids = np.argmax(ids, axis=-1)
        ids = list(ids)
    res = label_generation(text=text, probs=ids, tag_to_index=tag_to_index)
    return res
Online inference test
inference("温格的球队终于又踢了一场经典的比赛,2比1战胜曼联之后枪手仍然留在了夺冠集团之内,")
inference("郑阿姨就赶到文汇路排队拿钱,希望能将缴纳的一万余元学费拿回来,顺便找校方或者教委要个说法。")