三十一、基于HMM的词性标注
基于HMM的中文词性标注
1 实验目标
- 理解HMM模型的原理和基本问题
- 理解HMM实现词性标注的具体步骤
- 掌握HMM模型实现词性标注的方法
2 实验环境
- 基于HMM的中文词性标注实验环境:Python 3,并安装NumPy库(hmm.py依赖numpy)。
3 实验步骤
该项目主要由3个代码文件组成,分别为hmm.py
、tagging.py
和run.py
,具体功能如下。
hmm.py
:构建HMM类,包括转移概率矩阵、发射概率矩阵的计算,以及`viterbi`算法的实现。tagging.py
:完成数据预处理,调用HMM算法实现词性的标注。run.py
:主程序入口。
首先创建项目工程目录words_tag,在words_tag目录下创建源码文件hmm.py、tagging.py和run.py,以及目录文件corpus,用于存储renmin.txt数据文件。最终的实验目录结构如下图所示:
3.1 HMM类的实现
- 编写HMM类,完成转移概率矩阵、发射概率矩阵的计算,以及viterbi算法的实现
import numpy as np


class HMM():
    """Hidden Markov Model utilities for sequence tagging.

    Provides construction of the transition and emission probability
    matrices from raw counts, and Viterbi decoding of the most probable
    hidden-state sequence for an observation sequence.
    """

    def build_transition(self, states_n, state_state_n, states):
        """Build the state-transition probability matrix.

        Args:
            states_n: dict mapping a state (tag) to its occurrence count.
            state_state_n: dict mapping ``"<prev>_<next>"`` to the number
                of times ``next`` directly followed ``prev``.
            states: ordered list of states; row/column ``i`` of the result
                corresponds to ``states[i]``.

        Returns:
            A ``(len(states_n), len(states_n))`` float array whose
            ``[i, j]`` entry is
            ``count(states[i]_states[j]) / (count(states[i]) + 1)``,
            or ``0.0`` when either count is missing.
        """
        len_status = len(states_n)
        transition_prob = np.zeros((len_status, len_status), dtype=float)
        for i in range(len_status):
            prev_tag = states[i]
            prev_count = states_n.get(prev_tag)
            if prev_count is None:
                # Unknown state: leave the whole row at zero.
                continue
            for j in range(len_status):
                pair_count = state_state_n.get(prev_tag + '_' + states[j])
                if pair_count is not None:
                    # Conditional probability; the +1 in the denominator
                    # is kept from the original estimate.
                    transition_prob[i, j] = pair_count / (prev_count + 1)
        return transition_prob

    def build_emission(self, states_n, o_state_n, o_sequence, states):
        """Build the emission probability matrix.

        Args:
            states_n: dict mapping a state (tag) to its occurrence count.
            o_state_n: dict mapping ``"<observation>/<state>"`` to its
                occurrence count.
            o_sequence: ordered list of distinct observations (words);
                column ``j`` of the result corresponds to ``o_sequence[j]``.
            states: ordered list of states; row ``i`` corresponds to
                ``states[i]``.

        Returns:
            A ``(len(states), len(o_sequence))`` float array whose
            ``[i, j]`` entry is
            ``count(o_sequence[j]/states[i]) / count(states[i])``,
            or ``0`` when either count is missing.
        """
        emission_prob = np.zeros((len(states), len(o_sequence)), dtype=float)
        for i, tag in enumerate(states):
            tag_count = states_n.get(tag)
            if tag_count is None:
                # Unknown state: leave the whole row at zero.
                continue
            for j, word in enumerate(o_sequence):
                pair_count = o_state_n.get(word + '/' + tag)
                if pair_count is not None:
                    emission_prob[i, j] = pair_count / tag_count
        return emission_prob

    def viterbi(self, o_sequence, A, B, pi):
        """Decode the most probable state sequence with the Viterbi algorithm.

        Args:
            o_sequence: observation sequence as column indices into ``B``.
            A: transition matrix, indexed ``A[prev_state, next_state]``.
            B: emission matrix, indexed ``B[state, observation]``.
            pi: initial state probabilities, one entry per state.

        Returns:
            A list of state indices, one per observation. An empty
            observation sequence yields an empty list.
        """
        n_obs = len(o_sequence)
        if n_obs == 0:
            # Nothing to decode; indexing o_sequence[0] below would fail.
            return []

        len_status = len(pi)
        # status_record[i][t] == [best probability of any path ending in
        # state i at time t, predecessor state on that path].
        status_record = {
            i: [[0, 0] for _ in range(n_obs)] for i in range(len_status)
        }

        first_obs = o_sequence[0]
        for i in range(len_status):
            status_record[i][0][0] = pi[i] * B[i, first_obs]

        # NOTE: probabilities are multiplied directly (no log-space), so
        # very long sequences may underflow to zero.
        for t in range(1, n_obs):
            obs = o_sequence[t]
            for i in range(len_status):
                best_prob, best_prev = -1, 0
                for j in range(len_status):
                    candidate = status_record[j][t - 1][0] * A[j, i]
                    # Strict comparison keeps the lowest index on ties.
                    if candidate > best_prob:
                        best_prob, best_prev = candidate, j
                status_record[i][t][0] = best_prob * B[i, obs]
                status_record[i][t][1] = best_prev

        return self.get_state_sequence(len_status, o_sequence, status_record)

    def get_state_sequence(self, len_status, o_seq, status_record):
        """Backtrack through ``status_record`` to recover the best path.

        Args:
            len_status: number of states.
            o_seq: the observation sequence (only its length is used).
            status_record: mapping ``state -> [[prob, prev_state], ...]``
                with one ``[prob, prev_state]`` pair per time step, as
                filled in by ``viterbi``.

        Returns:
            The list of state indices with the highest final probability,
            ordered from the first to the last observation. Empty when
            ``o_seq`` is empty.
        """
        if len(o_seq) == 0:
            return []

        t = len(o_seq) - 1
        best_prob, best_state = 0, 0
        for i in range(len_status):
            if best_prob < status_record[i][t][0]:
                best_prob = status_record[i][t][0]
                best_state = i

        # Collect states from last to first, then reverse.
        state_sequence = [best_state]
        while t > 0:
            best_state = status_record[best_state][t][1]
            state_sequence.append(best_state)
            t -= 1
        state_sequence.reverse()
        return state_sequence
3.2 PosTagging类的实现
- 编写PosTagging类,调用HMM算法实现词性标注。
import re
from words_tag.hmm import HMM


class PosTagging():
    """Part-of-speech tagger trained from a ``word/tag`` annotated corpus.

    Collects word/tag, tag and tag-bigram counts from the corpus, builds
    the HMM probability matrices from them, and tags tokenised sentences
    with Viterbi decoding.
    """

    # Leading id token of a corpus line, e.g. "19980101-01-001-001/m ".
    # Compiled once; a raw string avoids invalid "\d" escape warnings.
    _LINE_ID_PATTERN = re.compile(r"\d{8}-\d{2}-\d{3}-\d{3}/m? ")

    def __init__(self):
        self.term_tag_n = {}  # "word/tag" token -> occurrence count
        self.tag_tag_n = {}  # "prev_next" tag bigram -> occurrence count
        self.tags_n = {}  # tag -> occurrence count in the corpus
        self.term_list = []  # observation vocabulary (distinct words)
        self.states = []  # hidden states (distinct tags)
        self.hmm = HMM()  # HMM algorithm implementation
        # Filled in by process_corpus().
        self.transition = None
        self.emission = None
        self.pi = None

    def process_corpus(self, path):
        """Read the corpus at ``path`` and build the model parameters.

        Each line is stripped of its leading id token and split into
        sentences at tokens tagged ``/w``; text after the last ``/w`` on
        a line is dropped. Counts are accumulated into ``term_tag_n``,
        ``tags_n`` and ``tag_tag_n``, then ``states``, ``term_list``,
        ``transition``, ``emission`` and ``pi`` are (re)built.

        Args:
            path: path of the UTF-8 encoded corpus file.
        """
        term_set = set()
        with open(file=path, mode='r', encoding='utf-8') as f:
            for line in f:
                line = self._LINE_ID_PATTERN.sub("", line)
                # Split into sentences; the "/w" separator is re-attached
                # so the closing token keeps its tag.
                for segment in line.split("/w")[:-1]:
                    # The first token of a sentence follows the pseudo
                    # tag 'Pos'.
                    prev_tag = 'Pos'
                    for term in (segment + '/w').split():
                        self.term_tag_n[term] = \
                            self.term_tag_n.get(term, 0) + 1
                        word_tag = term.split('/')
                        word, tag = word_tag[0], word_tag[-1]
                        term_set.add(word)
                        self.tags_n[tag] = self.tags_n.get(tag, 0) + 1
                        tag_tag = prev_tag + '_' + tag
                        self.tag_tag_n[tag_tag] = \
                            self.tag_tag_n.get(tag_tag, 0) + 1
                        prev_tag = tag

        self.states = list(self.tags_n.keys())
        self.term_list = list(term_set)
        self.transition = self.hmm.build_transition(
            self.tags_n, self.tag_tag_n, self.states)
        self.emission = self.hmm.build_emission(
            self.tags_n, self.term_tag_n, self.term_list, self.states)
        self.build_init_prob()

    def build_init_prob(self):
        """Compute the initial state probabilities ``self.pi``.

        Each tag's probability is its corpus count divided by the total
        number of recorded tag bigrams (one per token), in the same order
        as ``self.tags_n``.
        """
        # NOTE(review): this is the overall tag frequency; the 'Pos_'
        # sentence-initial counts gathered in process_corpus are not used
        # here. Confirm whether sentence-initial frequencies were intended.
        sum_tag = sum(self.tag_tag_n.values())
        self.pi = [count / sum_tag for count in self.tags_n.values()]

    def predict_tag(self, sentence):
        """Tag a tokenised sentence and print the result.

        Args:
            sentence: sequence of words (already segmented). An empty
                sentence prints nothing.

        Raises:
            ValueError: if a word does not occur in the training corpus.
        """
        if len(sentence) == 0:
            return
        o_seq = self.convert_sentence(sentence)
        s_seq = self.hmm.viterbi(o_seq, self.transition,
                                 self.emission, self.pi)
        self.out_put_result(o_seq, s_seq, self.term_list, self.states)

    def convert_sentence(self, sentence):
        """Convert words to their indices in ``self.term_list``.

        Args:
            sentence: sequence of words.

        Returns:
            A list with the vocabulary index of each word.

        Raises:
            ValueError: if a word is not in the vocabulary.
        """
        # Build the lookup once per call instead of a linear
        # list.index() scan per word; setdefault keeps the first index
        # should the list contain duplicates.
        term_index = {}
        for idx, term in enumerate(self.term_list):
            term_index.setdefault(term, idx)
        try:
            return [term_index[word] for word in sentence]
        except KeyError as err:
            raise ValueError(
                "word {!r} is not in the training vocabulary".format(
                    err.args[0])) from None

    def out_put_result(self, o_seq, s_seq, term_list, states):
        """Print the tagged sentence as space-separated ``word/tag`` pairs.

        Args:
            o_seq: word indices into ``term_list``.
            s_seq: state indices into ``states``, aligned with ``o_seq``.
            term_list: vocabulary list.
            states: tag list.
        """
        for term_idx, state_idx in zip(o_seq, s_seq):
            print(term_list[term_idx] + '/' + states[state_idx], end=' ')
3.3 主函数
# coding:utf-8
from pathlib import Path

# The module created for the tagger is tagging.py, so import from
# words_tag.tagging (the previous "taging" spelling does not exist).
from words_tag.tagging import PosTagging

if __name__ == "__main__":
    # Resolve the corpus next to this script so the run does not depend
    # on the current working directory.
    corpus_path = Path(__file__).resolve().parent / "corpus" / "renmin.txt"

    pt = PosTagging()
    pt.process_corpus(str(corpus_path))
    pt.predict_tag(['你', '可以', '永远', '相信', '这', '界', '年轻人', '。'])
3.4 运行结果
通过执行上述代码,程序在控制台输出的结果如下所示。
你/r 可以/v 永远/d 相信/v 这/r 界/n 年轻人/n 。/w
4 实验小结
在本章中使用HMM模型实现了词性标注的任务。从程序运行结果可以看出,HMM算法能够给出句子中每个单词的词性。