# 关于根据词库分词的算法逻辑实现(最长词汇匹配原则) / Dictionary-based word segmentation using the longest-match principle
import re
def remove_double_brackets(text):
    """Strip nested parentheses, keeping only the outermost pair of each group.

    Under the longest-match principle a shorter word bracketed inside a
    longer bracketed word is redundant, so ``(a(b)c)`` becomes ``(abc)``.
    Independent sibling groups such as ``(a)(b)`` are left untouched.

    The text is scanned once while tracking the nesting depth, so the
    function terminates on any input and never merges separate groups.

    Args:
        text: String that may contain ASCII ``(`` / ``)`` characters.

    Returns:
        A new string in which every parenthesis nested inside another
        open parenthesis has been removed. For unbalanced input, a stray
        ``)`` with no open group is kept as-is, and an unclosed outer
        ``(`` is kept while parentheses nested inside it are dropped.
    """
    kept = []
    depth = 0
    for ch in text:
        if ch == '(':
            depth += 1
            # Only the opening bracket of an outermost group survives.
            if depth == 1:
                kept.append(ch)
        elif ch == ')' and depth > 0:
            depth -= 1
            # Only the bracket that closes an outermost group survives.
            if depth == 0:
                kept.append(ch)
        else:
            # Ordinary characters, and a stray ')' outside any group.
            kept.append(ch)
    return ''.join(kept)
import os
def fun(data, block, dic, user_dic):
    """Collect every dictionary word that occurs as a substring of ``data``.

    For each start position, windows from ``block`` characters down to
    2 characters are tried (longest first); single-character windows are
    never tried. Each window found in ``dic`` is added to ``user_dic``.

    Args:
        data: Text to scan.
        block: Maximum window length to try.
        dic: Container of known words, queried with ``in``.
        user_dic: Set that receives the matched words (mutated in place).

    Returns:
        None. Results are delivered through ``user_dic``.
    """
    total = len(data)
    for start in range(total):
        # Clamp the window so it never runs past the end of the text.
        widest = min(block, total - start)
        for width in range(widest, 1, -1):
            piece = data[start:start + width]
            if piece in dic:
                user_dic.add(piece)
if __name__ == "__main__":
    # Known vocabulary (the segmentation dictionary).
    ss = {'我爱北京天安门', '北京', '天安', '爱北', '张三'}
    # The longest dictionary word determines the maximum window size.
    block = max(len(word) for word in ss)
    a = '我爱北京天安门,天安门上太阳升'
    user_dic = set()
    # Slide windows of up to `block` characters over the sentence and
    # collect every dictionary word that appears in it.
    fun(a, block, ss, user_dic)
    # Dictionary words that occur in the sentence.
    print(user_dic)
    # Wrap each found word in parentheses, longest first. Ties between
    # equal-length words are broken lexicographically so the result does
    # not depend on the (hash-randomized) iteration order of the set.
    for i in sorted(user_dic, key=lambda w: (-len(w), w)):
        a = a.replace(i, f'({i})')
    # Collapse nested parentheses: under the longest-match principle only
    # the outermost (longest) bracketed word is kept.
    a = remove_double_brackets(a)
    print(a)