当前位置: 首页 > news > 正文

文本数据词汇级增强

import random
import re

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# Run once to fetch the NLTK resources used below:
# nltk.download('wordnet')
# nltk.download('punkt')


def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    Underscores in multi-word lemma names are replaced by spaces. Lemmas
    that equal ``word`` ignoring case are skipped so that a "synonym" is
    never just the input word itself. The result is sorted so that a
    seeded ``random`` produces reproducible augmentations.
    """
    lowered = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace('_', ' ')
            if name.lower() != lowered:
                synonyms.add(name)
    return sorted(synonyms)


def synonym_replacement(text, n=1):
    """Replace up to ``n`` distinct alphabetic words of ``text`` by synonyms.

    Words without any WordNet synonym are left alone. If ``text`` holds
    fewer distinct words than ``n``, as many as possible are replaced.
    Replacement is whole-word only and done in a single pass, so a word is
    never rewritten inside a longer word and an inserted synonym is never
    replaced again.
    """
    # Only purely alphabetic tokens are candidates; dict.fromkeys removes
    # duplicates while keeping first-seen order.
    candidates = list(dict.fromkeys(
        token for token in word_tokenize(text) if token.isalpha()
    ))
    n = min(n, len(candidates))
    if n <= 0:
        return text

    replacements = {}
    for word in random.sample(candidates, n):
        synonyms = get_synonyms(word)
        if synonyms:
            replacements[word] = random.choice(synonyms)
    if not replacements:
        return text

    # Longest alternatives first so a shorter word cannot shadow a longer one.
    alternatives = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in alternatives) + r")\b"
    )
    # A function replacement avoids backslash/group interpretation of the
    # synonym text.
    return pattern.sub(lambda match: replacements[match.group(0)], text)


def random_insertion(text, n=1):
    """Insert up to ``n`` synonyms of random words at random token positions.

    Each round picks one alphabetic word of the original text; if it has a
    synonym, that synonym is inserted at a random position. All original
    tokens (including punctuation and hyphenated words) are kept. When
    nothing could be inserted, ``text`` is returned unchanged; otherwise the
    tokens are joined with single spaces.
    """
    tokens = word_tokenize(text)
    candidates = [token for token in tokens if token.isalpha()]
    if not candidates:
        return text

    inserted = False
    for _ in range(n):
        synonyms = get_synonyms(random.choice(candidates))
        if not synonyms:
            continue
        # randint is inclusive, so len(tokens) means "append at the end".
        position = random.randint(0, len(tokens))
        tokens.insert(position, random.choice(synonyms))
        inserted = True

    return " ".join(tokens) if inserted else text


def random_swap(text, n=1):
    """Swap the positions of two random alphabetic words, ``n`` times.

    Only alphabetic tokens change places; punctuation and other tokens keep
    their positions. With fewer than two alphabetic words, ``text`` is
    returned unchanged; otherwise the tokens are joined with single spaces.
    """
    tokens = word_tokenize(text)
    word_positions = [i for i, token in enumerate(tokens) if token.isalpha()]
    if len(word_positions) < 2:
        return text

    for _ in range(n):
        first, second = random.sample(word_positions, 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
    return " ".join(tokens)


def random_deletion(text, p=0.1):
    """Delete each token of ``text`` independently with probability ``p``.

    At least one token always survives: if every token was dropped, a single
    randomly chosen token is returned. Text that tokenizes to nothing is
    returned unchanged.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return text

    kept = [token for token in tokens if random.random() > p]
    if not kept:
        # Keep at least one token.
        return random.choice(tokens)
    return " ".join(kept)


def aug(text, method=()):
    """Apply the augmentation steps named in ``method`` to ``text`` in order.

    Recognised step names:
      * ``'re'``    - synonym replacement on a third of the space-separated words
      * ``'in'``    - random insertion, a quarter of the space-separated words
      * ``'swap'``  - two random swaps
      * ``'delet'`` (or ``'delete'``) - random deletion with p=0.2

    Unknown names are ignored. The counts used for ``'re'`` and ``'in'`` are
    printed for inspection.
    """
    for step in method:
        if step == 're':
            count = len(text.split(" ")) // 3
            print("re:", count)
            text = synonym_replacement(text, n=count)
        elif step == 'in':
            count = len(text.split(" ")) // 4
            print("in:", count)
            text = random_insertion(text, n=count)
        elif step == "swap":
            text = random_swap(text, n=2)
        elif step in ("delet", "delete"):
            text = random_deletion(text, p=0.2)
    return text


if __name__ == '__main__':
    text = (
        "Advanced data analytics enables organizations to extract actionable "
        "insights from large datasets, leveraging machine learning algorithms "
        "and statistical models to optimize decision-making, enhance "
        "operational efficiency, and predict market trends with high accuracy, "
        "ultimately driving competitive advantage through data-driven "
        "strategies in industries such as finance, healthcare, and e-commerce."
    )
    print(len(text.split(" ")))
    # The individual steps (synonym_replacement, random_insertion,
    # random_swap, random_deletion) can also be chained by hand.
    final_text = aug(text, ["re", "in", "swap"])
    print(f"原始文本: {text}")
    print(f"增强后文本: {final_text}")

使用nlpaug库

import nlpaug.augmenter.word as naw

# Synonym-replacement augmenter backed by WordNet.
# NOTE(review): per the nlpaug docs aug_p is the fraction of words to augment - confirm.
aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
text = (
    "Advanced data analytics enables organizations to extract actionable "
    "insights from large datasets, leveraging machine learning algorithms "
    "and statistical models to optimize decision-making, enhance "
    "operational efficiency, and predict market trends with high accuracy, "
    "ultimately driving competitive advantage through data-driven "
    "strategies in industries such as finance, healthcare, and e-commerce."
)
result = aug.augment(text)
# Recent nlpaug releases return a list of strings even for a single input,
# while older ones return a plain string; normalise to one string so the
# equality check and the printout below are meaningful.
if isinstance(result, list):
    augmented_text = result[0] if result else text
else:
    augmented_text = result
if text == augmented_text:
    print(1)
print(f"原始文本: {text}")
print(f"增强后文本: {augmented_text}")

# Alternative: substitute words using word embeddings.
# aug = naw.WordEmbsAug(
#     model_type='word2vec', model_path='path/to/word2vec.bin',
#     action="substitute")

http://www.xdnf.cn/news/489367.html

相关文章:

  • Python 之类型注解
  • MCU开发学习记录16* - 看门狗学习与实践(HAL库) - IWDG与WWDG -STM32CubeMX
  • java加强 -IO流
  • 基于React的高德地图api教程005:圆形标记的绘制、删除、修改
  • 【AI学习】AI大模型技术发展研究月报的生成提示词
  • 【Linux】序列化与反序列化、会话与进程组、守护进程
  • 投影仪基础知识及选购方向小记③
  • 曝光融合(Exposure Fusion)
  • 【大模型系列篇】驱动编码助手Cursor与Windsurf工作的隐藏算法解读
  • 小结:jvm 类加载过程
  • 车道线检测----Lane-ATT
  • Linux自有服务
  • LLM学习笔记(四)信息论
  • 公路水运安全员B证主要考核内容有哪些
  • 中级统计师-统计学基础知识-第一章
  • C++ lambda表达式
  • 构建稳定的金字塔模式生态:从自然法则到系统工程
  • LVGL常见面试题
  • 腾讯云MCP数据智能处理:简化数据探索与分析的全流程指南
  • S32DS中定义的全局变量对应的路径查看${ProjDirPath}
  • ConcurrentSkipListMap的深入学习
  • 中国 MRO 的市场概况及发展趋势
  • LlamaIndex 第九篇 Indexing索引
  • C# RSA加密
  • No module named‘serial‘解决办法
  • 计算机视觉----感兴趣区域(ROI)、非极大值抑制
  • 日语简单记录
  • 物联网设备远程管理:基于代理IP的安全固件更新通道方案
  • 共有四个站进行码分多址CDMA通信。四个站的码片序列为......
  • 地磁传感器RM3100简单介绍