当前位置：首页 > news > 正文

文本数据词汇级增强

news 2025/7/12 18:50:10

"""Word-level text augmentation helpers built on NLTK and WordNet.

Provides four classic "easy data augmentation" operations -- synonym
replacement, random insertion, random swap and random deletion -- plus a
small dispatcher, ``aug``, that chains them.
"""

import random
import re

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# One-time setup: the WordNet corpus and the punkt tokenizer models must be
# available locally.  Uncomment on first run:
# nltk.download('wordnet')
# nltk.download('punkt')


def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of strings.

    Lemma names equal to ``word`` itself are skipped, and underscores in
    multi-word lemmas are turned into spaces.  The result is de-duplicated
    and sorted so that seeding ``random`` gives reproducible choices.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word:
                synonyms.add(lemma.name().replace('_', ' '))
    return sorted(synonyms)


def synonym_replacement(text, n=1):
    """Replace ``n`` randomly chosen alphabetic words with a synonym.

    Args:
        text: Input sentence.
        n: Number of words to pick for replacement.

    Returns:
        The text with every whole-word occurrence of each picked word
        replaced by one randomly chosen synonym.  Words without any synonym
        are left alone.  If the text has fewer than ``n`` alphabetic tokens
        it is returned unchanged.
    """
    # Only purely alphabetic tokens are candidates for replacement.
    words = [token for token in word_tokenize(text) if token.isalpha()]
    if len(words) < n:
        return text

    for target in random.sample(words, n):
        synonyms = get_synonyms(target)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        # Match whole words only: a plain str.replace would also rewrite
        # substrings of longer words (e.g. "data" inside "datasets").
        pattern = r"\b" + re.escape(target) + r"\b"
        # A callable replacement keeps the synonym literal (no backslash or
        # group-reference processing by re.sub).
        text = re.sub(pattern, lambda _match, repl=synonym: repl, text)
    return text


def random_insertion(text, n=1):
    """Insert up to ``n`` synonyms of random words at random positions.

    Args:
        text: Input sentence.
        n: Number of insertion attempts.

    Returns:
        The space-joined alphabetic tokens with the inserted synonyms.  Note
        that punctuation and non-alphabetic tokens are dropped as soon as one
        insertion succeeds; if no attempt finds a synonym, the original
        ``text`` is returned untouched.
    """
    words = [token for token in word_tokenize(text) if token.isalpha()]
    augmented_text = text

    for _ in range(n):
        if not words:
            break
        # Pick a random word (previously inserted synonyms are candidates
        # too) and look up its synonyms.
        random_word = random.choice(words)
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            # randint is inclusive, so appending at the very end is possible.
            insertion_idx = random.randint(0, len(words))
            words.insert(insertion_idx, synonym)
            augmented_text = " ".join(words)
    return augmented_text


def random_swap(text, n=1):
    """Swap the positions of two random words, ``n`` times.

    Only alphabetic tokens are kept; the result is those tokens joined by
    single spaces.  Texts with fewer than two alphabetic tokens are returned
    as the (possibly empty) join of their tokens without swapping.
    """
    augmented_words = [
        token for token in word_tokenize(text) if token.isalpha()
    ]

    for _ in range(n):
        if len(augmented_words) < 2:
            break
        # Two distinct positions, exchanged in place.
        idx1, idx2 = random.sample(range(len(augmented_words)), 2)
        augmented_words[idx1], augmented_words[idx2] = (
            augmented_words[idx2],
            augmented_words[idx1],
        )
    return " ".join(augmented_words)


def random_deletion(text, p=0.1):
    """Drop each token of ``text`` independently with probability ``p``.

    Returns the surviving tokens joined by spaces.  If the text has no
    tokens it is returned unchanged; if every token would be deleted, one
    randomly chosen token is returned so the result is never empty.
    """
    words = word_tokenize(text)
    if not words:
        return text

    # A token survives when its random draw exceeds p.
    remaining_words = [word for word in words if random.uniform(0, 1) > p]
    if not remaining_words:
        # Keep at least one word.
        return random.choice(words)
    return " ".join(remaining_words)


def aug(text, method=None):
    """Apply a sequence of augmentation steps to ``text``.

    Args:
        text: Input sentence.
        method: Iterable of step names applied in order.  Recognised names
            are ``'re'`` (synonym replacement on a third of the
            space-separated words), ``'in'`` (random insertion, a quarter of
            the words), ``"swap"`` (two random swaps) and ``"delet"``
            (random deletion with p=0.2).  Unknown names are ignored.
            ``None`` (the default) applies nothing.

    Returns:
        The augmented text.
    """
    # Avoid a shared mutable default argument.
    steps = [] if method is None else method

    for step in steps:
        if step == 're':
            # Compute the count once so the debug output reports the value
            # that was actually used, not one derived from the new text.
            count = len(text.split(" ")) // 3
            text = synonym_replacement(text, n=count)
            print("re:", count)
        elif step == 'in':
            count = len(text.split(" ")) // 4
            text = random_insertion(text, n=count)
            print("in:", count)
        elif step == "swap":
            text = random_swap(text, n=2)
        elif step == "delet":
            text = random_deletion(text, p=0.2)
    return text


if __name__ == '__main__':
    text = "Advanced data analytics enables organizations to extract actionable insights from large datasets, leveraging machine learning algorithms and statistical models to optimize decision-making, enhance operational efficiency, and predict market trends with high accuracy, ultimately driving competitive advantage through data-driven strategies in industries such as finance, healthcare, and e-commerce."
    print(len(text.split(" ")))
    # The individual steps (synonym_replacement, random_insertion,
    # random_swap, random_deletion) can also be chained by hand; aug() does
    # the same thing driven by a list of step names.
    final_text = aug(text, ["re", "in", "swap"])
    print(f"原始文本: {text}")
    print(f"增强后文本: {final_text}")

使用nlpaug库

import nlpaug.augmenter.word as naw

# Synonym-replacement augmenter backed by WordNet; aug_p=0.3 is the
# proportion of words nlpaug is asked to augment.
aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
text = "Advanced data analytics enables organizations to extract actionable insights from large datasets, leveraging machine learning algorithms and statistical models to optimize decision-making, enhance operational efficiency, and predict market trends with high accuracy, ultimately driving competitive advantage through data-driven strategies in industries such as finance, healthcare, and e-commerce."

result = aug.augment(text)
# NOTE(review): newer nlpaug releases return a list of strings even for a
# single input, while older ones return a plain str -- confirm against the
# installed version.  Unwrap so the comparison and the printout below work
# on a string either way.
if isinstance(result, list):
    augmented_text = result[0] if result else text
else:
    augmented_text = result

# Prints 1 when augmentation left the text unchanged.
if text == augmented_text:
    print(1)
print(f"原始文本: {text}")
print(f"增强后文本: {augmented_text}")

# Alternative: substitute words using word embeddings instead of WordNet.
# aug = naw.WordEmbsAug(
#     model_type='word2vec', model_path='path/to/word2vec.bin',
#     action="substitute")

查看全文

http://www.xdnf.cn/news/489367.html