当前位置: 首页 > ai >正文

NLP常用工具包

✨按 NLP 项目的常见流程,对常用工具(分词器、词表、DataLoader)的用法做一次拆解

1. tokenizer

from torchtext.data.utils import get_tokenizer

# Build torchtext's built-in 'basic_english' tokenizer and run it on one
# sample sentence; the result is a plain list of string tokens.
tokenizer = get_tokenizer('basic_english')
text_sample = "We're going on an adventure! The weather is really nice today."
tokens = tokenizer(text_sample)
print(tokens)

['we', "'", 're', 'going', 'on', 'an', 'adventure', '!', 'the', 'weather', 'is', 'really', 'nice', 'today', '.']

2. vocab

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# Create the tokenizer
tokenizer = get_tokenizer('basic_english')

# Test data
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# Build the vocabulary from the tokenized sentences; the two special tokens
# are registered explicitly via `specials`.
vocab = build_vocab_from_iterator(
    (tokenizer(sentence) for sentence in test_sentences),
    specials=['<unk>', '<pad>'],
    min_freq=1,  # set the minimum frequency to 1
)
# Use the index of '<unk>' as the default index of the vocabulary.
vocab.set_default_index(vocab['<unk>'])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])

词表大小: 21   

'fox'的索引: 10

3. DataLoader(示例1:无标签文本)

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# 1. Create the tokenizer
tokenizer = get_tokenizer('basic_english')

# 2. Test data
train_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
    # You can add more training sentences here
]
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# 3. Build the vocabulary (from the training sentences only)
vocab = build_vocab_from_iterator(
    (tokenizer(sentence) for sentence in train_sentences),
    specials=['<unk>', '<pad>'],
    min_freq=1,
)
# Use the index of '<unk>' as the default index of the vocabulary.
vocab.set_default_index(vocab['<unk>'])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])

# 4. Custom Dataset
class TextDataset(Dataset):
    """Dataset that turns raw sentences into tensors of vocabulary indices.

    Args:
        sentences: Indexable collection of sentence strings.
        vocab: Mapping-like object; ``vocab[token]`` must return an int index.
        tokenizer: Callable that splits one sentence into a list of tokens.
    """

    def __init__(self, sentences, vocab, tokenizer):
        self.sentences = sentences
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return sentence ``idx`` as a 1-D ``torch.long`` tensor of indices."""
        tokens = self.tokenizer(self.sentences[idx])
        indices = [self.vocab[token] for token in tokens]
        return torch.tensor(indices, dtype=torch.long)


# 5. Create the Dataset instances
train_dataset = TextDataset(train_sentences, vocab, tokenizer)
test_dataset = TextDataset(test_sentences, vocab, tokenizer)


# 6. DataLoader and padding collate function
def collate_fn(batch):
    """Pad a batch of index tensors into a single batch-first tensor.

    Args:
        batch: A list of tensors, one per sentence.

    Returns:
        The result of ``pad_sequence`` with ``batch_first=True``, using the
        index of ``'<pad>'`` in the module-level ``vocab`` as the fill value.
    """
    # batch is a list of tensors
    return pad_sequence(batch, batch_first=True, padding_value=vocab['<pad>'])


train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 7. Check the DataLoader output
print("\n=== Train Batch Indices ===")
for batch in train_loader:
    print(batch)
    break  # only the first batch is shown

print("\n=== Test Batch Indices ===")
for batch in test_loader:
    print(batch)
    break  # only the first batch is shown

=== Train Batch Indices ===
tensor([[11, 20,  4, 18, 12,  5, 17,  9,  7, 19,  2],
        [ 3, 16,  6, 10, 13, 15,  3, 14,  8,  2,  1]])

=== Test Batch Indices ===
tensor([[ 3, 16,  6, 10, 13, 15,  3, 14,  8,  2,  1],
        [11, 20,  4, 18, 12,  5, 17,  9,  7, 19,  2]])

4. DataLoader(示例2:带标签文本)

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# 1. Create the tokenizer
tokenizer = get_tokenizer('basic_english')

# 2. Labelled training and test data: (sentence, label)
train_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),  # positive sentiment
    ("Hello world! This is a test for building vocabulary.", 0),  # negative sentiment
    # More (sentence, label) pairs can be added here
]
test_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),
    ("Hello world! This is a test for building vocabulary.", 0),
]

# 3. Build the vocabulary from the sentences of the training data only
vocab = build_vocab_from_iterator(
    (tokenizer(sentence) for sentence, _ in train_data),
    specials=['<unk>', '<pad>'],
    min_freq=1,
)
# Use the index of '<unk>' as the default index of the vocabulary.
vocab.set_default_index(vocab['<unk>'])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])

# 4. Custom Dataset returning (indices_tensor, label_tensor)
class TextDataset(Dataset):
    """Dataset of labelled sentences encoded as vocabulary indices.

    Args:
        data: Indexable collection of ``(sentence, label)`` pairs.
        vocab: Mapping-like object; ``vocab[token]`` must return an int index.
        tokenizer: Callable that splits one sentence into a list of tokens.
    """

    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of ``(sentence, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for item ``idx``, both ``torch.long`` tensors."""
        sentence, label = self.data[idx]
        tokens = self.tokenizer(sentence)
        indices = [self.vocab[token] for token in tokens]
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )


# 5. Padding and collate_fn
def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into a padded batch.

    Args:
        batch: A list of ``(indices_tensor, label_tensor)`` pairs.

    Returns:
        A tuple ``(padded_seqs, labels_tensor)``: the sequences padded with
        ``pad_sequence(batch_first=True)`` using the index of ``'<pad>'`` in
        the module-level ``vocab``, and the labels stacked with ``torch.stack``.
    """
    # Transpose the list of pairs into a tuple of sequences and a tuple of labels.
    sequences, labels = zip(*batch)
    padded_seqs = pad_sequence(sequences, batch_first=True, padding_value=vocab['<pad>'])
    labels_tensor = torch.stack(labels)
    return padded_seqs, labels_tensor


# 6. Create the DataLoaders
train_dataset = TextDataset(train_data, vocab, tokenizer)
test_dataset = TextDataset(test_data, vocab, tokenizer)

# Training batches are shuffled; test batches keep the original order.
train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn
)

# 7. Test the output
print("\n=== Train Batch ===")
for seq_batch, label_batch in train_loader:
    print("Sequences:", seq_batch)
    print("Labels:   ", label_batch)
    break  # only the first batch is shown

print("\n=== Test Batch ===")
for seq_batch, label_batch in test_loader:
    print("Sequences:", seq_batch)
    print("Labels:   ", label_batch)
    break  # only the first batch is shown

http://www.xdnf.cn/news/11842.html

相关文章:

  • video-audio-extractor【源码版】
  • 出口合规管理
  • 在 Android Studio 中使用 GitLab 添加图片到 README.md
  • 【免费数据】1980-2022年中国2384个站点的水质数据
  • Attention Is All You Need:抛弃循环神经网络的时代来了!
  • Gateway 搭建
  • AD四层板的层叠设计
  • window 显示驱动开发-提供视频解码功能(三)
  • 飞算JavaAI 炫技赛重磅回归!用智能编码攻克老项目重构难题
  • oracle从表B更新拼接字段到表A
  • 链表题解——环形链表【LeetCode】
  • MySQL 索引:为何使用 B+树作为索引数据结构,而非 B树、哈希表或二叉树?
  • mysql 悲观锁和乐观锁(—悲观锁)
  • MySQL 关联查询速查笔记
  • MySQL 事务深度解析:面试核心知识点与实战
  • nginx配置
  • 机器学习基础相关问题
  • vue2 项目中 npm run dev 运行98% after emitting CopyPlugin 卡死
  • QT聊天项目DAY13
  • 掌握 MotionLayout:交互动画开发
  • 用户 xxx is not in the sudoers file.
  • 基于Gemini 2.5 Pro打造的AI智能体CanvasX上线,绘制常见图表(折线图、柱状图等),国内直接使用
  • FreeCAD:开源世界的三维建模利器
  • (每日一道算法题)求根节点到叶节点数字之和
  • HTML基础学习
  • MYSQL之表的内连和外连
  • ABP-Book Store Application中文讲解 - Part 8: Authors: Application Layer
  • 解决Java项目NoProviderFoundException报错
  • C++课设:银行账户管理系统
  • 【Golang笔记04】Go语言中文件操作的学习笔记