from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk

# Download the required resources (only needed on the first run)
# Original note: needed for BLEU's tokenizer, etc.
# NOTE(review): nothing in the visible part of this file references WordNet
# directly -- confirm whether this download is actually required.
nltk.download('wordnet')
nltk.download('punkt')  # Tokenizer data used for tokenizing (nltk.word_tokenize)
# NOTE(review): recent NLTK releases reportedly load 'punkt_tab' instead of
# 'punkt' for word_tokenize -- confirm against the installed NLTK version.

# ROUGE variants reported by batch_rouge, in output order.
_ROUGE_TYPES = ('rouge1', 'rouge2', 'rougeL')


def _check_same_length(references, candidates):
    """Raise ValueError unless both lists hold the same number of samples."""
    if len(references) != len(candidates):
        raise ValueError(
            "references and candidates must have the same length, "
            f"got {len(references)} and {len(candidates)}"
        )


def batch_bleu(references, candidates):
    """Compute the corpus-level BLEU-4 score for a batch.

    The score is produced by a single ``corpus_bleu`` call over the whole
    batch (n-gram statistics are pooled across samples), so it is not the
    mean of per-sentence BLEU scores.

    Args:
        references: List of lists of reference sentences (each item is the
            list of reference strings for one sample).
        candidates: List of candidate sentences (model outputs), one per
            sample.

    Returns:
        float: Corpus-level BLEU-4 score, or 0.0 for an empty batch.

    Raises:
        ValueError: If ``references`` and ``candidates`` differ in length.
    """
    references = list(references)
    candidates = list(candidates)
    _check_same_length(references, candidates)
    if not candidates:
        # Nothing to score; avoid handing an empty corpus to corpus_bleu.
        return 0.0

    smoothing = SmoothingFunction()

    # Tokenize every reference sentence of every sample.
    tokenized_references = [
        [nltk.word_tokenize(sent) for sent in ref] for ref in references
    ]
    # Tokenize every candidate sentence.
    tokenized_candidates = [nltk.word_tokenize(sent) for sent in candidates]

    # Equal weights over 1- to 4-grams give BLEU-4; method1 smoothing keeps
    # short sentences with missing higher-order n-grams from scoring zero.
    return corpus_bleu(
        tokenized_references,
        tokenized_candidates,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing.method1,
    )


def batch_rouge(references, candidates):
    """Compute batch-averaged ROUGE F1 scores (ROUGE-1, ROUGE-2, ROUGE-L).

    Args:
        references: List of reference sentences (one reference per sample).
        candidates: List of candidate sentences, one per sample.

    Returns:
        dict: ``{'rouge1': f1, 'rouge2': f1, 'rougeL': f1}`` where each value
        is the mean F-measure over the batch (0.0 for an empty batch).

    Raises:
        ValueError: If ``references`` and ``candidates`` differ in length.
    """
    references = list(references)
    candidates = list(candidates)
    # zip() would silently drop the unmatched tail, so reject mismatches.
    _check_same_length(references, candidates)
    if not candidates:
        # Avoid dividing by zero when averaging an empty batch.
        return {name: 0.0 for name in _ROUGE_TYPES}

    scorer = rouge_scorer.RougeScorer(list(_ROUGE_TYPES), use_stemmer=True)
    scores = {name: [] for name in _ROUGE_TYPES}

    for ref, cand in zip(references, candidates):
        # RougeScorer.score takes (target, prediction) in that order.
        score = scorer.score(ref, cand)
        for name in _ROUGE_TYPES:
            scores[name].append(score[name].fmeasure)

    return {name: sum(values) / len(values) for name, values in scores.items()}


def evaluate_all(references, candidates):
    """Compute BLEU and ROUGE for a batch in one call.

    Args:
        references: List of reference sentences (one reference per sample).
        candidates: List of candidate sentences, one per sample.

    Returns:
        dict: Scores rounded to 4 decimals under the keys ``'BLEU'``,
        ``'ROUGE-1'``, ``'ROUGE-2'`` and ``'ROUGE-L'``.

    Raises:
        ValueError: If ``references`` and ``candidates`` differ in length.
    """
    # Materialize once: both metrics iterate the inputs, so a one-shot
    # iterable would otherwise be exhausted before the ROUGE pass.
    references = list(references)
    candidates = list(candidates)

    # batch_bleu expects a list of references per sample; wrap each single one.
    bleu = batch_bleu([[ref] for ref in references], candidates)
    rouge = batch_rouge(references, candidates)

    return {
        'BLEU': round(bleu, 4),
        'ROUGE-1': round(rouge['rouge1'], 4),
        'ROUGE-2': round(rouge['rouge2'], 4),
        'ROUGE-L': round(rouge['rougeL'], 4),
    }
# Test code
# Guarded so that importing this module for its metric functions does not
# run the demo; running the file as a script behaves as before.
if __name__ == "__main__":
    # Sample data: batched input
    references = [
        "the cat is on the mat",
        "a dog is playing in the garden",
    ]
    candidates = [
        "the cat sat on the mat",
        "a dog plays in the garden",
    ]

    # Call the evaluation function
    results = evaluate_all(references, candidates)
    print("Evaluation Results:", results)