LangChain in Action (18): Building a ReAct-Style Web Content Summarization and Analysis Agent
This is the eighteenth installment of the LangChain in Action series. It walks through building an intelligent web content summarization and analysis agent based on the ReAct pattern. The agent can browse web pages on its own, extract key information, generate concise summaries, and perform deeper content analysis, making information gathering and comprehension far more efficient.
Introduction
In an era of information overload, we deal with large volumes of web content every day. Reading, understanding, and summarizing it by hand is time-consuming and makes it easy to miss important details. By combining LangChain's ReAct pattern with web search tools, we can build an intelligent agent that browses pages, extracts information, generates summaries, and performs deeper analysis on its own, greatly improving the efficiency of information processing.
Core Concepts: ReAct and Web Page Analysis
What Is the ReAct Pattern?
ReAct (Reason + Act) is a framework that lets an LLM reason and act autonomously in a loop (a minimal sketch follows the list below):
- Reason: the LLM analyzes the current situation and decides what action needs to be taken
- Act: the LLM selects and executes an appropriate tool or operation
- Observe: the LLM inspects the result of the action and decides what to do next
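To make the loop concrete, the sketch below wires up a ReAct agent using the classic LangChain API. It is only an illustration, not the agent we build later in this article: the model name, the SerpAPI search tool (which requires the SERPAPI_API_KEY configured below), and the sample question are assumptions chosen for demonstration.
from langchain.agents import initialize_agent, load_tools, AgentType
from langchain.chat_models import ChatOpenAI

# The LLM drives the Reason step; temperature 0 keeps the reasoning stable
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# A web search tool gives the agent something to Act with
tools = load_tools(["serpapi"])

# ZERO_SHOT_REACT_DESCRIPTION implements the Reason -> Act -> Observe loop
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True  # print each Thought / Action / Observation step
)

agent.run("What are the key ideas behind the ReAct prompting pattern?")
With verbose=True the agent prints its intermediate Thought, Action, and Observation lines, which is the ReAct loop made visible.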
Challenges of Web Content Analysis
- Content extraction: pulling the core text out of complex HTML
- Information filtering: identifying and discarding ads, navigation bars, and other irrelevant content
- Summarization: compressing the text while preserving the key information of the original (see the sketch after this list)
- Multi-page handling: dealing with related content spread across multiple pages
- Freshness: handling dynamically updated web content
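The summarization challenge in particular can be attacked with a cheap extractive pass before ever calling an LLM. The snippet below is a minimal sketch using the sumy library (installed in the next section); the LSA summarizer, the English tokenizer, and the quick_summary helper name are illustrative assumptions, and the tokenizer relies on the NLTK punkt data downloaded later in the article.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

def quick_summary(text: str, sentence_count: int = 3) -> str:
    """Return an extractive summary made of the top N sentences."""
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    sentences = summarizer(parser.document, sentence_count)
    return " ".join(str(sentence) for sentence in sentences)
Such an extractive pre-compression step is useful for shrinking very long pages before handing them to the LLM for a higher-quality abstractive summary.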
Environment Setup and Installation
First, install the required dependencies:
# Core libraries
pip install langchain openai python-dotenv

# HTTP requests and content extraction
pip install requests beautifulsoup4 newspaper3k

# Search engine tool
pip install google-search-results

# Async processing (asyncio is part of the Python standard library and needs no install)
pip install aiohttp

# Text processing
pip install nltk sumy

# Visualization (optional)
pip install matplotlib seaborn
Set the required environment variables:
export OPENAI_API_KEY="your-openai-api-key"
export SERPAPI_API_KEY="your-serpapi-key"  # used by the search engine tool
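Instead of exporting the keys in the shell, they can also live in a local .env file and be loaded with python-dotenv (installed above). A minimal sketch, assuming the keys are stored in a .env file next to the script:
import os
from dotenv import load_dotenv

# Read key=value pairs from .env into the process environment
load_dotenv()

assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY is not set"
assert os.getenv("SERPAPI_API_KEY"), "SERPAPI_API_KEY is not set"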
Building Web Content Extraction Tools
1. A Basic Web Content Extractor
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from urllib.parse import urlparse
from langchain.schema import Document
from typing import List, Dict, Any
import re


class WebContentExtractor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def extract_content(self, url: str) -> Dict[str, Any]:
        """Extract the main content of a web page."""
        try:
            # Extract the article with newspaper3k
            article = Article(url)
            article.download()
            article.parse()
            article.nlp()  # run NLP to populate summary and keywords

            # Fetch the raw page with requests/BeautifulSoup as a fallback source
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract metadata
            metadata = self._extract_metadata(soup, url)

            # Extract the main body text
            content = self._extract_main_content(article, soup)

            return {
                "url": url,
                "title": article.title or metadata.get("title", ""),
                "content": content,
                "summary": article.summary,
                "publish_date": article.publish_date or metadata.get("publish_date", ""),
                "authors": article.authors or metadata.get("authors", []),
                "keywords": article.keywords,
                "metadata": metadata,
                "success": True
            }
        except Exception as e:
            print(f"Failed to extract web content: {e}")
            return {
                "url": url,
                "success": False,
                "error": str(e)
            }

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
        """Extract page metadata."""
        metadata = {}

        # Title
        if soup.title:
            metadata["title"] = soup.title.string

        # Meta description
        meta_desc = soup.find("meta", property="og:description") or \
                    soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            metadata["description"] = meta_desc["content"]

        # Publish date: <meta> tags carry it in "content", <time> tags in "datetime"
        time_tag = soup.find("meta", property="article:published_time") or soup.find("time")
        if time_tag:
            date_value = time_tag.get("content") or time_tag.get("datetime")
            if date_value:
                metadata["publish_date"] = date_value

        # Author
        author_meta = soup.find("meta", property="article:author") or \
                      soup.find("meta", attrs={"name": "author"})
        if author_meta and author_meta.get("content"):
            metadata["authors"] = [author_meta["content"]]

        # Domain
        parsed_url = urlparse(url)
        metadata["domain"] = parsed_url.netloc

        return metadata

    def _extract_main_content(self, article, soup: BeautifulSoup) -> str:
        """Extract the main body text of the page."""
        # Prefer the newspaper3k result if it found enough text
        if article.text and len(article.text.strip()) > 100:
            return article.text

        # Fallback: heuristic extraction
        # Remove irrelevant elements
        for element in soup(["script", "style", "nav", "footer", "aside", "form"]):
            element.decompose()

        # Try common main-content containers
        main_content = ""
        content_selectors = [
            "article",
            "main",
            "[role='main']",
            ".content",
            ".main-content",
            ".post-content",
            ".entry-content"
        ]
        for selector in content_selectors:
            elements = soup.select(selector)
            if elements:
                main_content = "\n".join(
                    [elem.get_text(separator="\n", strip=True) for elem in elements]
                )
                if len(main_content) > 200:  # make sure we have enough content
                    break

        # If everything above failed, fall back to the full page text
        if not main_content or len(main_content) < 200:
            main_content = soup.get_text(separator="\n", strip=True)

        # Clean up the text
        main_content = re.sub(r'\n\s*\n', '\n\n', main_content)  # collapse extra blank lines

        return main_content

    def extract_multiple_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Extract content from multiple URLs."""
        results = []
        for url in urls:
            result = self.extract_content(url)
            results.append(result)
        return results

# Usage example
extractor = WebContentExtractor()
content = extractor.extract_content("https://example.com")
print(f"标题: {content['title']}")
print(f"内容长度: {len(content['content'])} 字符")
print(f"摘要: {content['summary'][:200]}...")
2. Advanced Content Processing and Cleaning
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import re

# Download the required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

class ContentProcessor:
    def __