当前位置：首页 > java >正文

【人工智能-agent】--Dify中MCP工具存数据到MySQL

java 2025/8/30 4:08:15

本文记录的工作如下：

自定义MCP工具，爬取我的钢铁网数据
爬取的数据插值处理
自定义MCP工具，把爬取到的数据（str）存入本地excel表格中
自定义MCP工具，把爬取到的数据（str）存入本地MySQL数据库中
搭建MCP-server
使用Dify调用MCP工具，实现自动化爬取和存入

1.工具--爬取数据

2.工具--保存到excel

3.工具--保存到MySQL中

4.搭建MCP-server

5.Dify调用MCP工具

1.工具--爬取数据

async def fetch_website(data_type: str,Start_Time: str,End_Time: str,
)-> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:from selenium import webdriverfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.edge.service import Servicefrom selenium.webdriver.edge.options import Optionsfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.common.action_chains import ActionChains# 设置Edge浏览器选项# try:# except Exception as e:#     print("Edge浏览器启动失败，请检查驱动是否正确安装！")#     print(e)all_data_dist = {"综合": "GANGCAIZONGHE","长材": "CHANGCAI","扁平": "BIANPING","一次材": "YICICAI","华东": "HUADONG","华南": "HUANAN","华北": "HUABEI","中南": "ZHONGNAN","东北": "DONGBEI","西南": "XINAN","西北": "XIBEI","螺纹": "LUOWEN","线材": "XIANCAI","型材": "XINCAI","中厚": "ZHONGHOU","锅炉容器板": "GUOLURONGQIBAN","造船板": "ZAOCHUANBAN","热卷": "REJUAN","窄带": "ZAIDAI","冷板": "LENGBAN","镀锌板卷": "DUXIN","无缝管": "WUFENGGUAN","聊城无缝钢管": "WUFENG_LIAOCHENG","焊管": "HANGUAN","盘扣式钢管脚手架": "PKSJSJ"}print("开始爬取数据")print("普钢的所有数据类型：" + str(all_data_dist.keys()))# Start_Time = input("请输入开始日期（格式：2024-01-01）：") or "2024-04-01"Start_Time_year = Start_Time.split("-")[0]Start_Time_month = Start_Time.split("-")[1]Start_Time_day = Start_Time.split("-")[2]# day格式转换02需要去掉前导0Start_Time_day = str(int(Start_Time_day))print(f"开始日期：{Start_Time_year}-{Start_Time_month}-{Start_Time_day}")# End_Time = input("请输入结束日期（格式：2025-04-01）：") or "2025-04-01"End_Time_year = End_Time.split("-")[0]End_Time_month = End_Time.split("-")[1]End_Time_day = End_Time.split("-")[2]# day格式转换02需要去掉前导0End_Time_day = str(int(End_Time_day))print(f"结束日期：{End_Time_year}-{End_Time_month}-{End_Time_day}")# data_type = input(#     "请输入需要爬取的数据类型：普钢['综合', '长材', '扁平', '一次材', '华东', '华南', '华北', '中南', '东北', '西南', '西北', '螺纹', '线材', '型材', '中厚', '锅炉容器板', '造船板', '热卷', '窄带', '冷板', '镀锌板卷', '无缝管', '聊城无缝钢管', '焊管', '盘扣式钢管脚手架']")data_type1 = all_data_dist[data_type]# 等待页面加载3s# time.sleep(3)# # element = driver.find_element(By.CLASS_NAME, "mRightBox")# # 等待页面加载3s-# time.sleep(3)edge_options = Options()edge_options.add_argument("--headless")  # 无头模式，不显示浏览器窗口edge_options.add_argument("--disable-gpu")edge_options.add_argument("--window-size=1920,1080")edge_service = Service('D:\桌面文件\edgedriver_win64\msedgedriver.exe')  # 替换为你的Edge驱动路径driver = webdriver.Edge(service=edge_service, options=edge_options)print("开始打开浏览器")url = "https://index.mysteel.com/xpic/detail.html?tabName=pugang"driver.get(url)time.sleep(3)driver.find_element(By.CSS_SELECTOR, "img.addBtn[src*='icon.png']").click()  # 点击展开print("点击展开")time.sleep(3)# print(element.text)try:# 点击类型key1 = driver.find_element(By.ID, data_type1)key1.click()# 等待页面加载3stime.sleep(1)# //*[@id="searchTimeLiDiv"]/ul/li[1]/a按日查询key2 = driver.find_element(By.XPATH, '//*[@id="searchTimeLiDiv"]/ul/li[1]/a')key2.click()time.sleep(1)# 起始日期//*[@id="startDay"]start_date = driver.find_element(By.XPATH, '//*[@id="startDay"]')# start_date.clear()start_date.click()time.sleep(1)driver.maximize_window()# # 解析年月日# target_date = "2021-09-01"# year, month, day = target_date.split('-')# # 等待日历面板加载# WebDriverWait(driver, 10).until(#     EC.presence_of_element_located((By.CSS_SELECTOR,  ".daterangepicker.dropdown-menu"))# )from selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import Select# 选择年份（如果页面有年份下拉框）year_dropdown = driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/table/thead/tr[1]/th[2]/select[2]")year_dropdown.click()time.sleep(1)print("选择年份下拉框")# 选择年份<option value="1975">1975</option># year_dropdown.find_element(By.XPATH,  f"//option[@value='{year}']").click()# print(f"选择年份：{year}")select = Select(year_dropdown)select.select_by_visible_text(Start_Time_year)  # 根据文本选择print(f"选择年份：{Start_Time_year}")# 输出# 选择月份  /html/body/div[3]/div[2]/div/table/thead/tr[1]/th[2]/select[1]month_dropdown = driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/table/thead/tr[1]/th[2]/select[1]")month_dropdown.click()time.sleep(1)print("选择月份下拉框")select1 = Select(month_dropdown)select1.select_by_visible_text(Start_Time_month)  # 根据文本选择print(f"选择月份：{Start_Time_month}")# 选择日期date_cell = driver.find_element(By.XPATH, f"//td[contains(@class, 'available') and text()='{Start_Time_day}']")date_cell.click()time.sleep(1)print(f"选择日期：{Start_Time_day}")except Exception as e:print(f"执行出错: {str(e)}")driver.save_screenshot('error.png')try:# //*[@id="searchTimeLiDiv"]/ul/li[1]/a按日查询key2 = driver.find_element(By.XPATH, '//*[@id="searchTimeLiDiv"]/ul/li[1]/a')key2.click()time.sleep(1)# 终止日期//*[@id="endDay"]end_date = driver.find_element(By.XPATH, '//*[@id="endDay"]')# start_date.clear()end_date.click()time.sleep(1)driver.maximize_window()# 解析年月日target_date = "2021-09-01"year, month, day = target_date.split('-')# # 等待日历面板加载# WebDriverWait(driver, 10).until(#     EC.presence_of_element_located((By.CSS_SELECTOR,  ".daterangepicker.dropdown-menu"))# )from selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import Select# 选择年份（如果页面有年份下拉框）year_end_dropdown = driver.find_element(By.XPATH,"/html/body/div[4]/div[2]/div/table/thead/tr[1]/th[2]/select[2]")year_end_dropdown.click()print("选择年份下拉框")# 选择年份<option value="1975">1975</option># year_dropdown.find_element(By.XPATH,  f"//option[@value='{year}']").click()# print(f"选择年份：{year}")select_year_end = Select(year_end_dropdown)select_year_end.select_by_visible_text(End_Time_year)  # 根据文本选择print(f"选择年份：{End_Time_year}")# 输出# 选择月份  /html/body/div[4]/div[2]/div/table/thead/tr[1]/th[2]/select[1]month_end_dropdown = driver.find_element(By.XPATH,"/html/body/div[4]/div[2]/div/table/thead/tr[1]/th[2]/select[1]")month_end_dropdown.click()print("选择月份下拉框")select_month_end = Select(month_end_dropdown)select1_month_end = select_month_end.select_by_visible_text(End_Time_month)  # 根据文本选择print(f"选择月份：{End_Time_month}")# /html/body/div[4]/div[2]# 找到右侧日历left_calendar = driver.find_element(By.XPATH,'/html/body/div[4]/div[2]')# 选择日期# left_calendar = driver.find_element(By.CLASS_SELECTOR, "calendar single left")# driver.find_element(By.XPATH, "//td[contains(@class, 'available') and text()='2']")date_end_cell = left_calendar.find_element(By.XPATH,f'.//td[text()={End_Time_day}]')date_end_cell.click()driver.save_screenshot('C:\pythonProject\python爬虫\我的钢铁网\end_date.png')print(f"选择日期：{End_Time_day}")except Exception as e:print(f"执行出错: {str(e)}")driver.save_screenshot('error.png')# 点击搜索按钮search_btn = driver.find_element(By.XPATH, '//*[@id="dome1"]/table/tbody/tr/td[5]/img')search_btn.click()# driver.save_screenshot('C:\pythonProject\python爬虫\我的钢铁网\搜索之后.png')element = driver.find_element(By.CLASS_NAME, "mRightBox")# 等待页面加载3s-time.sleep(3)#保存截图driver.save_screenshot('C:\pythonProject\项目五（钢铁价格预测）\我的钢铁网\搜索之后.png')print(element.text)# data_str = element.text# driver.quit()return [types.TextContent(type="text", text=element.text)]# return data_str

2.工具--保存到excel

async def Save_To_Excel(data_str:str):# 把爬取的text数据保存到Excel文档中import pandas as pdfrom openpyxl import load_workbookimport osimport re# 示例数据准备（替换为你的实际数据）# 按行拆分数据并转换为列表lines = [line.split() for line in data_str.strip().split('\n')]# 工作簿名称，去掉空格sheet_name = str(lines[0]).strip()sheet_name =  sheet_namesheet_name = re.sub(r'[^\u4e00-\u9fa5]', '', sheet_name)header = lines[1]  # 假设第一行是表头rows = lines[2:]  # 数据行df = pd.DataFrame(rows, columns=header)'''数据插值，然后存入表格'''print(df.columns.tolist())  # 查看所有列名df['时间'] = pd.to_datetime(df['时间'], format='%Y/%m/%d')  # 确保日期为datetime类型# 2. 创建完整日期范围（从数据最早日期到最晚日期）date_range = pd.date_range(start=df['时间'].min(),end=df['时间'].max(),freq='D')# 3. 重新索引并保留原始数据df = df.set_index('时间').reindex(date_range).rename_axis('时间').reset_index()# 4. 定义需要插值的数值列（根据实际Excel列名调整）numeric_cols = ['本日', '昨日', '日环比', '上周', '周环比','上月度', '与上月比', '去年同期', '与去年比']# 5. 线性插值填充（取前后值的平均值）for col in numeric_cols:if col in df.columns:if col in ['日环比', '周环比', '与上月比', '与去年比']:df[col] = df[col].str.rstrip('%').astype(float) / 100# 2. 线性插值df[col] = df[col].interpolate(method='linear')# 3. 还原为百分比字符串df[col] = (df[col] * 100).round(2).astype(str) + '%'else:# 2. 线性插值,保留两位小数# df[col] = df[col].astype(float).interpolate(method='linear', limit_direction='both', limit=2)df[col] = pd.to_numeric(df[col], errors='coerce')  # 非数值转为NaNdf[col] = df[col].interpolate(method='linear').round(2)# 6. 格式化日期为YYYY/MM/DDdf['时间'] = df['时间'].dt.strftime('%Y/%m/%d')from openpyxl.styles import Alignment# 目标文件路径（注意Windows路径要用双反斜杠或原始字符串）file_path = r"C:\pythonProject\项目五（钢铁价格预测）\我的钢铁网\gyp4.xlsx"# 核心逻辑：追加或新建if os.path.exists(file_path):# 追加模式（加载现有工作簿）with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:book = load_workbook(file_path)if sheet_name in book.sheetnames:# # 读取现有数据并合并# existing_df = pd.read_excel(file_path,  sheet_name=sheet_name)# combined_df = pd.concat([existing_df,  new_df], ignore_index=True)combined_df = df# 移除旧表（为了覆盖写入）book.remove(book[sheet_name])else:combined_df = df# 写入合并后的数据combined_df.to_excel(writer, index=False, sheet_name=sheet_name)# 调整列宽worksheet = writer.sheets[sheet_name]for col in worksheet.columns:max_length = max(len(str(cell.value)) for cell in col)worksheet.column_dimensions[col[0].column_letter].width = max_length + 5for cell in col:cell.alignment = Alignment(horizontal='left')  # 左对else:# 新建模式with pd.ExcelWriter(file_path, engine='openpyxl') as writer:df.to_excel(writer, index=False, sheet_name=sheet_name)# 调整列宽worksheet = writer.sheets[sheet_name]for col in worksheet.columns:max_length = max(len(str(cell.value)) for cell in col)worksheet.column_dimensions[col[0].column_letter].width = max_length + 5for cell in col:cell.alignment = Alignment(horizontal='left')  # 左对print(f"数据已保存到：{file_path}")return [types.TextContent(type="text", text=f"数据已保存到：{file_path}")]

3.工具--保存到MySQL中

async  def Save_To_MySQL(data_str:str):# 把爬取的text数据保存到MySQL数据库中import pandas as pdfrom openpyxl import load_workbookimport osimport re# 示例数据准备（替换为你的实际数据）Small_type1 = '综合'# 按行拆分数据并转换为列表lines = [line.split() for line in data_str.strip().split('\n')]# 工作簿名称，去掉空格sheet_name = str(lines[0]).strip()# sheet_name = Large_type1 + '-' + Small_type1# sheet_name = re.sub(r'[^\u4e00-\u9fa5]',  '', sheet_name)header = lines[1]  # 假设第一行是表头rows = lines[2:]  # 数据行df = pd.DataFrame(rows, columns=header)'''数据插值，然后存入表格'''print(df.columns.tolist())  # 查看所有列名df['时间'] = pd.to_datetime(df['时间'], format='%Y/%m/%d')  # 确保日期为datetime类型# 2. 创建完整日期范围（从数据最早日期到最晚日期）date_range = pd.date_range(start=df['时间'].min(),end=df['时间'].max(),freq='D')# 3. 重新索引并保留原始数据df = df.set_index('时间').reindex(date_range).rename_axis('时间').reset_index()# 4. 定义需要插值的数值列（根据实际Excel列名调整）numeric_cols = ['本日', '昨日', '日环比', '上周', '周环比','上月度', '与上月比', '去年同期', '与去年比']# 5. 线性插值填充（取前后值的平均值）for col in numeric_cols:if col in df.columns:if col in ['日环比', '周环比', '与上月比', '与去年比']:df[col] = df[col].str.rstrip('%').astype(float) / 100# 2. 线性插值df[col] = df[col].interpolate(method='linear')# 3. 还原为百分比字符串df[col] = (df[col] * 100).round(2).astype(str) + '%'else:# 2. 线性插值,保留两位小数# df[col] = df[col].astype(float).interpolate(method='linear', limit_direction='both', limit=2)df[col] = pd.to_numeric(df[col], errors='coerce')  # 非数值转为NaNdf[col] = df[col].interpolate(method='linear').round(2)# 6. 格式化日期为YYYY/MM/DDdf['时间'] = df['时间'].dt.strftime('%Y/%m/%d')print("插值成功")df.insert(0, '类型', sheet_name)print("第一列插入类型成功")import pandas as pdfrom sqlalchemy import create_engine, text, Datefrom sqlalchemy.exc import SQLAlchemyError# MySQL连接配置（替换为你的实际配置）db_config = {'host': 'localhost','user': 'root','password': 'root111111','database': 'gyp_test','port': 3306}try:# 创建SQLAlchemy引擎engine = create_engine(f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4")# 检查表是否存在，不存在则创建with engine.connect() as conn:table_exists = conn.execute(text(f"SHOW TABLES LIKE '原材料指数表'")).fetchone()if not table_exists:create_table_sql = """CREATE TABLE `原材料指数表` (`id` INT AUTO_INCREMENT PRIMARY KEY,`类型` VARCHAR(50) NOT NULL,`时间` DATE NOT NULL,`本日` DECIMAL(10,2),`昨日` DECIMAL(10,2),`日环比` VARCHAR(20),`上周` DECIMAL(10,2),`周环比` VARCHAR(20),`上月度` DECIMAL(10,2),`与上月比` VARCHAR(20),`去年同期` DECIMAL(10,2),`与去年比` VARCHAR(20),INDEX (`时间`),INDEX (`类型`)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""conn.execute(text(create_table_sql))print("表'原材料指数表'创建成功")# 将DataFrame写入数据库df.to_sql(name='原材料指数表',con=engine,if_exists='append',  # 追加模式index=False,dtype={'时间': Date  # 确保日期格式正确})print("数据插入成功")return [types.TextContent(type="text", text="数据插入MySQL数据库成功")]except SQLAlchemyError as e:print(f"数据库操作出错: {e}")except Exception as e:print(f"发生错误: {e}")finally:engine.dispose()

4.搭建MCP-server

from math import pi
@click.command()
@click.option("--port", default=8000, help="Port to listen on for SSE")
@click.option("--transport",type=click.Choice(["stdio", "sse"]),default="stdio",help="Transport type",
)
def main(port: int, transport: str) -> int:app = Server("mcp-website-fetcher")@app.call_tool()async def call_tool(name: str, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:# if name != "fetch":#     raise ValueError(f"Unknown tool: {name}")# if "url" not in arguments:#     raise ValueError("Missing required argument 'url'")if name=="Web_crawling":return await fetch_website(arguments["data_type"], arguments["Start_Time"], arguments["End_Time"])elif name=="add_gyp":A = arguments["A"]B = arguments["B"]return [types.TextContent(type="text", text=str(A+B+pi))]elif name=="save_to_excel":return await Save_To_Excel(arguments["data_str"])elif name=="save_to_mysql":return await Save_To_MySQL(arguments["data_str"])#普钢：类型为综合，2025.1.1-2025.5.6的数据@app.list_tools()async def list_tools() -> list[types.Tool]:return [types.Tool(name="Web_crawling",description="抓取网页数据，需要输入抓取钢材的类型，开始时间，结束时间三个参数",inputSchema={"type": "object","required": ["data_type", "Start_Time", "End_Time"],"properties":{"data_type": {"type": "string", "description":"普钢['综合', '长材', '扁平', '一次材', '螺纹', '线材', '型材', '中厚', '锅炉容器板', '造船板', '热卷', '窄带', '冷板', '镀锌板卷', '无缝管', '聊城无缝钢管', '焊管', '盘扣式钢管脚手架'],""特钢['综合', '特钢'],""铁矿石['综合','进口矿','国产矿'],""焦炭['综合']"},"Start_Time": {"type": "string", "format": "date", "description": "开始时间（YYYY-MM-DD）"},"End_Time": {"type": "string", "format": "date", "description": "结束时间（YYYY-MM-DD）"},},},),types.Tool(name="add_gyp",description="这是一个计算器工具",inputSchema={"type": "object","required": ["A", "B"],"properties":{"A": {"type": "number", "description": "第一个数"},"B": {"type": "number", "description": "第二个数"},},},),types.Tool(name="save_to_excel",description="保存爬取的数据到本地Excel文档当中",inputSchema={"type": "object","required": ["data_str"],"properties":{"data_str": {"type": "string", "description": "爬取的数据"},},},),types.Tool(name="save_to_mysql",description="保存爬取的数据到MySQL数据库中",inputSchema={"type": "object","required": ["data_str"],"properties":{"data_str": {"type": "string", "description": "爬取的数据"},},},)]# @app.list_resources()# async def list_resources() -> list[types.Resource]:#     return [#         types.Resource(#             uri=FileUrl(r"D:\桌面文件\test-知识库.xlsx"),#             name='student_grade',#             description="这是一个成绩单",#             mimeType="text/plain",#         )#     ]# @app.read_resource()# async def read_resource(name: str,uri:FileUrl) -> str | bytes:#     if name=="student_grade":#         with open(uri.path, "rb") as f:#             return f.read()if transport == "sse":from mcp.server.sse import SseServerTransportfrom starlette.applications import Starlettefrom starlette.responses import Responsefrom starlette.routing import Mount, Routesse = SseServerTransport("/messages/")async def handle_sse(request):async with sse.connect_sse(request.scope, request.receive, request._send) as streams:await app.run(streams[0], streams[1], app.create_initialization_options())return Response()starlette_app = Starlette(debug=True,routes=[Route("/sse", endpoint=handle_sse, methods=["GET"]),Mount("/messages/", app=sse.handle_post_message),],)import uvicornuvicorn.run(starlette_app, host="0.0.0.0", port=port)else:from mcp.server.stdio import stdio_serverasync def arun():async with stdio_server() as streams:await app.run(streams[0], streams[1], app.create_initialization_options())anyio.run(arun)return 0

运行：uv run mcp_simple_tool --transport sse --port 8000

5.Dify调用MCP工具

你是一个智能助手，可根据用户输入的指令，进行推理并调用工具，完成任务后返回给用户结果。其中
server_name1为地图和天气服务，其中server_name2为搜索服务，server 3是·爬虫、保存数据服务。注意：1.必须使用mcp_sse_list_tools工具列出可以调用的工具。2.必须使用mcp_sse_call_tool调用合适的工具完成用户的需求。

1.查询：普钢：类型为综合，2025.1.1-2025.5.6的数据

2.保存到本机数据库中，MySQL

完整代码：

import anyio
import click
import httpx
import mcp.types as types
from mcp.server.lowlevel import Server
import pandas as pd
import time
from pydantic import FileUrl#启动服务端：
# muv run mcp_simple_tool --transport sse --port 8000# async def fetch_website(
#     url: str,
# ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
#     headers = {
#         "User-Agent": "MCP Test Server (github.com/modelcontextprotocol/python-sdk)"
#     }
#     async with httpx.AsyncClient(follow_redirects=True, headers=headers) as client:
#         response = await client.get(url)
#         response.raise_for_status()
#         return [types.TextContent(type="text", text=response.text)]async def fetch_website(data_type: str,Start_Time: str,End_Time: str,
)-> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:from selenium import webdriverfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.edge.service import Servicefrom selenium.webdriver.edge.options import Optionsfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as ECfrom selenium.webdriver.common.action_chains import ActionChains# 设置Edge浏览器选项# try:# except Exception as e:#     print("Edge浏览器启动失败，请检查驱动是否正确安装！")#     print(e)all_data_dist = {"综合": "GANGCAIZONGHE","长材": "CHANGCAI","扁平": "BIANPING","一次材": "YICICAI","华东": "HUADONG","华南": "HUANAN","华北": "HUABEI","中南": "ZHONGNAN","东北": "DONGBEI","西南": "XINAN","西北": "XIBEI","螺纹": "LUOWEN","线材": "XIANCAI","型材": "XINCAI","中厚": "ZHONGHOU","锅炉容器板": "GUOLURONGQIBAN","造船板": "ZAOCHUANBAN","热卷": "REJUAN","窄带": "ZAIDAI","冷板": "LENGBAN","镀锌板卷": "DUXIN","无缝管": "WUFENGGUAN","聊城无缝钢管": "WUFENG_LIAOCHENG","焊管": "HANGUAN","盘扣式钢管脚手架": "PKSJSJ"}print("开始爬取数据")print("普钢的所有数据类型：" + str(all_data_dist.keys()))# Start_Time = input("请输入开始日期（格式：2024-01-01）：") or "2024-04-01"Start_Time_year = Start_Time.split("-")[0]Start_Time_month = Start_Time.split("-")[1]Start_Time_day = Start_Time.split("-")[2]# day格式转换02需要去掉前导0Start_Time_day = str(int(Start_Time_day))print(f"开始日期：{Start_Time_year}-{Start_Time_month}-{Start_Time_day}")# End_Time = input("请输入结束日期（格式：2025-04-01）：") or "2025-04-01"End_Time_year = End_Time.split("-")[0]End_Time_month = End_Time.split("-")[1]End_Time_day = End_Time.split("-")[2]# day格式转换02需要去掉前导0End_Time_day = str(int(End_Time_day))print(f"结束日期：{End_Time_year}-{End_Time_month}-{End_Time_day}")# data_type = input(#     "请输入需要爬取的数据类型：普钢['综合', '长材', '扁平', '一次材', '华东', '华南', '华北', '中南', '东北', '西南', '西北', '螺纹', '线材', '型材', '中厚', '锅炉容器板', '造船板', '热卷', '窄带', '冷板', '镀锌板卷', '无缝管', '聊城无缝钢管', '焊管', '盘扣式钢管脚手架']")data_type1 = all_data_dist[data_type]# 等待页面加载3s# time.sleep(3)# # element = driver.find_element(By.CLASS_NAME, "mRightBox")# # 等待页面加载3s-# time.sleep(3)edge_options = Options()edge_options.add_argument("--headless")  # 无头模式，不显示浏览器窗口edge_options.add_argument("--disable-gpu")edge_options.add_argument("--window-size=1920,1080")edge_service = Service('D:\桌面文件\edgedriver_win64\msedgedriver.exe')  # 替换为你的Edge驱动路径driver = webdriver.Edge(service=edge_service, options=edge_options)print("开始打开浏览器")url = "https://index.mysteel.com/xpic/detail.html?tabName=pugang"driver.get(url)time.sleep(3)driver.find_element(By.CSS_SELECTOR, "img.addBtn[src*='icon.png']").click()  # 点击展开print("点击展开")time.sleep(3)# print(element.text)try:# 点击类型key1 = driver.find_element(By.ID, data_type1)key1.click()# 等待页面加载3stime.sleep(1)# //*[@id="searchTimeLiDiv"]/ul/li[1]/a按日查询key2 = driver.find_element(By.XPATH, '//*[@id="searchTimeLiDiv"]/ul/li[1]/a')key2.click()time.sleep(1)# 起始日期//*[@id="startDay"]start_date = driver.find_element(By.XPATH, '//*[@id="startDay"]')# start_date.clear()start_date.click()time.sleep(1)driver.maximize_window()# # 解析年月日# target_date = "2021-09-01"# year, month, day = target_date.split('-')# # 等待日历面板加载# WebDriverWait(driver, 10).until(#     EC.presence_of_element_located((By.CSS_SELECTOR,  ".daterangepicker.dropdown-menu"))# )from selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import Select# 选择年份（如果页面有年份下拉框）year_dropdown = driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/table/thead/tr[1]/th[2]/select[2]")year_dropdown.click()time.sleep(1)print("选择年份下拉框")# 选择年份<option value="1975">1975</option># year_dropdown.find_element(By.XPATH,  f"//option[@value='{year}']").click()# print(f"选择年份：{year}")select = Select(year_dropdown)select.select_by_visible_text(Start_Time_year)  # 根据文本选择print(f"选择年份：{Start_Time_year}")# 输出# 选择月份  /html/body/div[3]/div[2]/div/table/thead/tr[1]/th[2]/select[1]month_dropdown = driver.find_element(By.XPATH, "/html/body/div[3]/div[2]/div/table/thead/tr[1]/th[2]/select[1]")month_dropdown.click()time.sleep(1)print("选择月份下拉框")select1 = Select(month_dropdown)select1.select_by_visible_text(Start_Time_month)  # 根据文本选择print(f"选择月份：{Start_Time_month}")# 选择日期date_cell = driver.find_element(By.XPATH, f"//td[contains(@class, 'available') and text()='{Start_Time_day}']")date_cell.click()time.sleep(1)print(f"选择日期：{Start_Time_day}")except Exception as e:print(f"执行出错: {str(e)}")driver.save_screenshot('error.png')try:# //*[@id="searchTimeLiDiv"]/ul/li[1]/a按日查询key2 = driver.find_element(By.XPATH, '//*[@id="searchTimeLiDiv"]/ul/li[1]/a')key2.click()time.sleep(1)# 终止日期//*[@id="endDay"]end_date = driver.find_element(By.XPATH, '//*[@id="endDay"]')# start_date.clear()end_date.click()time.sleep(1)driver.maximize_window()# 解析年月日target_date = "2021-09-01"year, month, day = target_date.split('-')# # 等待日历面板加载# WebDriverWait(driver, 10).until(#     EC.presence_of_element_located((By.CSS_SELECTOR,  ".daterangepicker.dropdown-menu"))# )from selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import Select# 选择年份（如果页面有年份下拉框）year_end_dropdown = driver.find_element(By.XPATH,"/html/body/div[4]/div[2]/div/table/thead/tr[1]/th[2]/select[2]")year_end_dropdown.click()print("选择年份下拉框")# 选择年份<option value="1975">1975</option># year_dropdown.find_element(By.XPATH,  f"//option[@value='{year}']").click()# print(f"选择年份：{year}")select_year_end = Select(year_end_dropdown)select_year_end.select_by_visible_text(End_Time_year)  # 根据文本选择print(f"选择年份：{End_Time_year}")# 输出# 选择月份  /html/body/div[4]/div[2]/div/table/thead/tr[1]/th[2]/select[1]month_end_dropdown = driver.find_element(By.XPATH,"/html/body/div[4]/div[2]/div/table/thead/tr[1]/th[2]/select[1]")month_end_dropdown.click()print("选择月份下拉框")select_month_end = Select(month_end_dropdown)select1_month_end = select_month_end.select_by_visible_text(End_Time_month)  # 根据文本选择print(f"选择月份：{End_Time_month}")# /html/body/div[4]/div[2]# 找到右侧日历left_calendar = driver.find_element(By.XPATH,'/html/body/div[4]/div[2]')# 选择日期# left_calendar = driver.find_element(By.CLASS_SELECTOR, "calendar single left")# driver.find_element(By.XPATH, "//td[contains(@class, 'available') and text()='2']")date_end_cell = left_calendar.find_element(By.XPATH,f'.//td[text()={End_Time_day}]')date_end_cell.click()driver.save_screenshot('C:\pythonProject\python爬虫\我的钢铁网\end_date.png')print(f"选择日期：{End_Time_day}")except Exception as e:print(f"执行出错: {str(e)}")driver.save_screenshot('error.png')# 点击搜索按钮search_btn = driver.find_element(By.XPATH, '//*[@id="dome1"]/table/tbody/tr/td[5]/img')search_btn.click()# driver.save_screenshot('C:\pythonProject\python爬虫\我的钢铁网\搜索之后.png')element = driver.find_element(By.CLASS_NAME, "mRightBox")# 等待页面加载3s-time.sleep(3)#保存截图driver.save_screenshot('C:\pythonProject\项目五（钢铁价格预测）\我的钢铁网\搜索之后.png')print(element.text)# data_str = element.text# driver.quit()return [types.TextContent(type="text", text=element.text)]# return data_strasync def Save_To_Excel(data_str:str):# 把爬取的text数据保存到Excel文档中import pandas as pdfrom openpyxl import load_workbookimport osimport re# 示例数据准备（替换为你的实际数据）# 按行拆分数据并转换为列表lines = [line.split() for line in data_str.strip().split('\n')]# 工作簿名称，去掉空格sheet_name = str(lines[0]).strip()sheet_name =  sheet_namesheet_name = re.sub(r'[^\u4e00-\u9fa5]', '', sheet_name)header = lines[1]  # 假设第一行是表头rows = lines[2:]  # 数据行df = pd.DataFrame(rows, columns=header)'''数据插值，然后存入表格'''print(df.columns.tolist())  # 查看所有列名df['时间'] = pd.to_datetime(df['时间'], format='%Y/%m/%d')  # 确保日期为datetime类型# 2. 创建完整日期范围（从数据最早日期到最晚日期）date_range = pd.date_range(start=df['时间'].min(),end=df['时间'].max(),freq='D')# 3. 重新索引并保留原始数据df = df.set_index('时间').reindex(date_range).rename_axis('时间').reset_index()# 4. 定义需要插值的数值列（根据实际Excel列名调整）numeric_cols = ['本日', '昨日', '日环比', '上周', '周环比','上月度', '与上月比', '去年同期', '与去年比']# 5. 线性插值填充（取前后值的平均值）for col in numeric_cols:if col in df.columns:if col in ['日环比', '周环比', '与上月比', '与去年比']:df[col] = df[col].str.rstrip('%').astype(float) / 100# 2. 线性插值df[col] = df[col].interpolate(method='linear')# 3. 还原为百分比字符串df[col] = (df[col] * 100).round(2).astype(str) + '%'else:# 2. 线性插值,保留两位小数# df[col] = df[col].astype(float).interpolate(method='linear', limit_direction='both', limit=2)df[col] = pd.to_numeric(df[col], errors='coerce')  # 非数值转为NaNdf[col] = df[col].interpolate(method='linear').round(2)# 6. 格式化日期为YYYY/MM/DDdf['时间'] = df['时间'].dt.strftime('%Y/%m/%d')from openpyxl.styles import Alignment# 目标文件路径（注意Windows路径要用双反斜杠或原始字符串）file_path = r"C:\pythonProject\项目五（钢铁价格预测）\我的钢铁网\gyp4.xlsx"# 核心逻辑：追加或新建if os.path.exists(file_path):# 追加模式（加载现有工作簿）with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:book = load_workbook(file_path)if sheet_name in book.sheetnames:# # 读取现有数据并合并# existing_df = pd.read_excel(file_path,  sheet_name=sheet_name)# combined_df = pd.concat([existing_df,  new_df], ignore_index=True)combined_df = df# 移除旧表（为了覆盖写入）book.remove(book[sheet_name])else:combined_df = df# 写入合并后的数据combined_df.to_excel(writer, index=False, sheet_name=sheet_name)# 调整列宽worksheet = writer.sheets[sheet_name]for col in worksheet.columns:max_length = max(len(str(cell.value)) for cell in col)worksheet.column_dimensions[col[0].column_letter].width = max_length + 5for cell in col:cell.alignment = Alignment(horizontal='left')  # 左对else:# 新建模式with pd.ExcelWriter(file_path, engine='openpyxl') as writer:df.to_excel(writer, index=False, sheet_name=sheet_name)# 调整列宽worksheet = writer.sheets[sheet_name]for col in worksheet.columns:max_length = max(len(str(cell.value)) for cell in col)worksheet.column_dimensions[col[0].column_letter].width = max_length + 5for cell in col:cell.alignment = Alignment(horizontal='left')  # 左对print(f"数据已保存到：{file_path}")return [types.TextContent(type="text", text=f"数据已保存到：{file_path}")]async  def Save_To_MySQL(data_str:str):# 把爬取的text数据保存到MySQL数据库中import pandas as pdfrom openpyxl import load_workbookimport osimport re# 示例数据准备（替换为你的实际数据）Small_type1 = '综合'# 按行拆分数据并转换为列表lines = [line.split() for line in data_str.strip().split('\n')]# 工作簿名称，去掉空格sheet_name = str(lines[0]).strip()# sheet_name = Large_type1 + '-' + Small_type1# sheet_name = re.sub(r'[^\u4e00-\u9fa5]',  '', sheet_name)header = lines[1]  # 假设第一行是表头rows = lines[2:]  # 数据行df = pd.DataFrame(rows, columns=header)'''数据插值，然后存入表格'''print(df.columns.tolist())  # 查看所有列名df['时间'] = pd.to_datetime(df['时间'], format='%Y/%m/%d')  # 确保日期为datetime类型# 2. 创建完整日期范围（从数据最早日期到最晚日期）date_range = pd.date_range(start=df['时间'].min(),end=df['时间'].max(),freq='D')# 3. 重新索引并保留原始数据df = df.set_index('时间').reindex(date_range).rename_axis('时间').reset_index()# 4. 定义需要插值的数值列（根据实际Excel列名调整）numeric_cols = ['本日', '昨日', '日环比', '上周', '周环比','上月度', '与上月比', '去年同期', '与去年比']# 5. 线性插值填充（取前后值的平均值）for col in numeric_cols:if col in df.columns:if col in ['日环比', '周环比', '与上月比', '与去年比']:df[col] = df[col].str.rstrip('%').astype(float) / 100# 2. 线性插值df[col] = df[col].interpolate(method='linear')# 3. 还原为百分比字符串df[col] = (df[col] * 100).round(2).astype(str) + '%'else:# 2. 线性插值,保留两位小数# df[col] = df[col].astype(float).interpolate(method='linear', limit_direction='both', limit=2)df[col] = pd.to_numeric(df[col], errors='coerce')  # 非数值转为NaNdf[col] = df[col].interpolate(method='linear').round(2)# 6. 格式化日期为YYYY/MM/DDdf['时间'] = df['时间'].dt.strftime('%Y/%m/%d')print("插值成功")df.insert(0, '类型', sheet_name)print("第一列插入类型成功")import pandas as pdfrom sqlalchemy import create_engine, text, Datefrom sqlalchemy.exc import SQLAlchemyError# MySQL连接配置（替换为你的实际配置）db_config = {'host': 'localhost','user': 'root','password': 'root111111','database': 'gyp_test','port': 3306}try:# 创建SQLAlchemy引擎engine = create_engine(f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}?charset=utf8mb4")# 检查表是否存在，不存在则创建with engine.connect() as conn:table_exists = conn.execute(text(f"SHOW TABLES LIKE '原材料指数表'")).fetchone()if not table_exists:create_table_sql = """CREATE TABLE `原材料指数表` (`id` INT AUTO_INCREMENT PRIMARY KEY,`类型` VARCHAR(50) NOT NULL,`时间` DATE NOT NULL,`本日` DECIMAL(10,2),`昨日` DECIMAL(10,2),`日环比` VARCHAR(20),`上周` DECIMAL(10,2),`周环比` VARCHAR(20),`上月度` DECIMAL(10,2),`与上月比` VARCHAR(20),`去年同期` DECIMAL(10,2),`与去年比` VARCHAR(20),INDEX (`时间`),INDEX (`类型`)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""conn.execute(text(create_table_sql))print("表'原材料指数表'创建成功")# 将DataFrame写入数据库df.to_sql(name='原材料指数表',con=engine,if_exists='append',  # 追加模式index=False,dtype={'时间': Date  # 确保日期格式正确})print("数据插入成功")return [types.TextContent(type="text", text="数据插入MySQL数据库成功")]except SQLAlchemyError as e:print(f"数据库操作出错: {e}")except Exception as e:print(f"发生错误: {e}")finally:engine.dispose()from math import pi
@click.command()
@click.option("--port", default=8000, help="Port to listen on for SSE")
@click.option("--transport",type=click.Choice(["stdio", "sse"]),default="stdio",help="Transport type",
)
def main(port: int, transport: str) -> int:app = Server("mcp-website-fetcher")@app.call_tool()async def call_tool(name: str, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:# if name != "fetch":#     raise ValueError(f"Unknown tool: {name}")# if "url" not in arguments:#     raise ValueError("Missing required argument 'url'")if name=="Web_crawling":return await fetch_website(arguments["data_type"], arguments["Start_Time"], arguments["End_Time"])elif name=="add_gyp":A = arguments["A"]B = arguments["B"]return [types.TextContent(type="text", text=str(A+B+pi))]elif name=="save_to_excel":return await Save_To_Excel(arguments["data_str"])elif name=="save_to_mysql":return await Save_To_MySQL(arguments["data_str"])#普钢：类型为综合，2025.1.1-2025.5.6的数据@app.list_tools()async def list_tools() -> list[types.Tool]:return [types.Tool(name="Web_crawling",description="抓取网页数据，需要输入抓取钢材的类型，开始时间，结束时间三个参数",inputSchema={"type": "object","required": ["data_type", "Start_Time", "End_Time"],"properties":{"data_type": {"type": "string", "description":"普钢['综合', '长材', '扁平', '一次材', '螺纹', '线材', '型材', '中厚', '锅炉容器板', '造船板', '热卷', '窄带', '冷板', '镀锌板卷', '无缝管', '聊城无缝钢管', '焊管', '盘扣式钢管脚手架'],""特钢['综合', '特钢'],""铁矿石['综合','进口矿','国产矿'],""焦炭['综合']"},"Start_Time": {"type": "string", "format": "date", "description": "开始时间（YYYY-MM-DD）"},"End_Time": {"type": "string", "format": "date", "description": "结束时间（YYYY-MM-DD）"},},},),types.Tool(name="add_gyp",description="这是一个计算器工具",inputSchema={"type": "object","required": ["A", "B"],"properties":{"A": {"type": "number", "description": "第一个数"},"B": {"type": "number", "description": "第二个数"},},},),types.Tool(name="save_to_excel",description="保存爬取的数据到本地Excel文档当中",inputSchema={"type": "object","required": ["data_str"],"properties":{"data_str": {"type": "string", "description": "爬取的数据"},},},),types.Tool(name="save_to_mysql",description="保存爬取的数据到MySQL数据库中",inputSchema={"type": "object","required": ["data_str"],"properties":{"data_str": {"type": "string", "description": "爬取的数据"},},},)]# @app.list_resources()# async def list_resources() -> list[types.Resource]:#     return [#         types.Resource(#             uri=FileUrl(r"D:\桌面文件\test-知识库.xlsx"),#             name='student_grade',#             description="这是一个成绩单",#             mimeType="text/plain",#         )#     ]# @app.read_resource()# async def read_resource(name: str,uri:FileUrl) -> str | bytes:#     if name=="student_grade":#         with open(uri.path, "rb") as f:#             return f.read()if transport == "sse":from mcp.server.sse import SseServerTransportfrom starlette.applications import Starlettefrom starlette.responses import Responsefrom starlette.routing import Mount, Routesse = SseServerTransport("/messages/")async def handle_sse(request):async with sse.connect_sse(request.scope, request.receive, request._send) as streams:await app.run(streams[0], streams[1], app.create_initialization_options())return Response()starlette_app = Starlette(debug=True,routes=[Route("/sse", endpoint=handle_sse, methods=["GET"]),Mount("/messages/", app=sse.handle_post_message),],)import uvicornuvicorn.run(starlette_app, host="0.0.0.0", port=port)else:from mcp.server.stdio import stdio_serverasync def arun():async with stdio_server() as streams:await app.run(streams[0], streams[1], app.create_initialization_options())anyio.run(arun)return 0

查看全文

http://www.xdnf.cn/news/5694.html