技术实战:从零开发一个淘宝商品实时数据采集接口
在当今的电商时代,获取商品的实时数据对于市场分析、竞品监控和价格策略制定至关重要。本文将带您从零开始,开发一个淘宝商品实时数据采集接口,通过这个接口可以获取商品的基本信息、价格、销量等关键数据。
技术选型
要开发淘宝商品实时数据采集接口,我们将使用以下技术栈:
- Python:作为主要开发语言,简洁高效
- Requests:处理 HTTP 请求
- BeautifulSoup:解析 HTML 页面
- Flask:搭建 API 服务
- Redis:缓存数据,减轻服务器压力
实现思路
- 分析淘宝商品页面结构,确定需要采集的数据字段
- 编写爬虫代码,模拟浏览器请求并解析页面
- 实现数据缓存机制,避免频繁请求
- 搭建 API 服务,提供统一的数据访问接口
- 添加异常处理和反爬措施
代码实现
1. 项目结构
taobao-crawler/
├── app.py # Flask应用主文件
├── crawler.py # 爬虫核心逻辑
├── cache.py # 缓存相关操作
├── config.py # 配置文件
└── requirements.txt # 依赖包列表
2. 依赖安装
首先创建 requirements.txt 文件:
flask==2.0.1
requests==2.26.0
beautifulsoup4==4.10.0
lxml==4.9.3
redis==3.5.3
python-dotenv==0.19.0
fake_useragent==0.1.11
安装依赖:
pip install -r requirements.txt
3. 配置文件
import os
from dotenv import load_dotenv

# Load variables from an optional .env file into the process environment.
load_dotenv()

# Crawler settings
TIMEOUT = 10        # per-request timeout, in seconds
RETRY_TIMES = 3     # attempts before giving up on a page
CACHE_EXPIRE = 300  # cache TTL, in seconds

# Redis settings (overridable via environment variables)
REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_DB = int(os.getenv('REDIS_DB', 0))

# API settings
API_HOST = os.getenv('API_HOST', '0.0.0.0')
API_PORT = int(os.getenv('API_PORT', 5000))
4. 缓存模块
import redis
import json
from config import REDIS_HOST, REDIS_PORT, REDIS_DB, CACHE_EXPIRE

# Shared Redis connection; decode_responses=True makes GET return str, not bytes.
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    db=REDIS_DB,
    decode_responses=True
)


def get_cache(key):
    """Return the JSON-decoded value stored under *key*, or None on a miss."""
    raw = redis_client.get(key)
    return json.loads(raw) if raw else None


def set_cache(key, data, expire=CACHE_EXPIRE):
    """Store *data* as JSON under *key* with a TTL of *expire* seconds."""
    redis_client.setex(key, expire, json.dumps(data))


def delete_cache(key):
    """Remove *key* from the cache."""
    redis_client.delete(key)
5. 爬虫核心逻辑
import requests
from bs4 import BeautifulSoup
import re
import json
from fake_useragent import UserAgent
from config import TIMEOUT, RETRY_TIMES
from cache import get_cache, set_cache# 初始化UserAgent
ua = UserAgent()def get_taobao_item(item_id):"""获取淘宝商品信息:param item_id: 商品ID:return: 商品信息字典"""# 先从缓存获取cache_key = f"taobao:item:{item_id}"cached_data = get_cache(cache_key)if cached_data:return cached_data# 构建商品详情页URLurl = f"https://item.taobao.com/item.htm?id={item_id}"# 设置请求头,模拟浏览器headers = {"User-Agent": ua.random,"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3","Connection": "keep-alive","Upgrade-Insecure-Requests": "1"}# 发送请求,带重试机制for _ in range(RETRY_TIMES):try:response = requests.get(url, headers=headers, timeout=TIMEOUT)response.encoding = "gbk" # 淘宝页面通常使用gbk编码if response.status_code == 200:# 解析页面data = parse_item_page(response.text, item_id)if data:# 存入缓存set_cache(cache_key, data)return dataexcept Exception as e:print(f"获取商品信息失败: {str(e)}")return Nonedef parse_item_page(html, item_id):"""解析商品详情页HTML:param html: 页面HTML内容:param item_id: 商品ID:return: 解析后的商品信息"""soup = BeautifulSoup(html, 'lxml')# 提取商品基本信息result = {"item_id": item_id,"title": "","price": "","sales": 0,"shop_name": "","shop_url": "","category": "","images": [],"specifications": {}}# 获取标题title_tag = soup.find('h3', class_='tb-main-title')if title_tag:result["title"] = title_tag.get_text(strip=True)# 获取价格price_tag = soup.find('em', class_='tb-rmb-num')if price_tag:result["price"] = price_tag.get_text(strip=True)# 获取销量(淘宝商品详情页销量可能通过JS加载,这里采用另一种方式)sales_script = re.search(r'aucNumId:"(\d+)",.*?viewSales:"(.*?)"', html)if sales_script and sales_script.group(2):sales_text = sales_script.group(2)# 提取数字sales_num = re.search(r'\d+', sales_text)if sales_num:result["sales"] = int(sales_num.group(0))# 获取店铺信息shop_name_tag = soup.find('a', class_='s-logo-shopname')if shop_name_tag:result["shop_name"] = shop_name_tag.get_text(strip=True)result["shop_url"] = shop_name_tag.get('href', '')# 获取商品图片image_tags = soup.find_all('img', class_='J_ItemImg')for img in image_tags:img_url = img.get('src')if img_url and 
img_url.startswith('//'):img_url = 'https:' + img_urlif img_url:result["images"].append(img_url)# 获取商品规格信息(简化版)spec_script = re.search(r'var g_config = (.*?);\n', html)if spec_script:try:spec_data = json.loads(spec_script.group(1))if "item" in spec_data and "category" in spec_data["item"]:result["category"] = spec_data["item"]["category"]except:passreturn resultdef search_taobao(keyword, page=1):"""搜索淘宝商品:param keyword: 搜索关键词:param page: 页码:return: 商品列表"""# 先从缓存获取cache_key = f"taobao:search:{keyword}:{page}"cached_data = get_cache(cache_key)if cached_data:return cached_data# 构建搜索URLurl = f"https://s.taobao.com/search?q={keyword}&s={(page-1)*44}"# 设置请求头headers = {"User-Agent": ua.random,"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3","Connection": "keep-alive","Upgrade-Insecure-Requests": "1","Referer": "https://www.taobao.com/"}# 发送请求try:response = requests.get(url, headers=headers, timeout=TIMEOUT)response.encoding = "utf-8"if response.status_code == 200:# 解析搜索结果items = parse_search_result(response.text)# 存入缓存set_cache(cache_key, items)return itemsexcept Exception as e:print(f"搜索商品失败: {str(e)}")return []def parse_search_result(html):"""解析搜索结果页面"""# 从脚本中提取商品数据pattern = re.compile(r'g_page_config = (.*?);\n')match = pattern.search(html)if not match:return []try:data = json.loads(match.group(1))if "mods" in data and "itemlist" in data["mods"] and "data" in data["mods"]["itemlist"]:items = data["mods"]["itemlist"]["data"]["auctions"]# 提取需要的字段result = []for item in items:result.append({"item_id": item.get("nid", ""),"title": item.get("title", ""),"price": item.get("view_price", ""),"sales": item.get("view_sales", ""),"location": item.get("item_loc", ""),"shop_name": item.get("nick", ""),"shop_url": item.get("shopLink", ""),"pic_url": item.get("pic_url", "").replace("//", "https://") if item.get("pic_url") else ""})return resultexcept Exception as e:print(f"解析搜索结果失败: {str(e)}")return []
6. API 服务实现
from flask import Flask, request, jsonify
from crawler import get_taobao_item, search_taobao
from config import API_HOST, API_PORT

app = Flask(__name__)


@app.route('/api/item/<item_id>', methods=['GET'])
def get_item(item_id):
    """Return detail data for a single item by its id."""
    try:
        data = get_taobao_item(item_id)
        if data:
            return jsonify({"code": 200, "message": "success", "data": data})
        return jsonify({"code": 404, "message": "商品不存在或获取失败", "data": None})
    except Exception as e:
        return jsonify({"code": 500, "message": f"服务器错误: {str(e)}", "data": None})


@app.route('/api/search', methods=['GET'])
def search():
    """Search items; reads ?keyword= and ?page= query parameters."""
    try:
        keyword = request.args.get('keyword', '')
        page = int(request.args.get('page', 1))
        if not keyword:
            return jsonify({"code": 400, "message": "请提供搜索关键词", "data": None})
        data = search_taobao(keyword, page)
        return jsonify({
            "code": 200,
            "message": "success",
            "data": {"items": data, "page": page, "count": len(data)}
        })
    except Exception as e:
        return jsonify({"code": 500, "message": f"服务器错误: {str(e)}", "data": None})


@app.route('/api/health', methods=['GET'])
def health_check():
    """Liveness probe; echoes an optional ?timestamp= back (null when absent/zero)."""
    return jsonify({
        "code": 200,
        "message": "service is running",
        "timestamp": int(request.args.get('timestamp', 0)) or None
    })


if __name__ == '__main__':
    # NOTE(review): debug=True exposes the Werkzeug interactive debugger —
    # keep it off in any non-local deployment.
    app.run(host=API_HOST, port=API_PORT, debug=True)
接口使用示例
1. 获取单个商品详情
请求:
GET /api/item/586722645217
响应示例:
{"code": 200,"message": "success","data": {"item_id": "586722645217","title": "示例商品标题","price": "199.00","sales": 1256,"shop_name": "示例店铺","shop_url": "https://shop12345678.taobao.com","category": "服饰鞋包 > 女装","images": ["https://img.alicdn.com/imgextra/i1/abc.jpg","https://img.alicdn.com/imgextra/i2/def.jpg"],"specifications": {}}
}
反爬措施与优化
- User-Agent 随机化:使用 fake_useragent 库生成不同的浏览器标识,避免被识别为爬虫
- 请求频率控制:可以在实际应用中添加请求间隔控制
- IP 代理池:对于大规模采集,可以使用 IP 代理池避免 IP 被封禁
- 数据缓存:使用 Redis 缓存减轻目标服务器压力,同时提高接口响应速度
- 异常处理:完善的异常处理机制,提高程序稳定性
注意事项
- 本接口仅用于学习和研究目的,使用时请遵守淘宝的 robots 协议和相关规定
- 频繁的请求可能会导致 IP 被封禁,建议合理控制请求频率
- 淘宝页面结构可能会发生变化,需要定期维护爬虫代码
- 商业用途请联系淘宝官方获取合法的数据接口
通过本文的实战教程,我们实现了一个简单但功能完整的淘宝商品数据采集接口。您可以根据实际需求扩展更多功能,如批量采集、数据导出、定时更新等。在实际应用中,还需要考虑更多的反爬策略和性能优化,以确保接口的稳定运行。