
Scrapy

Sina News

Spider file

import scrapy
from sina_news_crawler.items import SinaNewsCrawlerItem


class SinaNewsSpider(scrapy.Spider):
    name = "sina_news"
    allowed_domains = ["news.sina.com.cn"]
    start_urls = ["https://news.sina.com.cn"]

    def parse(self, response):
        # Extract article links from the news index page
        news_links = response.xpath(
            '//a[contains(@href, "/news/") or contains(@href, "/article/")]/@href'
        ).getall()
        for link in news_links:
            # Make sure the URL is absolute
            full_url = response.urljoin(link)
            # Only follow news detail pages
            if ".shtml" in full_url or ".html" in full_url:
                yield scrapy.Request(url=full_url, callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        # Parse the news detail page
        item = SinaNewsCrawlerItem()
        # Extract the title
        item['title'] = response.xpath('//h1[@class="main-title"]/text()').get(default='').strip()
        # Extract the publish time
        item['pub_time'] = response.xpath(
            '//span[@class="date"]/text() | //div[@class="date-source"]/span/text()'
        ).get(default='').strip()
        # Extract the news source
        item['source'] = response.xpath(
            '//span[@class="source"]/text() | //div[@class="date-source"]/a/text()'
        ).get(default='').strip()
        # Extract the article body
        content_paragraphs = response.xpath(
            '//div[@class="article"]/p/text() | //div[@id="artibody"]/p/text()'
        ).getall()
        item['content'] = '\n'.join([p.strip() for p in content_paragraphs if p.strip()])
        # Record the news URL
        item['url'] = response.url
        yield item
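Assuming the default Scrapy project layout (the spider saved under sina_news_crawler/spiders/), the crawl is started from the project root with `scrapy crawl sina_news`. `parse` follows only links that look like article pages, and every matching detail page is handed to `parse_news_detail`, which yields one SinaNewsCrawlerItem per article.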

items.py file

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SinaNewsCrawlerItem(scrapy.Item):
    # News title
    title = scrapy.Field()
    # News body
    content = scrapy.Field()
    # Publish time
    pub_time = scrapy.Field()
    # News URL
    url = scrapy.Field()
    # News source
    source = scrapy.Field()
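The five fields mirror the columns of the `news` table that the MySQL pipeline creates below (title, content, pub_time, url, source), so an item can be written to the database without any renaming.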

middlewares file

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class SinaNewsCrawlerSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class SinaNewsCrawlerDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
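Both classes are the stock templates generated by `scrapy startproject`: they pass requests, responses and items through unchanged, and they stay inactive as long as SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES remain commented out in settings.py (as they are below).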

pipelines file

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
from itemadapter import ItemAdapter


class SinaNewsCrawlerPipeline:
    def process_item(self, item, spider):
        return item


class MySQLPipeline:
    def __init__(self, host, user, password, database, port):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port
        self.db = None
        self.cursor = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            password=crawler.settings.get('MYSQL_PASSWORD', ''),
            database=crawler.settings.get('MYSQL_DATABASE', 'sina_news'),
            port=crawler.settings.getint('MYSQL_PORT', 3306)
        )

    def open_spider(self, spider):
        # Connect to MySQL
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,
            charset='utf8mb4'
        )
        self.cursor = self.db.cursor()
        # Create the news table if it does not exist
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS news (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255) NOT NULL,
                content TEXT,
                pub_time DATETIME,
                url VARCHAR(255) UNIQUE NOT NULL,
                source VARCHAR(100),
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        self.db.commit()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Insert the scraped item
        try:
            # Log the item that was received
            spider.logger.debug(f'Received item: {item}')
            title = item.get('title', '')
            content = item.get('content', '')
            pub_time = item.get('pub_time', '')
            url = item.get('url', '')
            source = item.get('source', '')
            spider.logger.debug(f'About to insert into the database: title={title}, source={source}')
            self.cursor.execute('''
                INSERT INTO news (title, content, pub_time, url, source)
                VALUES (%s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    title=VALUES(title),
                    content=VALUES(content),
                    pub_time=VALUES(pub_time),
                    source=VALUES(source)
            ''', (title, content, pub_time, url, source))
            self.db.commit()
            spider.logger.debug(f'Inserted into the database: {url}')
        except pymysql.MySQLError as e:
            self.db.rollback()
            spider.logger.error(f'Database error: {e}')
        return item
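One detail to watch: the news table declares pub_time as DATETIME, while the spider stores the raw text of the date node. If MySQL runs in strict mode, a string that is not in 'YYYY-MM-DD HH:MM:SS' form will make the INSERT fail. A minimal normalization helper could be added to the pipeline; this is only a sketch, and the Chinese date pattern it matches is an assumption about Sina's markup, not something shown in the post.

import re
from datetime import datetime


def normalize_pub_time(raw):
    # Convert a scraped date string into 'YYYY-MM-DD HH:MM:SS' for the DATETIME column.
    # Assumed input format: '2025年07月26日 10:30'; adjust the pattern to whatever
    # the date XPath actually returns.
    match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日\s*(\d{1,2}):(\d{2})', raw or '')
    if not match:
        return None  # leave the column NULL instead of failing the INSERT
    year, month, day, hour, minute = (int(g) for g in match.groups())
    return datetime(year, month, day, hour, minute).strftime('%Y-%m-%d %H:%M:%S')

In process_item this would replace the raw value, e.g. pub_time = normalize_pub_time(item.get('pub_time', '')), before the execute() call.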

settings file

# Scrapy settings for sina_news_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "sina_news_crawler"

SPIDER_MODULES = ["sina_news_crawler.spiders"]
NEWSPIDER_MODULE = "sina_news_crawler.spiders"

ADDONS = {}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Logging level
LOG_LEVEL = 'DEBUG'

# MySQL database settings
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'  
MYSQL_DATABASE = 'sina_news'
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
# Make sure the database exists in MySQL first:
# CREATE DATABASE sina_news CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 3  # longer delay to avoid getting banned

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

# Enable or disable spider middlewares
#SPIDER_MIDDLEWARES = {
#    "sina_news_crawler.middlewares.SinaNewsCrawlerSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
#DOWNLOADER_MIDDLEWARES = {
#    "sina_news_crawler.middlewares.SinaNewsCrawlerDownloaderMiddleware": 543,
#}

# Enable or disable extensions
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "sina_news_crawler.pipelines.MySQLPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 3600  # cache responses for 1 hour
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
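The throttling-related settings work together: ROBOTSTXT_OBEY, a per-domain concurrency of 1, a 3-second DOWNLOAD_DELAY and AutoThrottle all keep the request rate low, while the filesystem HTTP cache avoids re-downloading pages fetched within the last hour during repeated debug runs.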

Database
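After a crawl, the articles end up in the news table that MySQLPipeline.open_spider() creates. A minimal check that rows were actually written, reusing the connection parameters from settings.py (this snippet is only a sketch and is not part of the original project):

import pymysql

# Same connection parameters as in settings.py
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='sina_news', port=3306, charset='utf8mb4')
try:
    with db.cursor() as cursor:
        # Count the stored articles and show the most recent ones
        cursor.execute('SELECT COUNT(*) FROM news')
        print('rows:', cursor.fetchone()[0])
        cursor.execute('SELECT title, source, url FROM news ORDER BY created_at DESC LIMIT 5')
        for title, source, url in cursor.fetchall():
            print(title, source, url)
finally:
    db.close()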

