爬虫基础学习 - Beautifulsoup
beautiful soup:
bs4
用于解析html and xml文档
解析器:html.parser、lxml解析器和XML的内置解析器
文档遍历:跟xpath差不多,也是整理成树形结构
搜索:find() find_all()
修改:增删改查bs4都支持
提取数据
处理特殊字符
解析器:
html.parser(内置) 还行
lxml 速度比较快
xml 速度比较快
html5lib 用的比较少,速度慢
安装:pip install bs4
节点选择器:
直接调用节点名即可选择该节点。
如果有多个标签,使用标签名来打印该节点的话,就是打印第一个
类型:
<class 'bs4.element.Tag'>
属性:
name:表示标签的名字
方法:
嵌套选择:
返回<class 'bs4.element.Tag'>,可以继续下一步的选择
关联选择:
有时候我们选择的时候不能一步到位,需要先选择某节点,然后选择它的父节点、子节点
子节点:
soup.p.children
子孙节点:
soup.p.descendants
父节点:
soup.a.parent
祖先节点:
soup.a.parents
上一个兄弟节点:
soup.a.previous_sibling
下一个兄弟节点:
soup.a.next_sibling
后面所有的兄弟节点:
soup.a.next_siblings
前面所有的兄弟节点:
soup.a.previous_siblings
方法选择器:
find_all,
find方法
CSS选择器:
调用select方法,传入对应的css
#! /usr/bin/env python3
# Beautiful Soup walkthrough: sample documents and a first parse.
import re

from bs4 import BeautifulSoup

# Sample document used by the node-selection examples. The markup is
# deliberately left unclosed (<body>/<html>); the parser repairs it.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Minimal document used by the nested-selection example.
html_c = """
<html><head><title>The Dormouse's story</title></head>
<body>
"""

# Smallest possible example: parse a fragment and read the text of its <p>.
soup = BeautifulSoup("<p>hello</p>", 'lxml')
print(soup.p.string)

# Parse the full sample document for the examples that follow.
soup = BeautifulSoup(html, 'lxml')
# Node selectors: access a tag by using its name as an attribute of the soup.
# Pretty-print the parsed (and repaired) document.
print(soup.prettify())

soup = BeautifulSoup(html, 'lxml')

# Select the <title> tag of the HTML document.
print(soup.title)

# Print the text content of the tag.
print(soup.title.string)
print(soup.head)
print(soup.head.string)
print(soup.p)

# When several nodes match, the first one in the document is selected.
print(soup.p.string)

# <class 'bs4.element.Tag'>
print(type(soup.title))

# Use the name attribute to get the node's tag name.
print(soup.title.name)

# Read attributes (id, class, ...) through .attrs or by indexing the tag.
print(soup.p.attrs)
print(soup.p.attrs['name'])
print(soup.p['name'])
# Indexing a tag by attribute name returns that attribute's value.
print(soup.p['class'])

soup = BeautifulSoup(html_c, 'lxml')

# Nested selection: a selected node is itself a Tag, so selection can continue.
# Print the <title> tag found under <head>.
print(soup.head.title)

# <class 'bs4.element.Tag'>
print(type(soup.head.title))
print(soup.head.title.string)

# Sample document with no whitespace between nodes.
html_2 = """
<html><head><title>The Dormouse's story</title></head><body><p class="story">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
"""

# Relational selection: children, descendants, parents and siblings.
soup = BeautifulSoup(html_2, 'lxml')

# Direct children: .contents is a list, .children is an iterator.
print(soup.p.contents)
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i)
    print(child)

# All descendants (children, grandchildren, ...), as a generator.
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)

soup1 = BeautifulSoup(html_2, 'lxml')

# Direct parent, then every ancestor (a generator, materialised for printing).
print(soup1.a.parent)
print(soup1.a.parents)
print(list(enumerate(soup1.a.parents)))

# Single adjacent siblings.
print(soup1.a.next_sibling)
print(soup1.a.previous_sibling)

# All following / preceding siblings. The plural attributes are generators
# over the siblings; enumerating the singular ones would instead iterate the
# children (or characters) of one node.
print(list(enumerate(soup1.a.next_siblings)))
print(list(enumerate(soup1.a.previous_siblings)))

# Ancestor nodes: the first ancestor is the direct parent.
print(soup1.a.parents)
ancestors = list(soup1.a.parents)
print(ancestors[0])
print(ancestors[0].attrs['class'])

# Sample document for the find_all() examples.
html_3 = """
<div class="panel"><div class="panel-heading"><h4>Hello</h4></div><div class="panel-body"><ul class="list" id="list-1"><li class="element">Foo</li><li class="element">Bar</li><li class="element">Jay</li></ul><ul class="list list-small" id="list-2"><li class="element">Foo</li><li class="element">Bar</li></ul></div>
</div>
"""

soup = BeautifulSoup(html_3, 'lxml')
# Method selectors: find_all() returns every matching node.
print(soup.find_all(name='ul'))
print(soup.find_all(name='ul')[1])

for ul in soup.find_all(name='ul'):
    # Print the <li> elements under each <ul>; find_all() can be chained
    # because each result is itself a Tag.
    print(type(ul.find_all(name='li')))
    for li in ul.find_all(name='li'):
        print(li.string)

# Use find_all() to query every node whose id attribute is 'list-2'.
print(soup.find_all(attrs={'id': 'list-2'}))
print(soup.find_all(attrs={'class': 'element'}))
# class is a Python keyword, so the keyword-argument form is class_.
print(soup.find_all(class_='element'))

# Sample document for the text-matching example.
html_4 = """
<div class="panel"><div class="panel-body"><a>Hello, this is a link</a><a>Hello, this is a link, too</a></div>
"""

# Sample document for the find() examples.
html_5 = """
<div class="panel"><div class="panel-heading"><h4>Hello</h4></div><div class="panel-body"><ul class="list" id="list-1"><li class="element">Foo</li><li class="element">Bar</li><li class="element">Jay</li></ul><ul class="list list-small" id="list-2"><li class="element">Foo</li><li class="element">Bar</li></ul></div>
</div>
"""

soup = BeautifulSoup(html_4, 'lxml')
# Match text nodes against a regular expression. `string` is the current
# name of this argument; `text` is the deprecated older spelling.
found_elements = soup.find_all(string=re.compile('link'))
print(found_elements)

soup = BeautifulSoup(html_5, 'lxml')

# find() returns only the first matching node (find_all() returns a list).
ul_tag = soup.find(name='ul')
print(ul_tag)

list_element = soup.find(class_='list')
print(list_element)

# Sample document for the CSS-selector examples.
html_6 = """
<div class="panel"><div class="panel-heading"><h4>Hello</h4></div><div class="panel-body"><ul class="list" id="list-1"><li class="element">Foo</li><li class="element">Bar</li><li class="element">Jay</li></ul><ul class="list list-small" id="list-2"><li class="element">Foo</li><li class="element">Bar</li></ul></div>
</div>
"""

soup = BeautifulSoup(html_6, 'lxml')
# CSS selectors: select() takes a CSS selector and returns a list of Tags.
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))

# Results are Tags, so select() can be nested and attributes read as usual.
for ul in soup.select('ul'):
    print(ul.select('li'))
    print(ul['id'])
    print(ul.attrs['id'])

# Two ways to read the text of a node.
for li in soup.select('li'):
    print(li.string)
    print(li.get_text())