# Install the dependency from a shell first (this is a shell command, not
# Python code, so it must not appear as a bare statement in this file):
#     pip install lxml
from lxml import etree

# Two ways to load HTML source into an etree object. They are shown as
# comments because 'file_path' and page_text are placeholders, not real values.
#
# 1. Load the source of a local HTML document into an etree object:
#        etree.parse('file_path')
#
# 2. Load source fetched from the internet into an etree object, where
#    page_text is the HTML data of the response:
#        etree.HTML(page_text)
from lxml import etree

# XPath demo on a small in-memory HTML document.
# The outputs quoted in the comments below are the ones recorded in the
# original notes; Element addresses differ from run to run.

# HTML string used as the sample document.
html_content = """
<html><body><div id="content"><p class="paragraph">Hello, World!</p><p class="paragraph">This is a test.<div>xiaohei</div></p><p class="paragraph1" title="xiaohei">This is another test.</p></div></body>
</html>
"""

# Parse the HTML into an element tree.
tree = etree.HTML(html_content)

# Select every <p> element. Recorded output: a list of three Element objects,
# e.g. [<Element p at 0x...>, <Element p at 0x...>, <Element p at 0x...>].
print(tree.xpath('//p'))

# Select the <p> elements whose class attribute is "paragraph".
# Recorded output: a list of two Element objects.
print(tree.xpath('//p[@class="paragraph"]'))

# Select the class attribute of every <p>.
# Recorded output: ['paragraph', 'paragraph', 'paragraph1'].
print(tree.xpath('//p/@class'))

# Select the class attribute of the <p> whose title is "xiaohei".
# Recorded output: ['paragraph1'].
print(tree.xpath('//p[@title="xiaohei"]/@class'))

# Select the text content of the <p> elements.
# Recorded output: ['Hello, World!', 'This is a test.', 'This is another test.'].
# Using //text() instead returns all of the text under the tag.
print(tree.xpath('//p/text()'))

# This statement was fused into the preceding comment in the original and
# therefore never ran; it is restored as a real statement.
print('end')