Playwright 是由微软开发的一款开源 Web 自动化测试框架,主要用于自动化测试和浏览器操作。
它是一个跨浏览器的自动化工具,支持 Python、JavaScript 等多种语言。
安装
pip install playwright
安装 Playwright 支持的浏览器
playwright install
从 HTML 中提取文字、标题、摘要和关键字(示例使用 BeautifulSoup 解析 HTML,需先执行 pip install beautifulsoup4)
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup  # used to parse the rendered HTML


def extract_page_content(url):
    """Load *url* in headless Chromium and extract basic page metadata.

    The page is rendered with Playwright, then its HTML is parsed with
    BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        A dict with four string entries:
        ``"title"`` - text of the ``<title>`` tag, or "No title found";
        ``"description"`` - ``content`` of ``<meta name="description">``,
        or "No description found";
        ``"keywords"`` - ``content`` of ``<meta name="keywords">``,
        or "No keywords found";
        ``"text"`` - the page text with HTML tags removed, one fragment
        per line.
    """
    with sync_playwright() as p:
        # Set headless=False to watch the browser while debugging.
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            # Wait until network activity settles before reading the HTML.
            page.wait_for_load_state("networkidle")
            html_content = page.content()
        finally:
            # Close the browser even when navigation or loading fails.
            browser.close()

    soup = BeautifulSoup(html_content, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.text if title_tag else "No title found"

    # A <meta> tag may exist without a "content" attribute; .get() falls
    # back to the placeholder instead of raising KeyError.
    meta_description = soup.find("meta", attrs={"name": "description"})
    if meta_description:
        description = meta_description.get("content", "No description found")
    else:
        description = "No description found"

    meta_keywords = soup.find("meta", attrs={"name": "keywords"})
    if meta_keywords:
        keywords = meta_keywords.get("content", "No keywords found")
    else:
        keywords = "No keywords found"

    # Body text with the HTML tags stripped.
    text = soup.get_text(separator="\n", strip=True)

    return {
        "title": title,
        "description": description,
        "keywords": keywords,
        "text": text,
    }


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    url = "https://www.cnblogs.com/baby123/p/18772196"
    result = extract_page_content(url)
    print("Title:", result["title"])
    print("Description:", result["description"])
    print("Keywords:", result["keywords"])
    print("Content:", result["text"])