1.导出html格式书签
2.对数据做处理
提取 <a></a> 标签。
可以用vscode的正则替换功能去掉 <a></a> 标签的ICON属性,匹配表达式:ICON=".*"
3.安装python
将pip的下载源替换成清华镜像
4.描述问题生成代码
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import re


# Ordered (URL pattern, label) pairs used to tag each bookmark by site.
# The first pattern found in the URL wins, so the order is significant.
_LINK_TYPES = (
    ("csdn", "CSDN"),
    ("jianshu", "简书"),
    ("cnblogs", "博客园"),
    ("zhihu", "知乎"),
    ("gitee", "gitee"),
    ("ruanyifeng", "阮一峰"),
    ("v2ex", "v2ex"),
    ("juejin", "掘金"),
    ("oschina", "开源中国"),
    ("douban", "豆瓣"),
    ("doc88", "道客巴巴"),
    ("pmcaff", "pmcaff"),
    ("github", "github"),
    ("bilibili", "bilibili"),
    ("weixin", "微信公众号"),
)
# Label used when no pattern matches (or the link has no URL at all).
_DEFAULT_LINK_TYPE = "其他"
# Column order of the generated sheet; also keeps the header row when the
# bookmark file contains no links.
_COLUMNS = ["标题", "链接", "添加日期", "链接类型", "月份"]


def _classify_link(href):
    """Return the site label for *href*, or the default label if none matches."""
    if not href:
        # An <a> tag without href cannot be matched; re.search would raise
        # TypeError on None.
        return _DEFAULT_LINK_TYPE
    for pattern, label in _LINK_TYPES:
        if re.search(pattern, href):
            return label
    return _DEFAULT_LINK_TYPE


def _format_add_date(add_date):
    """Convert an epoch-seconds string to ``(timestamp_text, month_text)``.

    Returns ``(None, None)`` when *add_date* is missing, not an integer, or
    outside the range the platform can convert. The conversion uses the
    machine's local time zone.
    """
    try:
        dt_object = datetime.fromtimestamp(int(add_date))
    except (TypeError, ValueError, OverflowError, OSError):
        return None, None
    return dt_object.strftime('%Y-%m-%d %H:%M:%S'), dt_object.strftime('%Y.%m')


def parse_html_for_links(html_file: str, excel_file: str) -> None:
    """Export every ``<a>`` tag of an HTML bookmark file to an Excel sheet.

    Each link becomes one row with its title, URL, formatted add date, a
    site label derived from the URL, and a ``YYYY.MM`` month used for
    statistics. Links without a usable ``href`` or ``add_date`` are kept,
    with the affected cells left empty / labelled with the default type.

    Args:
        html_file: Path of the UTF-8 encoded HTML file to read.
        excel_file: Path of the ``.xlsx`` file to write (overwritten).
    """
    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # 'html.parser' can be used instead if lxml is not installed.
    soup = BeautifulSoup(html_content, 'lxml')

    data = []
    for link in soup.find_all('a'):
        href = link.get('href')
        formatted_date, month_statistic = _format_add_date(link.get('add_date'))
        data.append({
            '标题': link.get_text(strip=True),
            '链接': href,
            '添加日期': formatted_date,
            "链接类型": _classify_link(href),
            "月份": month_statistic,
        })

    df = pd.DataFrame(data, columns=_COLUMNS)
    df.to_excel(excel_file, index=False, engine='openpyxl')


# Replace with the paths of your own HTML and Excel files
# Source bookmark export and destination workbook used by this run.
html_file_path = 'd:\\favorites_2024_7_20.html'
excel_file_path = 'd:\\favorites_202407201254.xlsx'

# Run the export, then report where the workbook was written.
parse_html_for_links(html_file_path, excel_file_path)
print(f"数据已成功写入 {excel_file_path}")