Python【爬虫】【批量下载】年报抓取

Python 年报爬取更新

经本人测试，ju chao 网的年报爬取方式与我上一篇博客相比没有变化，逻辑也没变。应许多朋友的要求，这里补充完整代码：

import json
import os

import requests

# Base URL of the target site. It is concatenated directly with paths such as
# '/new/hisAnnouncement/query', so it should not end with a slash. The value
# below is a placeholder that must be replaced before running the script.
web_url = '改成网站的域名，因为csdn屏蔽'
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Maps characters that are not allowed in file names on common file systems
# to underscores, so an announcement title can be used as a file name.
_UNSAFE_FILE_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def load_json():
    """Return the parsed contents of ``./stock.json`` (read as UTF-8)."""
    with open("./stock.json", encoding="utf-8", mode="r") as f:
        return json.load(f)


def query_report(_stock, time_span):
    """Query the announcement search endpoint for one company.

    Args:
        _stock: Mapping with at least the keys ``"code"`` and ``"orgId"``.
        time_span: Date range string sent as ``seDate``; the script passes
            values of the form ``'YYYY-MM-DD~YYYY-MM-DD'``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }
    data = {
        'pageNum': '1',
        'pageSize': '30',
        'column': 'szse',
        'tabName': 'fulltext',
        'plate': '',
        'stock': f'{_stock["code"]},{_stock["orgId"]}',
        'searchkey': '',
        'secid': '',
        'category': 'category_ndbg_szsh',
        'trade': '',
        'seDate': time_span,
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    response = requests.post(
        web_url + '/new/hisAnnouncement/query',
        headers=headers,
        data=data,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()


def download_pdf(d_url, _bank, _file_name):
    """Download one PDF and save it as ``./<_bank>/<_file_name>.pdf``.

    Args:
        d_url: URL path of the file, appended to ``web_url``.
        _bank: Name of the directory (relative to the working directory)
            that receives the file; created if missing.
        _file_name: File name without extension. Characters that are invalid
            in file names are replaced with ``_``.

    Returns:
        The path of the file that was written.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept from the original script; remove it if the site's certificate is valid.
    resp = requests.get(web_url + d_url, verify=False, timeout=REQUEST_TIMEOUT)
    # Do not save an HTTP error page under a .pdf name.
    resp.raise_for_status()

    os.makedirs(_bank, exist_ok=True)
    safe_name = str(_file_name).translate(_UNSAFE_FILE_CHARS)
    path = os.path.join(_bank, f"{safe_name}.pdf")
    # A PDF is binary data: write the raw bytes. Decoding to text and
    # re-encoding as UTF-8 would corrupt the file.
    with open(path, mode="wb") as f:
        f.write(resp.content)
    return path


def name_to_stock(_bank, _stock_dic=None):
    """Return the stock entry whose ``"zwjc"`` field equals ``_bank``.

    Args:
        _bank: Company short name to look up.
        _stock_dic: Mapping with a ``"stockList"`` list of entries. When
            omitted, the module-level ``stock_dic`` set by the script entry
            point is used.

    Returns:
        The first matching entry, or ``None`` when there is no match.
    """
    if _stock_dic is None:
        _stock_dic = stock_dic
    return next(
        (item for item in _stock_dic["stockList"] if item["zwjc"] == _bank),
        None,
    )


if __name__ == '__main__':
    stock_dic = load_json()
    bank_list = ["平安银行", "深南电A", "天健集团"]
    for bank in bank_list:
        stock = name_to_stock(bank, stock_dic)
        if not stock:
            print("[x]未找到该企业")
            # Same exit status as the original exit(0), without relying on
            # the interactive helper injected by the site module.
            raise SystemExit(0)
        query_result = query_report(_stock=stock, time_span='2022-12-26~2023-06-27')
        print(f"[*]搜索【{bank}】找到:")
        # Guard against a null or missing announcement list.
        for res in query_result.get('announcements') or []:
            print(f"\t[*]{res['adjunctUrl']} {res['announcementTitle']}", end="")
            download_pdf(res['adjunctUrl'], bank, res['announcementTitle'])
            print(f"\r\t[*]{res['adjunctUrl']} {res['announcementTitle']}  ...........   ok")
    print("下载完毕~")