每次copy & paste总是很麻烦,现在有点问题,先记录下来。
需求:获取 URL 对应页面里的 Feature list,并以表格形式输出
可以用 “Convert curl commands to code” 工具,得到 GET 请求的 headers、cookies 等
import requests
import re
from json2html import json2html
from bs4 import BeautifulSoup

# Fetch the Zoom support article payload and cut the "kbContentData" section
# out of the raw response text.
#
# NOTE(review): the cookie and token values below were captured from a browser
# session via a curl-to-code converter. They are session credentials
# (JSESSIONID, X-UserToken) and presumably expire; avoid publishing them and
# refresh them before re-running.
cookies = {
    '_ga': 'GA1.2.1362872320.1699326902',
    '_fbp': 'fb.1.1703745569173.788449175',
    '_zm_visitor_guid': 'ab14067a105b55591ca36931e79a6fc0',
    '_zm_mtk_guid': 'b214987e283ec1df03f09df41170675b',
    '_ds_id': '8c2d2994-3b41-4b59-be95-2b8717ffe0e6',
    '__utmzz': 'source=(direct)|medium=(none)|campaign=(not set)',
    'AMP_MKTG_0753e77572': 'JTdCJTdE',
    '_gcl_au': '1.1.55355038.1703817513',
    'OnetrustActiveGroups': 'C0004C0003C0002C0001',
    'AMP_0753e77572': 'JTdCJTIyZGV2aWNlSWQlMjIlM0ElMjJkYWQyMGM3NS0xYzdkLTRmODYtYjI4Yi03MTNmZTNlY2E5ZjglMjIlMkMlMjJzZXNzaW9uSWQlMjIlM0ExNzAzODE3NTEyNDY3JTJDJTIyb3B0T3V0JTIyJTNBZmFsc2UlMkMlMjJsYXN0RXZlbnRUaW1lJTIyJTNBMTcwMzgxNzUxMzMxMiUyQyUyMmxhc3RFdmVudElkJTIyJTNBMyU3RA==',
    '_yjsu_yjad': '1703817513.e9d3aadf-244b-4756-90c8-d8152831b27e',
    '_uetvid': '5c32b050a5f311ee8e0337e664efcd94',
    'iv': '51a85645-5246-4995-9a5b-627ccafbae0b',
    '_cs_c': '0',
    '_cs_id': '0b459793-a9d5-a89c-c1e2-70499565b08c.1703817514.2.1703833540.1703833540.1.1737981514343',
    '_gid': 'GA1.2.1035150089.1704176623',
    'BIGipServerpool_zoomus': '2f6ba358017c66e5283571a5c5fc3b1a',
    'JSESSIONID': '2C44F6D93F6593E571F97C2BAE1AF4DB',
    'glide_user_route': 'glide.de6ecf26cf6f93e1b52b94d2be12e7df',
    'glide_language': 'zh',
    'OptanonAlertBoxClosed': '2024-01-03T04:07:00.123Z',
    'OptanonConsent': 'isGpcEnabled=0&datestamp=Wed+Jan+03+2024+12%3A07%3A00+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202310.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=e9dfd41b-73f2-470f-ab16-4e504558809b&interactionCount=32&landingPath=NotLandingPage&groups=C0004%3A0%2CC0003%3A0%2CC0002%3A0%2CC0001%3A1&geolocation=JP%3B13&AwaitingReconsent=false',
}

# The browser's raw "Cookie" header is intentionally not repeated here: the
# same values are sent through the `cookies=` argument of the request below.
headers = {
    'Accept': 'application/json',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://support.zoom.com/hc/zh/article?id=zm_kb&sysparm_article=KB0069432',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Transaction-Source': 'Interface=Web,Interface-Name=HC,Interface-Type=Service Portal,Interface-SysID=89275a53cb13020000f8d856634c9c51',
    'X-Use-Polaris': 'false',
    'X-UserToken': '51f7263487ef711481aec8cd0ebb355c186ebdcd75d1cab6f29335aa03a871b3bffff3f0',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'x-portal': '89275a53cb13020000f8d856634c9c51',
}

params = {
    'id': 'zm_kb',
    'sysparm_article': 'KB0069432',
    'time': '1704254869320',
    'portal_id': '89275a53cb13020000f8d856634c9c51',
    'request_uri': '/hc/zh/article?id=zm_kb&sysparm_article=KB0069432',
}

response = requests.get(
    'https://support.zoom.com/api/now/sp/page',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail loudly on 4xx/5xx (e.g. an expired session) instead of running the
# regex against an error page.
response.raise_for_status()
data = response.text

# An earlier attempt, r'"kbContentData": \{' (with a space after the colon),
# produced no match against this text, so the pattern below has no space.
pattern = r'"kbContentData":(.*/?)<li>Security enhancements'
matches = re.findall(pattern, data)
if not matches:
    raise ValueError('"kbContentData" section not found in the response body')

# NOTE: `ret` is a slice of the raw JSON text, so the HTML inside it is still
# JSON-escaped (e.g. "<\/strong>"). Decoding the body with response.json()
# would yield unescaped strings that BeautifulSoup can parse directly.
ret = matches[0]
看得出,得到的 response 为 JSON 格式,而我要获取的网页内容在 kbContentData 字段下
但是数据解析难住我了(已知 数据为转义后的html内容)
尝试方法1:用 re。按下面的写法,在模式里加上空格和 \{ 之后就匹配不到任何内容了,把空格换成 \s 也不行;明明在网页上复制这段文字可以直接搜到,结果却是空的。(可能的原因:浏览器显示的是格式化后的 JSON,而 response.text 里的原始 JSON 在冒号后面可能没有空格,可以改用 \s* 来兼容两种情况。)
pattern = r'"kbContentData": \{'
尝试方法2:可以得到数据,但得到的是加了反斜杠转义的字符串(例如 <\/strong)。内容本身明明是 HTML,但我不知道怎么把这个转义后的 str 直接还原成 HTML,好用 lxml 或者 bs4 进行解析。(提示:\/ 是 JSON 字符串里的转义写法,先用 response.json() 或 json.loads 解码,取出的字符串就是未转义的 HTML。)
搞不懂,明明刚学了re,没解决。。。(╬◣д◢)