python爬取bilibili,下载视频

一. 内容简介

python爬取bilibili,下载视频

二. 软件环境

2.1vsCode

2.2Anaconda

version: conda 22.9.0

2.3代码

链接:https://pan.baidu.com/s/1WuXTso_iltLlnrLffi1kYQ?pwd=1234

三.主要流程

3.1 下载单个视频

代码

import requests
import os
from lxml import etree
import re


def videoDownload1(url_):
    """Download one bilibili video and mux it into ./video/<title>/<title>.mp4.

    The video page embeds a ``window.__playinfo__`` script that lists a
    video-only and an audio-only stream. Both are streamed to a temporary
    directory and merged (stream copy, no re-encode) with the bundled ffmpeg.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the page is requested anonymously.

    Args:
        url_: URL of the video page.

    Raises:
        IndexError: if the page has no ``<title>`` or no playinfo stream URLs.
        requests.HTTPError: if a media stream request is rejected.
        subprocess.CalledProcessError: if ffmpeg fails to merge the streams.
    """
    import subprocess
    import tempfile

    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie_ = os.environ.get('BILIBILI_COOKIE')
    if cookie_:
        headers_['Cookie'] = cookie_

    # Fetch and parse the video page.
    response_ = requests.get(url_, headers=headers_, timeout=30)
    html_obj = etree.HTML(response_.text)

    # The page title looks like "<video title>_哔哩哔哩..."; keep the first part.
    res_ = html_obj.xpath('//title/text()')[0]
    matched = re.findall(r'(.*?)_哔哩哔哩', res_)
    title_ = matched[0] if matched else res_
    # Drop every character that is illegal in a Windows file name, plus
    # whitespace and '&', so the title is always usable as a path component.
    title_ = re.sub(r'[\\/:*?"<>|&\s]', '', title_) or 'video'

    # The playinfo script holds the URLs of the separate video/audio streams.
    url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
    video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
    audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

    # The media requests are sent with the video page as Referer.
    media_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Referer': url_,
    }

    folder_path = f"./video/{title_}"
    os.makedirs(folder_path, exist_ok=True)  # also creates ./video itself
    output_path = f"{folder_path}/{title_}.mp4"

    ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
    # Intermediate files live in a throw-away directory so they are removed
    # even when a download or the merge fails.
    with tempfile.TemporaryDirectory(dir='.') as tmp_dir:
        video_path = os.path.join(tmp_dir, 'video.mp4')
        audio_path = os.path.join(tmp_dir, 'audio.mp3')
        for stream_url, stream_path in ((video_url, video_path), (audio_url, audio_path)):
            # Stream to disk in 1 MiB chunks instead of buffering the whole file.
            with requests.get(stream_url, headers=media_headers, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                with open(stream_path, 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

        # Argument list (no shell): special characters in the title cannot
        # break the command. -y overwrites an earlier download of the same
        # video instead of waiting on ffmpeg's interactive prompt.
        subprocess.run(
            [ffmpeg_path, '-y', '-loglevel', 'error',
             '-i', audio_path, '-i', video_path,
             '-c', 'copy', output_path],
            check=True,
        )

    print(f'{title_}  下载完成')

3.2 下载选集视频

选集视频的播放链接很好找:各分集的链接只有末尾的 p 参数不同(p=1、p=2……),把去掉查询参数的视频链接和 ?p=序号 拼起来,就能得到每一集的播放链接。
代码

import requests
import os
from lxml import etree
import re# 获取网页源码
def getUrls2(url):
    """Build the per-episode URLs of a multi-part bilibili video.

    The page is fetched once to count the ``<li>`` entries of the
    ``ul.list-box`` episode list; episode *n* is then addressed as
    ``<url without query string>?p=<n>``.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the page is requested anonymously.

    Args:
        url: URL of any episode of the video (a query string is ignored).

    Returns:
        list[str]: one URL per episode, in order; empty when the page has no
        ``ul.list-box`` list.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie = os.environ.get('BILIBILI_COOKIE')
    if cookie:
        headers['Cookie'] = cookie

    response_ = requests.get(url, headers=headers, timeout=30)
    html_obj = etree.HTML(response_.text)

    # One <li> per episode in the episode list.
    lis = html_obj.xpath("//ul[@class='list-box']/li")

    # Keep only the part before '?', then append the episode selector.
    cleaned_url = url.split('?', 1)[0]
    return [f"{cleaned_url}?p={i}" for i in range(1, len(lis) + 1)]
import requests
import os
from lxml import etree
import re


def videoDownload3(url_, i, name):
    """Download one video of a collection into ./video/<name>/<i>.<title>.mp4.

    The video page embeds a ``window.__playinfo__`` script that lists a
    video-only and an audio-only stream. Both are streamed to a temporary
    directory and merged (stream copy, no re-encode) with the bundled ffmpeg.
    If the target file already exists, nothing is downloaded.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the page is requested anonymously.

    Args:
        url_: URL of the video page.
        i: Position of the video in the collection; prefixes the file name.
        name: Collection name; used (sanitised) as the output folder name.

    Raises:
        IndexError: if the page has no ``<title>`` or no playinfo stream URLs.
        requests.HTTPError: if a media stream request is rejected.
        subprocess.CalledProcessError: if ffmpeg fails to merge the streams.
    """
    import subprocess
    import tempfile

    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie_ = os.environ.get('BILIBILI_COOKIE')
    if cookie_:
        headers_['Cookie'] = cookie_

    # Fetch and parse the video page.
    response_ = requests.get(url_, headers=headers_, timeout=30)
    html_obj = etree.HTML(response_.text)

    # The page title looks like "<video title>_哔哩哔哩..."; keep the first part.
    res_ = html_obj.xpath('//title/text()')[0]
    matched = re.findall(r'(.*?)_哔哩哔哩', res_)
    title_ = matched[0] if matched else res_
    # Drop characters that are illegal in a Windows file name, plus
    # whitespace, '&' and dashes.
    title_ = re.sub(r'[\\/:*?"<>|&\s\-—]', '', title_) or 'video'
    # The collection name becomes a directory, so it must be a legal path
    # component as well.
    fileName = re.sub(r'[\\/:*?"<>|]', '', name).strip() or 'video'

    folder_path = f"./video/{fileName}"
    file_path = f"{folder_path}/{i}.{title_}.mp4"

    # Check for an earlier download BEFORE fetching the streams, and against
    # the same path ffmpeg writes to.
    if not os.path.exists(file_path):
        # The playinfo script holds the URLs of the separate streams.
        url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
        video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
        audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

        # The media requests are sent with the video page as Referer.
        media_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
            'Referer': url_,
        }

        os.makedirs(folder_path, exist_ok=True)  # also creates ./video itself
        ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
        # A private temporary directory per call keeps concurrent downloads
        # from clobbering each other and is removed even on failure.
        with tempfile.TemporaryDirectory(dir='.') as tmp_dir:
            video_path = os.path.join(tmp_dir, 'video.mp4')
            audio_path = os.path.join(tmp_dir, 'audio.mp3')
            for stream_url, stream_path in ((video_url, video_path), (audio_url, audio_path)):
                # Stream to disk in 1 MiB chunks instead of buffering the file.
                with requests.get(stream_url, headers=media_headers, stream=True, timeout=30) as resp:
                    resp.raise_for_status()
                    with open(stream_path, 'wb') as f:
                        for chunk in resp.iter_content(chunk_size=1024 * 1024):
                            f.write(chunk)

            # Argument list (no shell): special characters in the title or
            # collection name cannot break the command.
            subprocess.run(
                [ffmpeg_path, '-loglevel', 'error',
                 '-i', audio_path, '-i', video_path,
                 '-c', 'copy', file_path],
                check=True,
            )

    print(f'{i}.{title_}  下载完成')

3.3 下载合集视频

合集的里面数据的访问api
在这里插入图片描述
合集里的视频列表由上面的接口以 JSON 形式返回,从中取出每个视频的 bvid,再拼接成视频播放链接。
在这里插入图片描述

代码

# 获取网页源码
def getUrls3(url):
    """List the video URLs and the name of a bilibili collection.

    The first two digit runs in *url* are taken as the uploader id (``mid``)
    and the collection id (``season_id``). The ``seasons_archives_list`` API
    is then paged through, 30 entries per request, and every ``bvid`` is
    turned into a video page URL.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the API is requested anonymously.

    Args:
        url: Collection page URL, e.g.
            https://space.bilibili.com/<mid>/channel/collectiondetail?sid=<season_id>

    Returns:
        tuple[list[str], str]: the video page URLs in API order, and the
        collection name.

    Raises:
        ValueError: if *url* does not contain two numeric ids.
        requests.HTTPError: if the API answers with an error status.
    """
    numbers = re.findall(r'\d+', url)
    if len(numbers) < 2:
        raise ValueError(f"cannot find uploader id and collection id in URL: {url!r}")
    mid, season_id = numbers[0], numbers[1]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie = os.environ.get('BILIBILI_COOKIE')
    if cookie:
        headers['Cookie'] = cookie

    def fetch_page(page_num):
        """Return the ``data`` object of one API result page."""
        api_url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={page_num}&page_size=30"
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()["data"]

    # The first page also carries the totals and the collection name.
    first_page = fetch_page(1)
    total = int(first_page["page"]["total"])
    page_size = int(first_page["page"]["page_size"])
    name = first_page["meta"]["name"]

    # Ceiling division: no extra empty page when total is an exact multiple.
    page_count = -(-total // page_size)

    urls = []
    for page_num in range(1, page_count + 1):
        # Page 1 is already in hand; do not request it a second time.
        data = first_page if page_num == 1 else fetch_page(page_num)
        for archive in data["archives"] or []:
            bvid = archive["bvid"]
            urls.append(f"https://www.bilibili.com/video/{bvid}/")
    return urls, name
import requests
import os
from lxml import etree
import re


def videoDownload2(url_, i):
    """Download one episode of a multi-part video into ./video/<series>/<i>.<title>.mp4.

    The series folder is named after the page's ``h1.video-title`` text; the
    file is named after the page ``<title>``. The video-only and audio-only
    streams listed in the embedded ``window.__playinfo__`` script are streamed
    to a temporary directory and merged (stream copy) with the bundled ffmpeg.
    If the target file already exists, nothing is downloaded.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the page is requested anonymously.

    Args:
        url_: URL of the episode page (``...?p=<n>``).
        i: Episode number; prefixes the file name.

    Raises:
        IndexError: if the page has no ``<title>`` or no playinfo stream URLs.
        requests.HTTPError: if a media stream request is rejected.
        subprocess.CalledProcessError: if ffmpeg fails to merge the streams.
    """
    import subprocess
    import tempfile

    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie_ = os.environ.get('BILIBILI_COOKIE')
    if cookie_:
        headers_['Cookie'] = cookie_

    # Fetch and parse the episode page.
    response_ = requests.get(url_, headers=headers_, timeout=30)
    html_obj = etree.HTML(response_.text)

    # The page title looks like "<video title>_哔哩哔哩..."; keep the first part.
    res_ = html_obj.xpath('//title/text()')[0]
    matched = re.findall(r'(.*?)_哔哩哔哩', res_)
    title_ = matched[0] if matched else res_
    # Drop characters that are illegal in a Windows file name, plus
    # whitespace and '&'.
    title_ = re.sub(r'[\\/:*?"<>|&\s]', '', title_) or 'video'

    # Series name for the folder; fall back to the title if the heading is
    # missing, and make it a legal path component.
    heading = html_obj.xpath('//h1[@class="video-title"]/text()')
    fileName = heading[0] if heading else title_
    fileName = re.sub(r'[\\/:*?"<>|]', '', fileName).strip() or 'video'

    folder_path = f"./video/{fileName}"
    file_path = f"{folder_path}/{i}.{title_}.mp4"

    # Check for an earlier download BEFORE fetching the streams.
    if not os.path.exists(file_path):
        # The playinfo script holds the URLs of the separate streams.
        url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
        video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
        audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

        # The media requests are sent with the episode page as Referer.
        media_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
            'Referer': url_,
        }

        os.makedirs(folder_path, exist_ok=True)  # also creates ./video itself
        ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
        # Intermediate files live in a throw-away directory so they are
        # removed even when a download or the merge fails.
        with tempfile.TemporaryDirectory(dir='.') as tmp_dir:
            video_path = os.path.join(tmp_dir, 'video.mp4')
            audio_path = os.path.join(tmp_dir, 'audio.mp3')
            for stream_url, stream_path in ((video_url, video_path), (audio_url, audio_path)):
                # Stream to disk in 1 MiB chunks instead of buffering the file.
                with requests.get(stream_url, headers=media_headers, stream=True, timeout=30) as resp:
                    resp.raise_for_status()
                    with open(stream_path, 'wb') as f:
                        for chunk in resp.iter_content(chunk_size=1024 * 1024):
                            f.write(chunk)

            # Argument list (no shell): special characters in the names
            # cannot break the command.
            subprocess.run(
                [ffmpeg_path, '-loglevel', 'error',
                 '-i', audio_path, '-i', video_path,
                 '-c', 'copy', file_path],
                check=True,
            )

    print(f'{i}.{title_}  下载完成')

3.4 多线程

代码

import concurrent.futures
import requests# 定义一个下载函数
def download_video(URL):
    """Run one download task described by a packed "<url> <index> <name>" string.

    Only the first two spaces separate fields, so the collection name may
    itself contain spaces.

    Raises:
        ValueError: if *URL* does not contain all three fields.
    """
    url, index, name = URL.split(" ", 2)
    videoDownload3(url, index, name)


def THREAD(URLS):
    """Run ``download_video`` for every entry of *URLS* on a pool of 10 threads.

    A failing task does not stop the others: its exception is printed together
    with the task string it belongs to, and the remaining tasks continue.

    Args:
        URLS: Iterable of packed "<url> <index> <name>" task strings.
    """
    max_workers = 10  # number of concurrent downloads; tune as needed
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which task each future belongs to, for error reporting.
        futures = {executor.submit(download_video, URL): URL for URL in URLS}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # re-raises whatever the task raised
            except Exception as e:
                print(f"An error occurred: {e} ({futures[future]})")

3.5 结果

# Input format: "<url> <mode>".
#   mode 1 - a single video
#   mode 2 - a multi-part video (episodes selected with ?p=<n>)
#   mode 3 - a collection (space.bilibili.com/<mid>/channel/collectiondetail?sid=<id>)
url_model = "https://space.bilibili.com/471303350/channel/collectiondetail?sid=1278346 3"
value = url_model.split(' ')
url = value[0]
model = value[1]

if model == "1":
    videoDownload1(url)
    print("下载完成")
elif model == "2":
    # All episodes share the page URL and differ only in the p parameter, e.g.
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=1
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=2
    # getUrls2 reads the episode count from the page and builds these URLs.
    urls = getUrls2(url)
    # Number files from 1 so that they match the p=<n> of each episode.
    for index, episode_url in enumerate(urls, start=1):
        videoDownload2(episode_url, index)
    print("下载完成")
elif model == "3":
    # The videos of a collection have unrelated URLs and the collection page
    # loads its list dynamically, so the list is read from the JSON API:
    #   https://space.bilibili.com/107762251/channel/collectiondetail?sid=877119
    #   -> https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=107762251&season_id=877119&sort_reverse=false&page_num=1&page_size=30
    # The first number is the uploader id, the second the collection id; the
    # rest of the API URL is fixed.
    urls, name = getUrls3(url)
    for index, video_url in enumerate(urls, start=1):
        videoDownload3(video_url, index, name)

那切里做展示,有些合集下载时候有点bug,还没找到问题,可以下载,但是保存路径有点问题,应该是和命令行冲突了,我就不改了
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

3.6 合集视频更新

之前部分合集会显示下载成功,但文件夹里没有任何文件。原因是有些合集名称含有特殊符号,拼进 ffmpeg 命令后命令无法正常执行。因此这里改为手动指定合集名称后再下载,并加上了多线程,代码如下。
拿视频链接的

# 获取网页源码
def getUrls3(url):
    """List the video URLs and the name of a bilibili collection.

    The first two digit runs in *url* are taken as the uploader id (``mid``)
    and the collection id (``season_id``). The ``seasons_archives_list`` API
    is then paged through, 30 entries per request, and every ``bvid`` is
    turned into a video page URL.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the API is requested anonymously.

    Args:
        url: Collection page URL, e.g.
            https://space.bilibili.com/<mid>/channel/collectiondetail?sid=<season_id>

    Returns:
        tuple[list[str], str]: the video page URLs in API order, and the
        collection name.

    Raises:
        ValueError: if *url* does not contain two numeric ids.
        requests.HTTPError: if the API answers with an error status.
    """
    numbers = re.findall(r'\d+', url)
    if len(numbers) < 2:
        raise ValueError(f"cannot find uploader id and collection id in URL: {url!r}")
    mid, season_id = numbers[0], numbers[1]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie = os.environ.get('BILIBILI_COOKIE')
    if cookie:
        headers['Cookie'] = cookie

    def fetch_page(page_num):
        """Return the ``data`` object of one API result page."""
        api_url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={page_num}&page_size=30"
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()["data"]

    # The first page also carries the totals and the collection name.
    first_page = fetch_page(1)
    total = int(first_page["page"]["total"])
    page_size = int(first_page["page"]["page_size"])
    name = first_page["meta"]["name"]

    # Ceiling division: no extra empty page when total is an exact multiple.
    page_count = -(-total // page_size)

    urls = []
    for page_num in range(1, page_count + 1):
        # Page 1 is already in hand; do not request it a second time.
        data = first_page if page_num == 1 else fetch_page(page_num)
        for archive in data["archives"] or []:
            bvid = archive["bvid"]
            urls.append(f"https://www.bilibili.com/video/{bvid}/")
    return urls, name

下载视频的

import requests
import os
from lxml import etree
import re


def videoDownload3(url_, index, name):
    """Download one video of a collection into ./video/<name>/<index>.<title>.mp4.

    The video page embeds a ``window.__playinfo__`` script that lists a
    video-only and an audio-only stream. Both are streamed to a temporary
    directory and merged (stream copy, no re-encode) with the bundled ffmpeg.

    An optional login cookie is taken from the ``BILIBILI_COOKIE`` environment
    variable; without it the page is requested anonymously.

    Args:
        url_: URL of the video page.
        index: Position of the video in the collection; prefixes the file name.
        name: Collection name; used (sanitised) as the output folder name.

    Raises:
        IndexError: if the page has no ``<title>`` or no playinfo stream URLs.
        requests.HTTPError: if a media stream request is rejected.
        subprocess.CalledProcessError: if ffmpeg fails to merge the streams.
    """
    import subprocess
    import tempfile

    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Credentials must not live in source code; read them from the environment.
    cookie_ = os.environ.get('BILIBILI_COOKIE')
    if cookie_:
        headers_['Cookie'] = cookie_

    # Fetch and parse the video page.
    response_ = requests.get(url_, headers=headers_, timeout=30)
    html_obj = etree.HTML(response_.text)

    # The page title looks like "<video title>_哔哩哔哩..."; keep the first part.
    res_ = html_obj.xpath('//title/text()')[0]
    matched = re.findall(r'(.*?)_哔哩哔哩', res_)
    title_ = matched[0] if matched else res_
    # Drop characters that are illegal in a Windows file name, plus
    # whitespace and '&'.
    title_ = re.sub(r'[\\/:*?"<>|&\s]', '', title_) or 'video'
    # The collection name becomes a directory, so it must be a legal path
    # component as well.
    folder_name = re.sub(r'[\\/:*?"<>|]', '', name).strip() or 'video'

    # The playinfo script holds the URLs of the separate video/audio streams.
    url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
    video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
    audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

    # The media requests are sent with the video page as Referer.
    media_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Referer': url_,
    }

    folder_path = f"./video/{folder_name}"
    os.makedirs(folder_path, exist_ok=True)  # also creates ./video; safe across threads
    output_path = f"{folder_path}/{index}.{title_}.mp4"

    ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
    # A private temporary directory per call keeps concurrent downloads of
    # equally-titled videos from clobbering each other, and is removed even
    # when a download or the merge fails.
    with tempfile.TemporaryDirectory(dir='.') as tmp_dir:
        video_path = os.path.join(tmp_dir, 'video.mp4')
        audio_path = os.path.join(tmp_dir, 'audio.mp3')
        for stream_url, stream_path in ((video_url, video_path), (audio_url, audio_path)):
            # Stream to disk in 1 MiB chunks instead of buffering the whole file.
            with requests.get(stream_url, headers=media_headers, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                with open(stream_path, 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

        # Argument list (no shell): special characters in the title or
        # collection name cannot break the command. -y overwrites an earlier
        # download instead of waiting on ffmpeg's interactive prompt.
        subprocess.run(
            [ffmpeg_path, '-y', '-loglevel', 'error',
             '-i', audio_path, '-i', video_path,
             '-c', 'copy', output_path],
            check=True,
        )

    print(f'{title_}  下载完成')

多线程

import concurrent.futures
import requests# 定义一个下载函数
def download_video(URL):
    """Run one download task described by a packed "<url> <index> <name>" string.

    Only the first two spaces separate fields, so the collection name may
    itself contain spaces.

    Raises:
        ValueError: if *URL* does not contain all three fields.
    """
    url, index, name = URL.split(" ", 2)
    videoDownload3(url, index, name)


def THREAD(URLS):
    """Run ``download_video`` for every entry of *URLS* on a pool of 10 threads.

    A failing task does not stop the others: its exception is printed together
    with the task string it belongs to, and the remaining tasks continue.

    Args:
        URLS: Iterable of packed "<url> <index> <name>" task strings.
    """
    max_workers = 10  # number of concurrent downloads; tune as needed
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which task each future belongs to, for error reporting.
        futures = {executor.submit(download_video, URL): URL for URL in URLS}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # re-raises whatever the task raised
            except Exception as e:
                print(f"An error occurred: {e} ({futures[future]})")

执行


# Input format: "<url> <mode>".
#   mode 1 - a single video
#   mode 2 - a multi-part video (episodes selected with ?p=<n>)
#   mode 3 - a collection (space.bilibili.com/<mid>/channel/collectiondetail?sid=<id>)
url_model = "https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285 3"
value = url_model.split(' ')
url = value[0]
model = value[1]

if model == "1":
    videoDownload1(url)
    print("下载完成")
elif model == "2":
    # All episodes share the page URL and differ only in the p parameter, e.g.
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=1
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=2
    # getUrls2 reads the episode count from the page and builds these URLs.
    urls = getUrls2(url)
    # Number files from 1 so that they match the p=<n> of each episode.
    for index, episode_url in enumerate(urls, start=1):
        videoDownload2(episode_url, index)
    print("下载完成")
elif model == "3":
    # The videos of a collection have unrelated URLs and the collection page
    # loads its list dynamically, so the list is read from the JSON API:
    #   https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285
    #   -> https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=389199842&season_id=1275285&sort_reverse=false&page_num=1&page_size=30
    # The first number is the uploader id, the second the collection id; the
    # rest of the API URL is fixed.
    urls, name = getUrls3(url)
    # The folder name is set by hand here instead of using the collection
    # name returned by the API.
    name = "qml项目"
    # Each task is packed as "<url> <index> <name>" for download_video.
    URLS = [
        video_url + " " + str(index) + " " + name
        for index, video_url in enumerate(urls, start=1)
    ]
    THREAD(URLS)
    print("全部下载完成!!!")

在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

四.参考

http://t.csdn.cn/6Pt7v 想下载B站视频却不知如何下手?一文教你爬B站!

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.hqwc.cn/news/93685.html

如若内容造成侵权/违法违规/事实不符,请联系编程知识网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

ChatGPT总结(持续更新)

目录 体验渠道 weTab CSDN-AI助手 其他插件 ChatGPT简介 ChatGPT主要用途 ChatGPT发展历程 GPT-4架构的特点和优势 ChatGPT的工作原理 神经网络和自然语言处理技术 Transformer模型 模型训练优化技巧 ChatGPT对程序员的帮助 与ChatGPT交互和提问技巧 ChatGPT未来…

go语言--锁

锁的基础,go的锁是构建在原子操作和信号锁之上的 原子锁 原子包实现协程的对同一个数据的操作,可以实现原子操作,只能用于简单变量的简单操作,可以把多个操作变成一个操作 sema锁 也叫信号量锁/信号锁 核心是一个uint32值&#…

38、springboot为 spring mvc 提供的静态资源管理,覆盖和添加静态资源目录

springboot为 spring mvc 提供的静态资源管理 ★ Spring Boot为Spring MVC提供了默认的静态资源管理: ▲ 默认的四个静态资源目录: /META-INF/resources > /resources > /static > /public ▲ ResourceProperties.java类的源代码&#xff0…

结合OB Cloud区别于MySQL的4大特性,规划降本方案

任何一家企业想要获得持续性的发展与盈利,“降本增效”都是难以绕开的命题。但是“一刀切”的降本影响往往不太可控,成本的快速收缩往往会给业务带来低效运营和增长缓慢的风险。所以我们所说的降本,是指在成本降低的同时,效率不降…

Python中 re.compile 函数的使用

前言 嗨喽,大家好呀~这里是爱看美女的茜茜呐 以下介绍在python的re模块中怎样应用正则表达式 👇 👇 👇 更多精彩机密、教程,尽在下方,赶紧点击了解吧~ python源码、视频教程、插件安装教程、资料我都准备…

QtConcurrent和QFuture的使用

在Qt中,有时候我们会遇到这样一种情况,需要执行一个很长时间的操作,这时候我们的主界面就会卡住。我们的通常做法就是把这个很长时间的操作扔到线程里去处理,可以使用标准库中的线程也可以使用QThread。 如果我们要在这个很长时间…

Citespace、vosviewer、R语言的文献计量学 、SCI

文献计量学是指用数学和统计学的方法,定量地分析一切知识载体的交叉科学。它是集数学、统计学、文献学为一体,注重量化的综合性知识体系。特别是,信息可视化技术手段和方法的运用,可直观的展示主题的研究发展历程、研究现状、研究…

实现不同局域网间的文件共享和端口映射,使用Python自带的HTTP服务

文章目录 1. 前言2. 本地文件服务器搭建2.1 python的安装和设置2.2 cpolar的安装和注册 3. 本地文件服务器的发布3.1 Cpolar云端设置3.2 Cpolar本地设置 4. 公网访问测试5. 结语 1. 前言 数据共享作为和连接作为互联网的基础应用,不仅在商业和办公场景有广泛的应用…

Pytorch-以数字识别更好地入门深度学习

目录 一、数据介绍 二、下载数据 三、可视化数据 四、模型构建 五、模型训练 六、模型预测 一、数据介绍 MNIST数据集是深度学习入门的经典案例,因为它具有以下优点: 1. 数据量小,计算速度快。MNIST数据集包含60000个训练样本和1000…

【OpenCV入门】第六部分——腐蚀与膨胀

文章结构 腐蚀膨胀开运算闭运算形态学方法梯度运算顶帽运算黑帽运算 腐蚀 腐蚀操作可以让图像沿着自己的边界向内收缩。OpenCV通过”核“来实现收缩计算。“核”在形态学中可以理解为”由n个像素组成的像素块“,像素块包含一个核心(通常在中央位置&…

HTTP介绍:一文了解什么是HTTP

前言: 在当今数字时代,互联网已经成为人们生活中不可或缺的一部分。无论是浏览网页、发送电子邮件还是在线购物,我们都离不开超文本传输协议(HTTP)。HTTP作为一种通信协议,扮演着连接客户端和服务器的重要角…

excel绘制直方图

Excel 2016直方图使用指南 excel绘制各种曲线十分方便,可以通过代码将计算的数据输出到excel里面,然后通过excel的插入标签,绘制各种需要的曲线。 对于直方图,横坐标是分布区间,纵坐标是这个区间内数值的频数&#x…