这是一个单线程的例子:获取图片
import os
import re

import requests

# Single-threaded version: fetch listing pages one by one and save every image.

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    )
}

count = 0
wenjian = input("你的照片将要储存到......文件夹:")
img_path = f"./{wenjian}/"  # target directory for the downloaded images

# BUG FIX: the original put the whole download loop in the `else` branch,
# so when the directory did not yet exist it was created and nothing was
# downloaded. Create the directory if needed, then always download.
if not os.path.exists(img_path):
    print("您没有这个文件为您新建一个文件:")
    os.mkdir(img_path)

for i in range(1, 5):
    # Page 1 has no numeric suffix; later pages embed the page number.
    if i == 1:
        url = "https://sc.chinaz.com/tupian/nvshengtupian.html"
    else:
        url = f"https://sc.chinaz.com/tupian/nvshengtupian_{i}.html"

    response = requests.get(url, headers=HEADERS)
    response.encoding = "utf-8"
    page_text = response.text

    # Image URLs are lazy-loaded through the data-original attribute.
    img_urls = re.findall(r'data-original="(.*?)"', page_text)
    for img in img_urls:
        img = "https:" + img  # the page omits the URL scheme
        count += 1
        myimg = requests.get(img)
        file_name = f"{img_path}图片{count}.jpg"
        # Images are binary, hence "wb"; `with` guarantees the handle is
        # closed (the original leaked one handle per image).
        with open(file_name, "wb") as f:
            f.write(myimg.content)
        print("正在保存" + str(count) + " 张图片")
不用说速度超级慢。
所以采用多线程。
import queue
import random
import re
import threading
import time

import requests

# Multi-threaded version: worker threads pull page URLs from a shared queue,
# scrape (image_url, image_name) pairs, and append them to one result file.

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    )
}

# Serializes file writes so lines from different workers cannot interleave.
_WRITE_LOCK = threading.Lock()


def get_main(url):
    """Fetch one listing page and return a list of (image_url, image_name)."""
    response = requests.get(url, headers=HEADERS)
    response.encoding = "utf-8"
    page_text = response.text
    # Lazy-loaded image URLs; the page omits the scheme, so prepend "https:".
    img_urls = ["https:" + src for src in re.findall(r'data-original="(.*?)"', page_text)]
    img_names = re.findall(r'alt="(.*?)"', page_text)
    # zip pairs each URL with its alt text directly; the dict round-trip in
    # the original added nothing and could silently drop duplicate URLs.
    return list(zip(img_urls, img_names))


def do_craw(url_queue: queue.Queue, fout):
    """Worker: consume URLs until the queue is empty, writing results to fout."""
    while True:
        try:
            # BUG FIX: the original used a blocking get() in an endless loop,
            # so every worker hung forever once the queue drained and the
            # program never exited. get_nowait + Empty lets workers finish.
            url = url_queue.get_nowait()
        except queue.Empty:
            return
        for pair in get_main(url):  # `pair` — the original shadowed builtin `list`
            with _WRITE_LOCK:
                fout.write(str(pair) + "\n")
        time.sleep(random.randint(1, 2))  # polite crawl delay


if __name__ == "__main__":
    urls = {
        f"https://sc.chinaz.com/tupian/nvshengtupian_{i}.html"
        for i in range(2, 16)
    }
    url_queue = queue.Queue()
    for url in urls:
        url_queue.put(url)

    # `with` guarantees the result file is flushed and closed (the original
    # never closed it); explicit encoding avoids platform-dependent defaults.
    with open("02.zhanzhang.txt", "w", encoding="utf-8") as fout:
        # 14 workers — one per queued page.
        workers = [
            threading.Thread(target=do_craw, args=(url_queue, fout))
            for _ in range(14)
        ]
        for t in workers:
            t.start()
        for t in workers:
            t.join()
这个的意思是我开了14个线程,因为我只有14个要保存的页面嘛,所以14个就可以了。
要是想要更快,在获取网址上面加入线程。
这个是例子,要看上一篇我写的,就会对线程有初步了解。我写了几个例子加深理解。
最重要的是使用queue模块,使各个线程共享数据。
如果想要下载图片,再加一个线程,用来get图片的网址,然后下载。
一定得到的是jpg网址哦!!!