89ip代理爬取代码实现
一、代码实现
import requests
import time
import random
from fake_useragent import UserAgent
from lxml import etree
import os
import csv


class IPSipder(object):
    """89ip proxy scraper.

    Walks the paginated proxy table on www.89ip.cn, turns every row into a
    ``requests``-style proxies dict, probes each proxy against httpbin.org
    and appends the ones that answer with HTTP 200 to a CSV file.

    Attributes:
        url: Listing URL template; ``{}`` is filled with the page number.
        headers: Request headers carrying a random User-Agent.
        count: Running number of proxies that passed ``check_ip``.
    """

    def __init__(self):
        self.url = "https://www.89ip.cn/index_{}.html"
        self.headers = {'User-Agent': UserAgent().random}
        # Number of usable proxies found so far.
        self.count = 0

    def get_html(self, url):
        """Download one listing page and return its table rows.

        Args:
            url: Fully formatted listing page URL.

        Returns:
            The list of elements matched by ``//tbody/tr``; an empty list
            when the page cannot be fetched or yields no parsed document.
        """
        try:
            # A timeout keeps one stalled connection from blocking the run.
            response = requests.get(url=url, headers=self.headers, timeout=10)
        except requests.RequestException as e:
            # Best effort: report and let the caller move on to the next page.
            print('页面请求失败:', url, e.args)
            return []
        document = etree.HTML(response.text)
        if document is None:
            # Guard in case the parser produced no root element.
            return []
        return document.xpath('//tbody/tr')

    def parser_html(self, tr_list):
        """Extract ip and port from table rows.

        Args:
            tr_list: Row elements as returned by ``get_html``; each must
                support ``xpath('./td/text()')``.

        Returns:
            A list of ``{"http": ..., "https": ...}`` dicts, one per row that
            has at least two text cells (ip first, port second). Rows with
            fewer cells are skipped.
        """
        proxies_list = []
        for tr in tr_list:
            # Evaluate the XPath once per row instead of once per field.
            cells = tr.xpath('./td/text()')
            if len(cells) < 2:
                continue
            ip = cells[0].strip()
            port = cells[1].strip()
            # Dict layout matches the ``proxies=`` argument of requests.
            # NOTE(review): requests interprets the scheme of a proxy URL as
            # the protocol spoken to the proxy itself. If these are plain
            # HTTP proxies, the "https" entry probably needs an http:// URL,
            # which may be why the HTTPS check mentioned in check_ip failed.
            # Confirm before changing: the saved CSV content depends on it.
            proxies_list.append({
                "http": f"http://{ip}:{port}",
                "https": f"https://{ip}:{port}",
            })
        return proxies_list

    def save_ip(self, proxy, save_filename):
        """Append usable proxies to ``<Desktop>/<save_filename>.csv``.

        Saving is optional; comment out the call in ``run`` to skip it.

        Args:
            proxy: List of proxies dicts with the keys ``http`` and ``https``.
            save_filename: File name without the ``.csv`` extension.

        Returns:
            None. Nothing is written when ``proxy`` is empty. File-system and
            CSV errors are printed rather than raised.
        """
        if not proxy:
            return
        try:
            # Target the desktop of the logged-in user (Windows layout).
            save_path = "c:/Users/" + os.getlogin() + "/Desktop/"
            save_file = save_path + save_filename
            print('保存位置:', save_file + '.csv')
            # newline='' lets the csv module control line endings; without it
            # every record is followed by a blank line on Windows.
            with open(save_file + ".csv", 'a', encoding='utf-8', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=['http', 'https'])
                writer.writerows(proxy)
        except (OSError, csv.Error) as e:
            print(e.args)

    def check_ip(self, proxies_list):
        """Probe every proxy and keep the ones that work.

        Each proxy is used for a GET on http://httpbin.org/ with a 3 second
        timeout. ``self.count`` is incremented for every proxy that answers
        with status 200. Responses with any other status are dropped without
        a message.

        The original author noted that probing https://www.baidu.com/ instead
        always failed, for a reason they could not determine.

        Args:
            proxies_list: Proxies dicts as produced by ``parser_html``.

        Returns:
            The sub-list of ``proxies_list`` that passed the probe.
        """
        use_proxy = []
        for ip in proxies_list:
            try:
                response = requests.get(url="http://httpbin.org/", headers=self.headers,
                                        proxies=ip, timeout=3)
            except Exception:
                # Deliberately broad: any failure while talking through an
                # untrusted proxy simply marks that proxy as unusable.
                print('当前检测ip', ip, '请求超时,检测不合格')
                continue
            if response.status_code == 200:
                # Collect usable proxies for later use or saving.
                use_proxy.append(ip)
                self.count += 1
                print('当前检测ip', ip, '检测可用')
        return use_proxy

    def run(self):
        """Prompt for a page range and file name, then scrape, check, save."""
        begin = int(input("请输入要抓取的开始页:"))
        end = int(input("请输入要抓取的终止页:"))
        filename = input("请输入保存文件名称:")
        for page in range(begin, end + 1):
            print(f"#################抓取第{page}页################################")
            # Build the URL of the current page.
            url = self.url.format(page)
            # Fetch all table rows of the page.
            tr_list = self.get_html(url)
            # Turn the rows into proxies dicts.
            proxies_list = self.parser_html(tr_list)
            # Keep only the proxies that respond.
            usable = self.check_ip(proxies_list)
            # Persist the usable proxies; comment out to skip saving.
            self.save_ip(usable, filename)
            # Sleep a random 2-3 seconds between pages.
            time.sleep(random.randint(2, 3))


if __name__ == "__main__":
    spider = IPSipder()
    spider.run()
    print(f'共统计到有效ip{spider.count}个!')
二、代码运行
请输入要抓取的开始页:2
请输入要抓取的终止页:2
请输入保存文件名称:proxy-ip
#################抓取第2页################################
当前检测ip {'http': 'http://139.196.151.191:9999', 'https': 'https://139.196.151.191:9999'} 检测可用
当前检测ip {'http': 'http://114.102.45.39:8089', 'https': 'https://114.102.45.39:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.231.46.231:8089', 'https': 'https://114.231.46.231:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://124.71.157.181:8020', 'https': 'https://124.71.157.181:8020'} 检测可用
当前检测ip {'http': 'http://121.40.137.141:80', 'https': 'https://121.40.137.141:80'} 请求超时,检测不合格
当前检测ip {'http': 'http://117.69.232.45:8089', 'https': 'https://117.69.232.45:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.102.45.89:8089', 'https': 'https://114.102.45.89:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://115.29.148.215:8999', 'https': 'https://115.29.148.215:8999'} 检测可用
当前检测ip {'http': 'http://120.46.197.14:8083', 'https': 'https://120.46.197.14:8083'} 检测可用
当前检测ip {'http': 'http://113.223.215.128:8089', 'https': 'https://113.223.215.128:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://112.124.2.212:20000', 'https': 'https://112.124.2.212:20000'} 检测可用
当前检测ip {'http': 'http://114.102.47.164:8089', 'https': 'https://114.102.47.164:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://117.69.154.91:41122', 'https': 'https://117.69.154.91:41122'} 请求超时,检测不合格
当前检测ip {'http': 'http://123.182.59.167:8089', 'https': 'https://123.182.59.167:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://223.215.176.74:8089', 'https': 'https://223.215.176.74:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.231.105.68:8089', 'https': 'https://114.231.105.68:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://121.43.34.143:80', 'https': 'https://121.43.34.143:80'} 请求超时,检测不合格
当前检测ip {'http': 'http://121.40.109.183:80', 'https': 'https://121.40.109.183:80'} 请求超时,检测不合格
当前检测ip {'http': 'http://116.63.130.30:7890', 'https': 'https://116.63.130.30:7890'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.102.44.113:8089', 'https': 'https://114.102.44.113:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://116.63.130.30:443', 'https': 'https://116.63.130.30:443'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.231.46.160:8089', 'https': 'https://114.231.46.160:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://183.164.243.29:8089', 'https': 'https://183.164.243.29:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://114.102.44.137:8089', 'https': 'https://114.102.44.137:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://117.57.93.63:8089', 'https': 'https://117.57.93.63:8089'} 请求超时,检测不合格
当前检测ip {'http': 'http://159.226.227.90:80', 'https': 'https://159.226.227.90:80'} 请求超时,检测不合格
当前检测ip {'http': 'http://159.226.227.99:80', 'https': 'https://159.226.227.99:80'} 请求超时,检测不合格
当前检测ip {'http': 'http://183.164.243.44:8089', 'https': 'https://183.164.243.44:8089'} 请求超时,检测不合格
保存位置: c:/Users/qwy/Desktop/proxy-ip.csv
共统计到有效ip5个!
三、说明
1. 保存到 c:/Users/qwy/Desktop/proxy-ip.csv 的文件内容如下: