一个彩笔的日常...

0%

一个简易的ip代理池

用py写了一个简单的ip代理池,源码如下:

从xici爬取ip

tip: 爬取脚本会在当前目录下自动生成 HTTP_IP.txt、HTTPS_IP.txt 和 socks_IP.txt;验证脚本会把可用的代理写入 enable_ip 目录下的 enable_ip.txt,因此运行验证脚本前需要先在当前目录下创建 enable_ip 文件夹。
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import os

def get_ip_list(url):
    """Download one proxy-listing page and append its rows to per-protocol files.

    Every table row after the first (header) row is written as an
    ``ip,port,type`` line to HTTP_IP.txt, HTTPS_IP.txt or socks_IP.txt in the
    current directory, depending on the text of the protocol column.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        None. Results are written to the files named above.
    """
    headers = {
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
        'Connection': 'Keep-Alive',
        # No explicit Host header: the previous hard-coded value named a
        # different site than the one being requested; requests fills in the
        # correct Host from the URL.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
    }
    web_data = requests.get(url=url, headers=headers)

    # Compare the numeric status code; a membership test on the Response
    # object itself never matches '200', so the warning fired on every call.
    if web_data.status_code != 200:
        print("访问失败,可能是禁了ip,当前访问页面状态码:%s" % (web_data.status_code))

    soup = BeautifulSoup(web_data.text, 'html.parser')
    rows = soup.find_all('tr')

    # Context managers close all three files even if parsing raises.
    # UTF-8 matches the encoding the checker script uses to read these files.
    with open("HTTP_IP.txt", 'a', encoding='utf-8') as fhttp, \
            open("HTTPS_IP.txt", 'a', encoding='utf-8') as fhttps, \
            open("socks_IP.txt", 'a', encoding='utf-8') as fsocks:
        # rows[0] is skipped, exactly as the original index loop did.
        for row in rows[1:]:
            tds = row.find_all('td')
            # Rows without the expected cells (e.g. on an error page) are
            # skipped instead of raising IndexError.
            if len(tds) < 6:
                continue

            ip = str(tds[1].text)
            port = str(tds[2].text)
            protocol = str(tds[5].text)
            print("IP:{}    port:{}    noun:{}".format(ip, port, protocol))
            item = ip + ',' + port + ',' + protocol + '\n'

            # The length check keeps HTTPS rows out of the plain-HTTP file.
            if 'HTTP' in item and len(protocol) == 4:
                fhttp.write(item)
            if 'HTTPS' in item:
                fhttps.write(item)
            if 'socks4/5' in item:
                fsocks.write(item)




if __name__ == "__main__":
    # The output files are opened in append mode, so leftovers from a previous
    # run are deleted first to avoid duplicated entries.
    # (Author's note: opening them with mode 'w' here would make this cleanup
    # unnecessary.)
    for stale_file in ("HTTP_IP.txt", "HTTPS_IP.txt", "socks_IP.txt"):
        if os.path.exists(stale_file):
            os.remove(stale_file)

    # Crawl listing pages 1 through 10 of the proxy site.
    base_url = "https://www.xicidaili.com/nn/"
    for page in range(1, 11):
        get_ip_list(base_url + str(page))

验证ip可用性
# -*- coding: utf-8 -*-

import requests
from queue import Queue
import sys
import threading
import telnetlib


# Shared by all checker threads: one "ip,port,type\n" entry per proxy that
# accepted a TCP connection. It is created once here so that a starting thread
# can no longer throw away results already collected by another thread.
lines = []


class Check_IP(threading.Thread):
    """Worker thread that tests proxies taken from a shared queue.

    Each queue item is an ``ip,port,type`` string. A proxy counts as usable
    when a TCP connection to ``ip:port`` succeeds within 6 seconds; usable
    entries are appended to the module-level ``lines`` list.
    """

    def __init__(self,queue):
        """Store the queue of ``ip,port,type`` strings this worker consumes."""
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        """Consume queue items until the queue is empty, recording usable proxies."""
        # Local imports keep this block self-contained; the file only imports
        # Queue from the queue module and does not import socket.
        import socket
        from queue import Empty

        while True:
            # get_nowait() avoids the empty()/get() race in which another
            # worker takes the last item and this one blocks forever.
            try:
                url = self._queue.get_nowait()
            except Empty:
                return

            parts = url.split(',', 2)
            if len(parts) != 3:
                # Malformed entry: report it as unusable rather than letting
                # the unpacking error kill the thread.
                print("%s不可用" % (url))
                continue
            ip, port = parts[0], parts[1]

            try:
                # Plain TCP connect with the same 6-second limit as before.
                # telnetlib is removed from the standard library in
                # Python 3.13, so the socket module is used directly.
                conn = socket.create_connection((ip, port), timeout=6)
            except (OSError, ValueError):
                print("%s不可用" % (url))
            else:
                conn.close()  # only reachability matters; do not leak the socket
                print("%s可用" % (url))
                lines.append(url + '\n')

    @staticmethod
    def write_enable_ip():
        """Write all usable proxies collected so far to enable_ip/enable_ip.txt.

        The ``enable_ip`` directory is created if it does not exist and the
        file is overwritten on every call.
        """
        import os

        os.makedirs("enable_ip", exist_ok=True)
        # os.path.join yields the original "enable_ip\\enable_ip.txt" on
        # Windows and a valid path on other platforms.
        with open(os.path.join("enable_ip", "enable_ip.txt"), 'w') as fw:
            fw.writelines(lines)


def start(txt,count):
    """Load proxies from a file and check them with a pool of worker threads.

    Args:
        txt: Path of a UTF-8 text file holding one ``ip,port,type`` entry
            per line.
        count: Number of Check_IP worker threads to run; converted with int().

    Returns:
        None. The call returns after every worker thread has finished.
    """
    queue = Queue()

    # The context manager closes the file even if reading fails part-way.
    with open('%s' % txt, 'r', encoding='utf-8') as fr:
        for line in fr:
            entry = line.rstrip('\n')
            # Only well-formed "ip,port,type" lines are queued; blank or
            # malformed lines are skipped instead of raising ValueError.
            if entry.count(',') != 2:
                continue
            queue.put(entry)

    thread_count = int(count)
    threads = [Check_IP(queue) for _ in range(thread_count)]

    for t in threads:
        t.start()

    # Wait for every worker so callers can read the results afterwards.
    for t in threads:
        t.join()

if __name__ == "__main__":
    # txt: file holding the proxies of the type to validate.
    # count: number of checker threads to run.
    txt, count = 'HTTPS_IP.txt', 8
    start(txt=txt, count=count)
    # Persist the proxies that passed the check.
    Check_IP.write_enable_ip()

我是项目地址
password:31ex

Welcome to my other publishing channels