|
发表于 2017-4-15 16:10:29
|
显示全部楼层
我自己写了个程序,可以获取一些免费的代理IP,要不楼主看看?
- from urllib.request import *
- #url = 'http://www.xicidaili.com'
def _read_cell(text, opening, start):
    """Return ``(content, end)`` for the next ``opening ... </td>`` cell.

    The search begins at index ``start``. ``end`` is the index of the closing
    ``</td>``, suitable as the start position for the next search. Returns
    ``None`` when either the opening tag or the closing tag is not found.
    """
    open_at = text.find(opening, start)
    if open_at == -1:
        return None
    content_from = open_at + len(opening)
    close_at = text.find('</td>', content_from)
    if close_at == -1:
        return None
    return text[content_from:close_at], close_at


def get(url='http://www.xicidaili.com'):
    """Fetch a proxy listing page and return the proxies found in it.

    Args:
        url: Address of the page to download. The page is decoded as UTF-8.

    Returns:
        A dict ``ip_dict`` in which
        ``ip_dict[ip] = [port, place, anonymity, form, live_time, update_time]``;
        every value is the raw text found between the table-cell tags.

    Side effects:
        Prints the number of rows that were parsed.

    Raises:
        Whatever ``urlopen`` raises for an unreachable URL, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # Every proxy row is located by the flag image cell that precedes it.
    row_marker = '<img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>'
    # Opening tags of the seven cells after the marker, in page order:
    # ip, port, place, anonymity, form, live_time, update_time.
    cell_openings = (
        '<td>',
        '<td>',
        '<td>',
        '<td class="country">',
        '<td>',
        '<td>',
        '<td>',
    )

    request = Request(url)
    request.add_header("User-Agent",'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    # Close the connection as soon as the body has been read.
    with urlopen(request) as reply:
        response = reply.read().decode('utf-8')

    ip_dict = {}
    count = 0
    the_num = response.find(row_marker)

    while the_num != -1:
        cursor = the_num
        fields = []
        for opening in cell_openings:
            cell = _read_cell(response, opening, cursor)
            if cell is None:
                break
            value, cursor = cell
            fields.append(value)

        if len(fields) != len(cell_openings):
            # A tag is missing between here and the end of the page, so no
            # later row can be complete either. Stop instead of storing
            # slices taken from bogus offsets.
            break

        ip, *details = fields
        ip_dict[ip] = details
        count += 1
        # Look for the next row after the last cell that was consumed.
        the_num = response.find(row_marker, cursor)

    print('一共找到%d个' % count)
    return ip_dict
if __name__ == '__main__':
    # Script entry point: download the default listing page once and bind
    # the parsed result to the module-level name ip_dict.
    ip_dict = get()
-
复制代码 |
|