As the title says.
I've finished all the exercises in the Python beginner course.
In the videos, 老甲鱼 suggested writing a crawler that automatically grabs proxy addresses, so I gave it a try; the code is at the end.
It's mainly intended to supply proxies for use with other tools.
I'm a complete beginner, so please go easy on me, veterans.
If you have any questions or suggestions, please leave a comment. I look forward to your replies.
import urllib.request
import urllib.error
import urllib.parse
import chardet
import gzip
import json
import re
from bs4 import BeautifulSoup


def get_proxylist(proxy_info=None):
    '''
    get_proxylist() fetches proxy IP addresses from http://www.gatherproxy.com/
    and returns the proxy information as a list; each element is a dictionary.

    proxy_info is an optional proxy used to hide your real IP address, in case
    you cannot connect to http://www.gatherproxy.com or you get blocked by the
    server. The parameter format is "ip:port", e.g. "192.168.1.1:8080".

    Each dictionary can be accessed with the following keywords.
    Important keywords:
        "PROXY_COUNTRY"  --> proxy location
        "PROXY_IP"       --> proxy server IP
        "PROXY_PORT"     --> needs converting from hex to decimal (now converted automatically)
        "PROXY_STATUS"   --> OK is the default status
        "PROXY_UPTIMELD" --> max capacity and how many people are using this proxy
    '''
    url = 'http://www.gatherproxy.com/'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.281 Safari/537.36'
    }
    try:
        req = urllib.request.Request(url=url, headers=header)
        if proxy_info:
            # route the request through the given proxy to hide the real IP
            proxy_support = urllib.request.ProxyHandler({'http': proxy_info})
            opener = urllib.request.build_opener(proxy_support)
        else:
            opener = urllib.request.build_opener()
        response = opener.open(req)
        html = response.read()

        # detect the page encoding; treat GB2312 as GBK since GBK is a superset
        encode = chardet.detect(html)['encoding']
        #print('Detected encoding: %s\n' % encode)
        if encode == 'GB2312':
            encode = 'GBK'

        # if no encoding is detected, the response may be gzip-compressed
        #if not encode:
        #    html = gzip.decompress(html)
        #    encode = chardet.detect(html)['encoding']
        #    print('Detected encoding: %s\n' % encode)

        html = html.decode(encode)
        soup = BeautifulSoup(html, 'html.parser')
        proxy_list = list()

        # each proxy entry is embedded in a <script> block as a gp.insertPrx({...})
        # call; extract the JSON argument of every such call
        proxy_js = soup.find_all(type="text/javascript")
        for each_js in proxy_js:
            tmp = re.search(r'gp\.insertPrx\((\{.+\})\)', each_js.text)
            if tmp:
                tmp = json.loads(tmp.group(1))
                # the port is given as a hex string, convert it to decimal
                port_hex = tmp.get("PROXY_PORT")
                tmp["PROXY_PORT"] = int(port_hex, 16)
                proxy_list.append(tmp)

        return proxy_list

    except ConnectionResetError as e:
        print('Your IP address has been blocked by this server! Please try again with the proxy_info parameter filled.\n')
        print('Error information is: %s' % e)
    except urllib.error.URLError as e:
        print('Failed to connect to the proxy IP! Please try again with new proxy information.\n')
        print('Error information is: %s' % e)
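
If you want to try it out, here is a minimal usage sketch (not part of the original code, just an illustration; the field names follow the docstring above, and the retry-through-a-proxy call is commented out because it only makes sense once your own IP is blocked):

if __name__ == '__main__':
    proxies = get_proxylist()
    if proxies:
        # print the first few proxies as ip:port plus location and status
        for proxy in proxies[:5]:
            print('%s:%s (%s, status %s)' % (proxy['PROXY_IP'], proxy['PROXY_PORT'],
                                             proxy['PROXY_COUNTRY'], proxy['PROXY_STATUS']))
        # if gatherproxy.com later blocks your IP, retry through a proxy you already collected:
        # get_proxylist(proxy_info='%s:%s' % (proxy['PROXY_IP'], proxy['PROXY_PORT']))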