|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import ast
import os
import random
import re
import urllib.request

import pymysql
from bs4 import BeautifulSoup
def get_proxy():
    """Read every proxy row from the `scraping` database and return a list.

    Each row's second column is expected to hold a proxy spec string such as
    "{'http': '1.2.3.4:8080'}" -- TODO confirm against the PROXYS_LIST schema.

    Returns:
        list[str]: the proxy spec strings, one per table row.
    """
    # NOTE(review): credentials are hard-coded; move to config for real use.
    conn = pymysql.connect(host="127.0.0.1", user="root",
                           passwd="123456", db="mysql")
    try:
        # Context-managed cursor + finally ensure the connection is released
        # even if the query raises (the original leaked both on error).
        with conn.cursor() as cur:
            cur.execute("USE scraping")
            # Kept verbatim: "WHERE ID" filters rows with a truthy ID, which
            # is presumably the author's intent (skip ID = 0 placeholder rows).
            cur.execute("SELECT * FROM PROXYS_LIST WHERE ID")
            proxys_list = [row[1] for row in cur.fetchall()]
    finally:
        conn.close()
    return proxys_list
def choice_proxy():
    """Pick a random proxy from the database and install it as the global
    urllib opener, with a browser User-Agent header attached.

    Side effects: replaces the process-wide opener via
    urllib.request.install_opener(), and prints the chosen proxy.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    proxy_temp = random.choice(get_proxy())
    # SECURITY: the proxy spec is a dict-literal string read from the DB.
    # ast.literal_eval parses only Python literals, unlike eval(), which would
    # execute arbitrary code if the table were ever tampered with.
    proxy_support = urllib.request.ProxyHandler(ast.literal_eval(proxy_temp))
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    print(proxy_temp)
def get_content(url, max_retries=None, timeout=30):
    """Fetch *url* through a randomly chosen proxy, rotating on failure.

    Args:
        url: the URL to download.
        max_retries: number of proxy rotations to attempt before giving up;
            None (the default) retries forever, matching the original behavior.
        timeout: per-request socket timeout in seconds, so a dead proxy cannot
            hang the crawl indefinitely (the original had no timeout).

    Returns:
        bytes: the raw response body.

    Raises:
        RuntimeError: when max_retries is set and every attempt failed.
    """
    choice_proxy()
    attempts = 0
    while True:
        try:
            response = urllib.request.urlopen(url, timeout=timeout)
        except Exception:
            print("失败")
            attempts += 1
            # Bounded retries prevent the original infinite loop when the
            # target is down or every proxy in the pool is dead.
            if max_retries is not None and attempts >= max_retries:
                raise RuntimeError("all proxy attempts failed for %s" % url)
            choice_proxy()
            continue
        else:
            print("成功")
            print("读取完成")
            return response.read()
def get_out_url():
    """Scrape the shaofu index page and populate the module-level globals
    href_list / title_list with each gallery's link and title.

    Side effects: rebinds the globals consumed by go().
    """
    global href_list, title_list
    href_list = []
    title_list = []
    # Bug fix: the original literal ended with a trailing space, which makes
    # urllib request "shaofu.html%20" (or fail) instead of the real page.
    out_url = "http://www.symmz.com/shaofu.html"
    html = BeautifulSoup(get_content(out_url), "html.parser")
    for block in html.findAll("div", {"class": "title"}):
        # Bug fix: findAll's third positional argument is `recursive`; the
        # original passed the title filter there, so it was silently ignored.
        # Both attribute filters now live in a single attrs dict.
        anchors = block.findAll("a", {"href": re.compile(r"/shaofu/.+"),
                                      "title": re.compile(r".+")})
        for a in anchors:
            href_list.append(a["href"])
            title_list.append(a["title"])
def download(in_url, title):
    """Download every gallery image found on *in_url* into the folder named
    after *title*, printing progress as it goes.
    """
    html = BeautifulSoup(get_content(in_url), "html.parser")
    # Bug fix: the original pattern used unescaped '.' (matches any character)
    # and a non-raw "\.jpg" escape (DeprecationWarning in modern Python).
    pattern = re.compile(r"http://img\.symmz\.com/img/.+\.jpg$")
    for n, img in enumerate(html.findAll("img", {"src": pattern}), start=1):
        src = img["src"]
        print(src)
        print("第%s张" % n)
        # Path layout kept byte-compatible with create_dir()'s folder.
        pic_name = "F://Python36//项目//图片//" + title + "//" + src.split("/")[-1]
        urllib.request.urlretrieve(src, pic_name)
def go():
    """Walk the scraped (href, title) pairs produced by get_out_url():
    create each gallery's folder, then fetch pages 1-10 of that gallery.
    """
    for href, title in zip(href_list, title_list):
        create_dir(title)
        gallery_base = href.split("-")[0]
        for page in range(1, 11):
            page_url = "http://www.symmz.com" + gallery_base + "-" + str(page) + ".html"
            print(page_url)
            download(page_url, title)
def create_dir(title, base="F://Python36//项目//图片//"):
    """Create the download folder for *title*.

    Args:
        title: gallery title used as the folder name.
        base: parent directory; defaults to the original hard-coded path so
            existing callers are unaffected.

    exist_ok=True makes re-running the crawl over an already-fetched gallery
    a no-op instead of crashing with FileExistsError (the original crashed).
    """
    os.makedirs(os.path.join(base, title), exist_ok=True)

# Run the crawl only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    get_out_url()
    go()
复制代码
http://bbs.fishc.com/thread-102905-1-1.html 这个是写如何从西刺爬取代理并存入数据库的方法 |
|