|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import urllib.request
import re
import time
def open_url(url):
    """Fetch *url* with browser-like request headers and return the raw body.

    Args:
        url: Address to request; handed unchanged to
            ``urllib.request.Request``.

    Returns:
        The complete response body as ``bytes`` (no decoding is done here).

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``) propagates to the caller.
    """
    headers = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response (and its socket) even when
    # read() raises; previously the response object was never closed.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the page number displayed as ``[N]`` in the HTML at *url*.

    The page is downloaded with ``open_url``, decoded as UTF-8 and searched
    for the first ``>[digits]<`` marker.

    Args:
        url: Address of the page to inspect.

    Returns:
        The digits found between the brackets, as a ``str``.

    Raises:
        ValueError: if the page contains no ``>[N]<`` marker.
    """
    html = open_url(url).decode('utf-8')
    # Raw string: "\[" and "\d" are regex escapes, not string escapes.  The
    # capture group replaces slicing the surrounding ">[" and "]<" off by hand.
    marker = re.search(r'>\[(\d+)\]<', html)
    if marker is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no ">[N]<" page marker found at %s' % url)
    return marker.group(1)
def get_imgs(url):
    """Find the ``<img src="http….jpg"`` fragments in the page at *url*.

    The page is downloaded with ``open_url`` and decoded as UTF-8.

    Args:
        url: Address of the page to scan.

    Returns:
        An iterator of ``re.Match`` objects; ``match.group()`` is the text
        from ``<img src="`` up to and including the closing quote of a
        ``src`` value that ends in ``.jpg``.
    """
    html = open_url(url).decode('utf-8')
    # ``[^"]+`` keeps every match inside a single quoted attribute.  The
    # previous greedy ``.+`` ran on to the last ``.jpg"`` of the line, so
    # several tags on one line were merged into one bogus match.
    return re.finditer(r'<img src="http[^"]+\.jpg"', html)
def save_imgs(imgs):
    """Download the images referenced by *imgs* into the working directory.

    Args:
        imgs: Iterable of ``re.Match`` objects whose ``group()`` is an
            ``<img src="…"`` fragment.

    Every address inside a fragment that starts with ``http://ww`` and ends
    in ``jpg`` is fetched and written to ``妹子图<n>.jpg``; fragments without
    such an address are only printed.

    Note:
        Numbering restarts at 0 on every call, so a second call made in the
        same directory overwrites the files written by the first one.
    """
    # Compiled once, outside the loop.  ``[^"]+`` stops at the closing quote
    # of the attribute, so a fragment that happens to contain several tags
    # yields each address separately instead of one over-long bogus URL.
    addr_pattern = re.compile(r'http://ww[^"]+jpg')
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    }
    n = 0
    for match in imgs:
        tag = match.group()
        print(tag)
        for found in addr_pattern.finditer(tag):
            img_url = found.group()
            req = urllib.request.Request(img_url, headers=headers)
            # Close the HTTP response deterministically, and finish the
            # download before creating the file so that a failed transfer
            # does not leave an empty image behind.
            with urllib.request.urlopen(req) as photo:
                print('打开%s' % img_url)
                data = photo.read()
            with open('妹子图%d.jpg' % n, 'wb') as f:
                f.write(data)
            n += 1
            print('正在保存%d' % n)
def main_save(url, pages=1):
    """Crawl the numbered comment pages under *url* and save their images.

    Args:
        url: Base address; each page is requested as
            ``<url>/page-<k>#comments``.
        pages: Accepted for compatibility, but its value is discarded: the
            page count is always the number returned by ``get_page(url)``.

    With N being that number, pages N-1 down to 0 are visited in that order,
    with a one-second pause after each page.

    NOTE(review): page N itself is never requested while page 0 is, and the
    ``pages`` argument has no effect - confirm whether both are intended.
    """
    total = int(get_page(url))
    print(total)
    for page_no in reversed(range(total)):
        page_url = ''.join((url, '/page-', str(page_no), '#comments'))
        print('打开%s' % page_no)
        print(page_url)
        save_imgs(get_imgs(page_url))
        # Pause between pages so requests are not sent back to back.
        time.sleep(1)
# Install a process-wide opener so that every later urllib.request.urlopen()
# call for an http:// address goes through this fixed HTTP proxy.
proxy_handler = urllib.request.ProxyHandler({'http': '58.208.132.112:808'})
urllib.request.install_opener(urllib.request.build_opener(proxy_handler))

# Start the crawl at the site's image board.
main_save('http://jandan.net/ooxx')
但我发现爬到第2页时,获取某一张图片的地址会出错,正在改正,尝试学完正则表达式后修复 |
|