[已解决]爬某网站妹子图老是报错

waitforlove · 发表于 2018-2-19 21:42:37

import urllib.request as q
import os,re
import urllib.error
def readurl(url):
head={}
head['User-Agent']='Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)'
req=q.Request(url,headers=head)
try:
res=q.urlopen(url)
except urllib.error.HTTPError as cwu:
return cwu.code
except urllib.error.URLError as ucwu:
return ucwu.code
html=res.read()
return html
def xiazai():
for j in range(1,445):
if j<10:
motenum='00'+str(j)
elif 10<=j<100:
motenum='0'+str(j)
else:
motenum=str(j)
url=r'http://www.ugirls.com/Content/List/Magazine-'+ motenum+'.html'#模特总共从001到444
html=readurl(url)
if type(html)==type(int()):
if 400<=html<=599:
print('网页错误无法下载%d'%html)
continue
else:
html=html.decode('utf-8')
#获取模特名字
uname=re.findall(r'<meta name="keywords" content="[\u4e00-\u9fa5]{2,5}、ugirls ([\u4e00-\u9fa5]{2,3})" />',html)
if len(uname)==0:
uname='大集合'
else:
uname=str(uname[0])
print('第%d模特是%s'%(j,uname))
dqian=os.getcwd()
try:
os.mkdir('ugirl')
except OSError:
os.chdir('ugirl')
else:
os.chdir('ugirl')
try:
os.mkdir(uname)
except OSError:
os.chdir(uname)
else:
os.chdir(uname)
str1=len('05c9b11a01aebf2a2af18a030ffea553')
str2=len('_magazine_web_m.jpg')
imglist=re.findall(r'http://img.ugirls.tv/uploads/magazine/content/[^"]+_magazine_web_m\.jpg',html)
for each in imglist:
with open(each.split('/')[-1][:str1]+'.jpg','wb') as f:
f.write(readurl(each[:-str2]+'.jpg'))
print(each[:-str2]+'.jpg',end='下载完成\n')
print('******************************************************************')
os.chdir(dqian)
xiazai()

复制代码

寂寞的兄弟有福利了

最佳答案

月排行榜 / 总排行榜

°蓝鲤歌蓝

2018-2-19 21:42:38

res=q.urlopen(url)

复制代码

这一行应该是

res=q.urlopen(req)

复制代码

跳转到最佳答案楼层

°蓝鲤歌蓝 · 发表于 2018-2-19 21:42:38

res=q.urlopen(url)

复制代码

这一行应该是

res=q.urlopen(req)

复制代码

waitforlove · 发表于 2018-2-19 22:28:13

°蓝鲤歌蓝发表于 2018-2-19 22:15
这一行应该是

可以运行，就是隔一段时间报错一次，明天照你说的地方改改再，现在睡觉了

枫树霜雪 · 发表于 2018-2-20 00:03:37

楼上说的req的确是一个问题，如果直接用不用req伪装user-agent，他会直接拒绝你的请求的，但是单单加上一个请求头，过一段时间他一样会根据你的IP地址拒绝你的访问，所以你需要使用代理iP,具体的用法小甲鱼有教程的，免费的IP可以去哪些代理网站上爬取

綉氣 · 发表于 2018-2-20 08:33:22

我也经常遇到这种问题，要不就加代理，要不就用最简单的办法
把请求的这段换成这样

def readurl(url):
head={}
head['User-Agent']='Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)'
req=q.Request(url,headers=head)
try:
res=q.urlopen(url)
except Exception:
readurl(url)
html=res.read()
return html

复制代码

遇到错误就重新请求一次

账号		自动登录	找回密码
密码			立即注册