|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 结果咧 于 2018-3-20 14:08 编辑
import requests
import os
# Work inside the 'saving1' directory; os.chdir raises if it does not exist,
# so the directory has to be created before the script is run.
os.chdir('saving1')
def getpage(url_old):
    """Return the text of the first ``<span class="on">`` element as a string.

    The page at ``url_old`` is downloaded and the text between
    ``<span class="on">`` and the following ``</span>`` is returned, so
    page numbers with more than one digit are kept whole.

    Args:
        url_old: URL of the page that carries the pagination marker.

    Returns:
        The marker text with surrounding whitespace removed, e.g. ``'12'``.

    Raises:
        ValueError: if the page has no complete ``<span class="on">`` element.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    response = requests.get(url_old, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text

    marker = '<span class="on">'
    start = html.find(marker)
    if start == -1:
        raise ValueError('no <span class="on"> element found at ' + url_old)
    start += len(marker)
    # Read up to the closing tag instead of taking a single character.
    end = html.find('</span>', start)
    if end == -1:
        raise ValueError('unterminated <span class="on"> element at ' + url_old)
    return html[start:end].strip()
def getpictureurl(url):
    """Collect the picture URLs found on the page at ``url``.

    Every ``jpgsrc="..."`` attribute value is appended to the module-level
    ``pictureurljpg_list`` and every ``gifsrc="..."`` value to the
    module-level ``pictureurlgif_list``. Nothing is returned.

    The whole quoted attribute value is taken, whatever its file extension,
    so a ``jpgsrc`` that points at a ``.png`` is still collected (and later
    saved under a ``.jpg`` name).

    Args:
        url: URL of the listing page to scan.
    """
    # Local import: this script's import block does not bring in ``re``.
    import re

    headers = {'user-agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text

    # With a single capture group, findall returns just the attribute values.
    pictureurljpg_list.extend(re.findall(r'jpgsrc="([^"]*)"', html))
    pictureurlgif_list.extend(re.findall(r'gifsrc="([^"]*)"', html))
def savepicture(pictureurljpg_list, pictureurlgif_list, page):
    """Download the given pictures into a sub-directory named ``page``.

    JPG URLs are saved as ``1.jpg``, ``2.jpg``, ... and GIF URLs as
    ``1.gif``, ``2.gif``, ... in the order they appear in the lists.

    Args:
        pictureurljpg_list: URLs to store with a ``.jpg`` file name.
        pictureurlgif_list: URLs to store with a ``.gif`` file name.
        page: name of the directory (relative to the current working
            directory) that receives the files.
    """
    # exist_ok lets the script be rerun without failing on an existing folder.
    os.makedirs(page, exist_ok=True)
    headers = {'user-agent': 'Mozilla/5.0'}
    batches = (('.jpg', pictureurljpg_list), ('.gif', pictureurlgif_list))
    for extension, urls in batches:
        for index, pictureurl in enumerate(urls, start=1):
            # Fetch before opening the file so a failed request leaves no
            # empty file behind.
            response = requests.get(url=pictureurl, headers=headers)
            # Writing through a joined path keeps the working directory
            # untouched, even when a download raises midway.
            target = os.path.join(page, str(index) + extension)
            with open(target, 'wb') as f:
                f.write(response.content)
if __name__ == '__main__':
    # The entered number is the last page index to process; the starting
    # page is always handled once, even if it is already past that number.
    last_page = int(input('打印几页:'))
    pictureurljpg_list = []
    pictureurlgif_list = []
    page = getpage('https://www.pengfu.com/')
    keep_going = True
    while keep_going:
        # getpictureurl fills the two module-level lists for this page.
        getpictureurl('https://www.pengfu.com/index_' + page + '.html')
        savepicture(pictureurljpg_list, pictureurlgif_list, page)
        # Rebind fresh lists so the next page starts empty.
        pictureurljpg_list, pictureurlgif_list = [], []
        next_page = int(page) + 1
        page = str(next_page)
        keep_going = next_page <= last_page
我自己觉得挺挫的
以下是利用 bs4(BeautifulSoup)实现的版本;正则表达式还在学习中,之后再发。
import requests
import os
import re
from bs4 import BeautifulSoup
# Work inside the 'saving2' directory; os.chdir raises if it does not exist,
# so the directory has to be created before the script is run.
os.chdir('saving2')
def getpage(url_old):
    """Return the text of the first ``<span>`` whose only attribute is class "on".

    Args:
        url_old: URL of the page that carries the pagination marker.

    Returns:
        The ``.string`` of the first matching span.

    Raises:
        ValueError: if no span on the page has exactly ``class="on"``.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    response = requests.get(url_old, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    for span in soup.find_all('span'):
        # Exact comparison: the span must carry class "on" and nothing else.
        if span.attrs == {'class': ['on']}:
            return span.string
    # Fail with a clear message rather than hitting an unbound local.
    raise ValueError('no <span class="on"> element found at ' + url_old)
def getpictureurl(url):
    """Collect ``jpgsrc`` / ``gifsrc`` attribute values from every ``<img>``.

    Values are appended to the module-level ``pictureurljpg_list`` and
    ``pictureurlgif_list`` in document order. Nothing is returned.

    Args:
        url: URL of the listing page to scan.
    """
    reply = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    reply.encoding = reply.apparent_encoding
    soup = BeautifulSoup(reply.text, 'html.parser')
    images = soup.find_all('img')
    pictureurljpg_list.extend(
        tag.attrs['jpgsrc'] for tag in images if 'jpgsrc' in tag.attrs
    )
    pictureurlgif_list.extend(
        tag.attrs['gifsrc'] for tag in images if 'gifsrc' in tag.attrs
    )
def savepicture(pictureurljpg_list, pictureurlgif_list, page):
    """Download the given pictures into a sub-directory named ``page``.

    JPG URLs are saved as ``1.jpg``, ``2.jpg``, ... and GIF URLs as
    ``1.gif``, ``2.gif``, ... in the order they appear in the lists.

    Args:
        pictureurljpg_list: URLs to store with a ``.jpg`` file name.
        pictureurlgif_list: URLs to store with a ``.gif`` file name.
        page: name of the directory (relative to the current working
            directory) that receives the files.
    """
    # exist_ok lets the script be rerun without failing on an existing folder.
    os.makedirs(page, exist_ok=True)
    headers = {'user-agent': 'Mozilla/5.0'}
    batches = (('.jpg', pictureurljpg_list), ('.gif', pictureurlgif_list))
    for extension, urls in batches:
        for index, pictureurl in enumerate(urls, start=1):
            # Fetch before opening the file so a failed request leaves no
            # empty file behind.
            response = requests.get(url=pictureurl, headers=headers)
            # Writing through a joined path keeps the working directory
            # untouched, even when a download raises midway.
            target = os.path.join(page, str(index) + extension)
            with open(target, 'wb') as f:
                f.write(response.content)
if __name__ == '__main__':
    # The entered number is the last page index to process; the starting
    # page is always handled once, even if it is already past that number.
    last_page = int(input('打印几页:'))
    pictureurljpg_list = []
    pictureurlgif_list = []
    page = getpage('https://www.pengfu.com/')
    keep_going = True
    while keep_going:
        # getpictureurl fills the two module-level lists for this page.
        getpictureurl('https://www.pengfu.com/index_' + page + '.html')
        savepicture(pictureurljpg_list, pictureurlgif_list, page)
        # Rebind fresh lists so the next page starts empty.
        pictureurljpg_list, pictureurlgif_list = [], []
        next_page = int(page) + 1
        page = str(next_page)
        keep_going = next_page <= last_page
刚学完正则表达式,下面是用正则表达式实现的版本:
import requests
import re
import os
# Work inside the 'saving3' directory; os.chdir raises if it does not exist,
# so the directory has to be created before the script is run.
os.chdir('saving3')
def getpage(url_old):
    """Return the page number inside the first ``<span class="on">`` element.

    Args:
        url_old: URL of the page that carries the pagination marker.

    Returns:
        The page number as a string of digits without a leading zero.

    Raises:
        ValueError: if the page has no ``<span class="on">N</span>`` element.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    response = requests.get(url_old, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    # The capture group yields the digits directly, with no offset slicing.
    match = re.search(r'<span class="on">([1-9]\d*)</span>', html)
    if match is None:
        raise ValueError('no <span class="on"> page marker found at ' + url_old)
    return match.group(1)
def getpictureurl(url):
    """Collect the picture URLs found on the page at ``url``.

    Every ``jpgsrc="..."`` attribute value is appended to the module-level
    ``pictureurljpg_list`` and every ``gifsrc="..."`` value to the
    module-level ``pictureurlgif_list``. Nothing is returned.

    Args:
        url: URL of the listing page to scan.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    # ``[^"]*`` stops at the attribute's own closing quote; a greedy ``.*``
    # would run on to the last quote on the same line. The capture group
    # makes findall return the bare URL, so no index arithmetic is needed.
    pictureurljpg_list.extend(re.findall(r'jpgsrc="([^"]*)"', html))
    pictureurlgif_list.extend(re.findall(r'gifsrc="([^"]*)"', html))
def savepicture(pictureurljpg_list, pictureurlgif_list, page):
    """Download the given pictures into a sub-directory named ``page``.

    JPG URLs are saved as ``1.jpg``, ``2.jpg``, ... and GIF URLs as
    ``1.gif``, ``2.gif``, ... in the order they appear in the lists.

    Args:
        pictureurljpg_list: URLs to store with a ``.jpg`` file name.
        pictureurlgif_list: URLs to store with a ``.gif`` file name.
        page: name of the directory (relative to the current working
            directory) that receives the files.
    """
    # exist_ok lets the script be rerun without failing on an existing folder.
    os.makedirs(page, exist_ok=True)
    headers = {'user-agent': 'Mozilla/5.0'}
    batches = (('.jpg', pictureurljpg_list), ('.gif', pictureurlgif_list))
    for extension, urls in batches:
        for index, pictureurl in enumerate(urls, start=1):
            # Fetch before opening the file so a failed request leaves no
            # empty file behind.
            response = requests.get(url=pictureurl, headers=headers)
            # Writing through a joined path keeps the working directory
            # untouched, even when a download raises midway.
            target = os.path.join(page, str(index) + extension)
            with open(target, 'wb') as f:
                f.write(response.content)
if __name__ == '__main__':
    # The entered number is the last page index to process; the starting
    # page is always handled once, even if it is already past that number.
    last_page = int(input('打印几页:'))
    pictureurljpg_list = []
    pictureurlgif_list = []
    page = getpage('https://www.pengfu.com/')
    keep_going = True
    while keep_going:
        # getpictureurl fills the two module-level lists for this page.
        getpictureurl('https://www.pengfu.com/index_' + page + '.html')
        savepicture(pictureurljpg_list, pictureurlgif_list, page)
        # Rebind fresh lists so the next page starts empty.
        pictureurljpg_list, pictureurlgif_list = [], []
        next_page = int(page) + 1
        page = str(next_page)
        keep_going = next_page <= last_page
今天学了 XPath,果然方便!下面是用 XPath 实现的版本:
import requests
import os
from lxml import etree
import extract
# Work inside the 'saving4' directory; os.chdir raises if it does not exist,
# so the directory has to be created before the script is run.
os.chdir('saving4')
def getpage(url_old):
    """Return the first text node selected by the fixed pagination XPath.

    Args:
        url_old: URL of the page that carries the pagination marker.

    Returns:
        The first XPath result; an IndexError is raised if there is none.
    """
    reply = requests.get(url_old, headers={'user-agent': 'Mozilla/5.0'})
    reply.encoding = reply.apparent_encoding
    tree = etree.HTML(reply.text)
    # Absolute path into the document; it depends on the exact page layout.
    matches = tree.xpath('/html/body/div[1]/div[1]/div[13]/div/span[1]/text()')
    return matches[0]
def getpictureurl(url):
    """Return the ``jpgsrc`` and ``gifsrc`` attribute values found at ``url``.

    Args:
        url: URL of the listing page to scan.

    Returns:
        A ``(jpg_urls, gif_urls)`` tuple holding the two XPath results.
    """
    reply = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    reply.encoding = reply.apparent_encoding
    tree = etree.HTML(reply.text)
    jpg_urls = tree.xpath('//*[@id]/dl/dd/div[2]/img/@jpgsrc')
    gif_urls = tree.xpath('//*[@id]/dl/dd/div[2]/img/@gifsrc')
    return jpg_urls, gif_urls
def savepicture(pictureurljpg_list, pictureurlgif_list, page):
    """Download the given pictures into a sub-directory named ``page``.

    JPG URLs are saved as ``1.jpg``, ``2.jpg``, ... and GIF URLs as
    ``1.gif``, ``2.gif``, ... in the order they appear in the lists.

    Args:
        pictureurljpg_list: URLs to store with a ``.jpg`` file name.
        pictureurlgif_list: URLs to store with a ``.gif`` file name.
        page: name of the directory (relative to the current working
            directory) that receives the files.
    """
    # exist_ok lets the script be rerun without failing on an existing folder.
    os.makedirs(page, exist_ok=True)
    headers = {'user-agent': 'Mozilla/5.0'}
    batches = (('.jpg', pictureurljpg_list), ('.gif', pictureurlgif_list))
    for extension, urls in batches:
        for index, pictureurl in enumerate(urls, start=1):
            # Fetch before opening the file so a failed request leaves no
            # empty file behind.
            response = requests.get(url=pictureurl, headers=headers)
            # Writing through a joined path keeps the working directory
            # untouched, even when a download raises midway.
            target = os.path.join(page, str(index) + extension)
            with open(target, 'wb') as f:
                f.write(response.content)
if __name__ == '__main__':
    # The entered number is the last page index to process; the starting
    # page is always handled once, even if it is already past that number.
    last_page = int(input('打印几页:'))
    jpg_list = []
    gif_list = []
    page = getpage('https://www.pengfu.com/')
    keep_going = True
    while keep_going:
        jpg_list, gif_list = getpictureurl(
            'https://www.pengfu.com/index_' + page + '.html'
        )
        savepicture(jpg_list, gif_list, page)
        next_page = int(page) + 1
        page = str(next_page)
        keep_going = next_page <= last_page
|
-
-
|