|
发表于 2017-4-19 16:27:08
|
显示全部楼层
本帖最后由 zlj19931010 于 2017-4-19 16:29 编辑
我发现第1-10页和第10页之后返回的数据不一样
然后1-10页返回的数据我要用红色代码处理，为什么用这个 <div id="plist".+?<div class="page clearfix"> 正则取出来的数据是空？
import re
import os
import urllib.request
def fetchData(url, timeout=30):
    """Open *url* with a desktop-browser User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait on a blocking network operation before
            giving up. Without a limit, a server that stops answering
            would stall the caller indefinitely.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The
        caller is responsible for reading it and closing it.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request cannot be completed.
    """
    req = urllib.request.Request(url)
    # Send a browser User-Agent in place of urllib's default one.
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    )
    return urllib.request.urlopen(req, timeout=timeout)
def getImageUrl(pageUrl):
    """Return the product-image addresses found on one listing page.

    Args:
        pageUrl: URL of the listing page to download and scan.

    Returns:
        A list of image addresses with the leading ``//`` (and anything
        before it) removed, e.g. ``"host/path/pic.jpg"``. The list is empty
        when the page has no ``<div id="plist"`` section.

    Raises:
        UnicodeDecodeError: If the page body is not valid UTF-8.
    """
    # Close the HTTP response as soon as the body has been read.
    with fetchData(pageUrl) as response:
        html = response.read().decode("utf-8")
    return _extract_image_urls(html)


def _extract_image_urls(html):
    """Extract scheme-less ``.jpg`` addresses from the product-list markup."""
    # The earlier pattern '<div id="plist".+?<div class="page clearfix">'
    # came back empty because '.' does not match a newline unless
    # re.DOTALL is set, and the two markers sit on different lines.
    # Plain string partitioning needs no flag and also copes with a
    # missing marker instead of raising IndexError.
    _, start_marker, after_start = html.partition('<div id="plist"')
    if not start_marker:
        return []
    product_list = after_start.partition('<div class="page clearfix">')[0]

    img_pattern = re.compile(
        r'<img width="220" height="220" data-img="1".+?\.jpg')
    images = []
    for tag in img_pattern.findall(product_list):
        # Keep everything after the first '//' so an address that happens
        # to contain a second '//' is not cut short.
        _, slashes, address = tag.partition('//')
        if slashes:
            images.append(address)
    return images
def down_jd(url, save_path):
    """Download every product image listed on *url* into *save_path*.

    Files are written as ``jd_phone_0.jpg``, ``jd_phone_1.jpg``, ... in the
    order the images appear on the page.

    Args:
        url: Listing page whose images should be downloaded.
        save_path: Directory to store the images in; created if missing.
    """
    imageUrls = getImageUrl(url)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)
    # Build full target paths instead of calling os.chdir(): changing the
    # process working directory would make a relative save_path on a later
    # call resolve inside the previous call's folder.
    for index, address in enumerate(imageUrls):
        with fetchData("http://%s" % address) as response:
            image_bytes = response.read()
        target = os.path.join(save_path, 'jd_phone_%d.jpg' % index)
        with open(target, 'wb') as f:
            f.write(image_bytes)
# Fetch listing pages 1 through 4, storing each page's images in its own folder.
for page_no in range(1, 5):
    down_jd(
        'https://list.jd.com/list.html?cat=9987,653,655&page=' + str(page_no),
        "d://zlj//jd_phone//page%d" % page_no,
    )
- import re
- import os
- import urllib.request
- def fetchData(url):
- req = urllib.request.Request(url)
- req.add_header('User-Agent',
- 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36')
- response = urllib.request.urlopen(req)
- return response
- def getImageUrl(pageUrl):
- images = []
- response = fetchData(pageUrl)
- html = response.read().decode("utf-8")
- p1 = '<div id="plist".+?<div class="page clearfix">'
- req1 = re.compile(p1).findall(html)
- if len(req1) < 1:
- # 这边很奇怪,明明有<div id="plist"和<div class="page clearfix">就是不能匹配,只能这样曲线救国了
- req1 = html.split('<div id="plist"')[1]
- req1 = req1.split('<div class="page clearfix">')[0]
- else:
- req1 = req1[0]
- p2 = '<img width="220" height="220" data-img="1".+?\.jpg'
- imgs = re.compile(p2).findall(req1)
- for each in imgs:
- # 打印出//之后的数据
- images.append(each.split('//')[1])
- return images
- def down_jd(url,save_path):
- imageUrls = getImageUrl(url)
- # 指定文件夹不存在就创建文件夹
- if not os.path.exists(save_path):
- os.makedirs(save_path)
- os.chdir(save_path)
- i = 0
- for each in imageUrls:
- each = "http://%s" % each
- image_one = fetchData(each).read()
- image_name = 'jd_phone_%d.jpg' % i
- i += 1;
- with open(image_name, 'wb') as f:
- f.write(image_one)
- for i in range(1,5):
- url = 'https://list.jd.com/list.html?cat=9987,653,655&page=' + str(i)
- down_jd(url, "d://zlj//jd_phone//page%d" % i)
复制代码
|
|