|
楼主 |
发表于 2018-1-5 17:55:09
|
显示全部楼层
# 1. Scrape multi-page product comments from Tmall's rate API.
# Product page: https://detail.tmall.com/item.htm?id=555502261542
# Rate API:     https://rate.tmall.com/list_detail_rate.htm?itemId=...&sellerId=...&currentPage=N
import random
import re
import time

import pandas as pd
from pandas import DataFrame

# NOTE: the original URL contained "¤tPage" — an HTML-entity mangling of
# "&currentPage" — so the page number was never actually sent. Fixed here.
RATE_URL = ('https://rate.tmall.com/list_detail_rate.htm'
            '?itemId=555502261542&sellerId=813836783&currentPage={page}')

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}

# Random sleep intervals (seconds) between requests, to look less bot-like.
SLEEP_CHOICES = [6, 8, 11, 15, 16, 18, 22]

# HTTP proxies to rotate through (host:port).  These are public proxies and
# may be dead — TODO: verify they are still reachable before a long run.
PROXIES = [
    '221.225.186.63:3128',
    '219.244.186.30:3128',
    '122.72.18.34:80',
    '58.220.95.107:8080',
    '116.31.75.100:3128',
]


def extract_rate_json(text):
    """Return the JSON fragment between '"rateList":' and ',"searchinfo'.

    Returns None when the marker is absent (e.g. an anti-bot/captcha page),
    instead of raising IndexError like the original ``re.findall(...)[0]``.
    """
    match = re.search(r'rateList":(.*?),"searchinfo', text)
    return match.group(1) if match else None


def fetch_page(page):
    """GET one page of comments through a random proxy; return the body text.

    The original code installed a urllib opener (proxy + user-agent) and then
    called ``requests.get`` — which ignores urllib openers entirely, so the
    proxy and headers were never applied.  Here both are passed to requests.
    """
    import requests  # local import: only needed when actually scraping

    proxy = random.choice(PROXIES)
    response = requests.get(
        RATE_URL.format(page=page),
        headers=HEADERS,
        proxies={'http': 'http://' + proxy},
        timeout=30,
    )
    response.raise_for_status()
    return response.text


def scrape_comments(pages=99, out_path='datapj.xls'):
    """Scrape ``pages`` pages of comments and save them to ``out_path``.

    Pages that fail (network error, dead proxy, anti-bot page) are skipped
    with a message rather than aborting the whole run.  Returns the combined
    DataFrame.
    """
    frames = []
    for page in range(1, pages + 1):
        try:
            body = fetch_page(page)
        except Exception as exc:  # best-effort: log and move on to next page
            print('page %d: request failed (%s), skipping' % (page, exc))
            continue

        payload = extract_rate_json(body)
        if payload is None:
            print('page %d: no rateList in response (blocked?), skipping' % page)
        else:
            frames.append(pd.read_json(payload))

        # Throttle between requests regardless of outcome.
        time.sleep(random.choice(SLEEP_CHOICES))

    # Concatenate once at the end (O(n)) instead of growing a DataFrame
    # inside the loop (quadratic copying).
    result = pd.concat(frames, axis=0, ignore_index=True) if frames else DataFrame()
    result.to_excel(out_path)
    return result


if __name__ == '__main__':
    scrape_comments()
复制代码
这是我对本贴问题的完善。我写的这段代码只能爬取几十页就被封了,请问如何进一步优化修改?我想爬取全部 99 页的评论,大神请赐教,所有鱼币奉上! |
|