The only method I'm using at the moment is proxy IPs: crawl the IPs off a free-proxy site first, then use them for the requests. But the usable rate is far too low and the speed isn't great either -- is there a better way to handle this?
Fellow FishC members~~ (with the proxy success rate this low I still can't crawl the complete data set; I'm still testing. One idea, sketched right below, is to pre-check each proxy before it goes into the queue.)
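A rough, untested sketch of that pre-check idea: each scraped proxy gets one quick test request before it is queued, so the worker threads only ever see addresses that have answered at least once. The test URL, the 3-second timeout and the function names (check_proxy, filter_proxies) are all placeholders of mine, not part of the code further down; one thread per proxy is naive but tolerable for a few hundred addresses.

import queue
import threading
import urllib.request

TEST_URL = 'http://www.douban.com/'   # placeholder target, used only for the liveness check

def check_proxy(addr, checked_queue):
    """Try one request through the proxy; keep the address only if it answers."""
    handler = urllib.request.ProxyHandler({'http': addr})
    opener = urllib.request.build_opener(handler)
    try:
        opener.open(TEST_URL, timeout=3).read()
        checked_queue.put(addr)        # proxy answered, keep it
    except Exception:
        pass                           # dead or too slow, drop it

def filter_proxies(raw_addrs):
    """Check all scraped proxies concurrently, return a queue of the usable ones."""
    checked = queue.Queue()
    threads = [threading.Thread(target=check_proxy, args=(addr, checked))
               for addr in raw_addrs]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return checked

get_proxy() below could then collect the scraped addresses into a plain list and seed q_proxy from filter_proxies() instead of putting every address in directly.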
The code below mainly uses queues and threads, implementing something like the producer & consumer problem.
It's ugly and I only half understand the principles behind it, so if you have any optimisation ideas please share them -- discussion is the real driver of progress.
# coding: utf-8
__author__ = 'Reed'
import urllib.request
import queue
import threading
from bs4 import BeautifulSoup
from pandas import Series
# import mysql.connector
from io import BytesIO
import gzip
import numpy
'''
class Desc(threading.Thread):
    def __init__(self, q_url, q_proxy):
        threading.Thread.__init__(self)
        self.q_url = q_url
        self.q_proxy = q_proxy
        self.flag = True

    def run(self):
        while 1:
            if not self.q_proxy.empty() and not self.q_url.empty():
                url = self.q_url.get()
                rqt = urllib.request.Request(
                    url=url,
                    headers={'User-Agent': 'Mozilla/5.0'})
                while 1:
                    if self.flag and not self.q_proxy.empty():
                        addr = self.q_proxy.get()
                        print(addr, self.name, url)
                        proxy_handler = urllib.request.ProxyHandler({'http': addr})
                        opener = urllib.request.build_opener(proxy_handler)
                    try:
                        page = opener.open(rqt, timeout=4).read().decode('utf-8')
                        print('%s read DESCOK!' % self.name)
                        self.flag = False
                        break
                    except:
                        print('%s read DESCFALSE' % self.name)
                        self.flag = True
                        q_proxy.task_done()
                        continue
                soup = BeautifulSoup(page)
                info = soup.select('#info')[0].text.encode('gbk', 'ignore').decode('gbk').replace(' ', '').replace('/', '').split('\n')
                infor = list(filter(lambda x: x, info))
                dct = dict()
                for i in range(len(infor)):
                    if infor[i].endswith(':'):
                        dct[infor[i][:-1]] = infor[i+1]
                    elif ':' in infor[i]:
                        k = infor[i].split(':')[0]
                        dct[k] = ''.join(infor[i].split(':')[1:])
                rate = soup.select('.rating_wrap')[0].select('strong')[0].text.replace(' ', '').replace('\n', '')
                raters = soup.select('.rating_wrap')[0].select('a')[0].text[:-3]
                title = soup.title.text.encode('gbk', 'ignore').decode('gbk')[:-5]
                dct['评分'] = rate
                dct['评分人数'] = raters
                dct['书名'] = title
                data = Series(dct, index=['书名', '作者', '译者', '评分', '评分人数', '定价', '出版社', '出版年', '页数', 'ISBN', '装帧', '副标题', '丛书', '原作名'])
                desc_book = list(data.replace(numpy.nan, ''))
                q_desc.put(desc_book)
                q_url.task_done()
'''
# The Desc above (commented out) shared one proxy IP queue with Book. Everything else ran at normal speed,
# only Desc was much slower -- possibly the two classes competing for the same resource, I guess.
# The Desc below uses no proxy and hits the site directly; speed went back to normal as expected, and then... the IP got banned.
# Splitting the proxy IP queue in two should help (free proxies -- the usable rate is shockingly low).
# Pull the book information out of each book's detail page and format the data
class Desc(threading.Thread):
    def __init__(self, q_url):
        threading.Thread.__init__(self)
        self.q_url = q_url
        self.flag = True

    def run(self):
        while 1:
            if not self.q_url.empty():
                url = self.q_url.get()
                rqt = urllib.request.Request(
                    url=url,
                    headers={'User-Agent': 'Mozilla/5.0'})
                while 1:
                    opener = urllib.request.build_opener()
                    try:
                        page = opener.open(rqt, timeout=4).read().decode('utf-8')
                        self.flag = False
                        break
                    except:
                        self.flag = True
                        continue
                # Parse the page. There are plenty of pitfalls, e.g. the character '·' cannot be encoded in GBK;
                # storing UTF-8 in the DB and fixing it on retrieval works, but feels awkward.
                # douban's HTML is a bit convoluted, so formatting takes some effort -- hence all the replace() calls.
                soup = BeautifulSoup(page, 'html.parser')
                info = soup.select('#info')[0].text.encode('gbk', 'ignore').decode('gbk').replace(' ', '').replace('/', '').split('\n')
                infor = list(filter(lambda x: x, info))
                # Format the data into a dict, ready for pandas.Series below.
                # Unfortunately, when a book has several translators, not all of them are captured.
                dct = dict()
                for i in range(len(infor)):
                    if infor[i].endswith(':'):
                        dct[infor[i][:-1]] = infor[i+1]
                    elif ':' in infor[i]:
                        k = infor[i].split(':')[0]
                        dct[k] = ''.join(infor[i].split(':')[1:])
                # Grab the rating and the number of raters; the title was missed above, so take it here.
                rate = soup.select('.rating_wrap')[0].select('strong')[0].text.replace(' ', '').replace('\n', '')
                raters = soup.select('.rating_wrap')[0].select('a')[0].text[:-3]
                title = soup.title.text.encode('gbk', 'ignore').decode('gbk')[:-5]
                dct['评分'] = rate
                dct['评分人数'] = raters
                dct['书名'] = title
                # pandas.Series lays out the dict values strictly in index order; missing fields become NaN (numpy.nan), replaced again with ''.
                data = Series(dct, index=['书名', '作者', '译者', '评分', '评分人数', '定价', '出版社', '出版年', '页数', 'ISBN', '装帧', '副标题', '丛书', '原作名'])
                desc_book = list(data.replace(numpy.nan, ''))
                q_desc.put(desc_book)
                self.q_url.task_done()
# Take each listing-page URL from the q_page queue, then collect the URL of every book on that page
class Book(threading.Thread):
    def __init__(self, q_page, q_proxy):
        threading.Thread.__init__(self)
        self.q_page = q_page
        self.q_proxy = q_proxy
        self.sign = True

    def run(self):
        while 1:
            if not self.q_page.empty() and not self.q_proxy.empty():
                page_url = self.q_page.get()
                rqt = urllib.request.Request(
                    url=page_url,
                    headers={'User-Agent': 'Mozilla/5.0'})
                # The inner while ensures that if the proxy works, the next pass switches the URL but keeps the proxy;
                # if the proxy fails, the next pass switches the proxy but keeps the URL.
                while 1:
                    if self.sign and not self.q_proxy.empty():
                        addr = self.q_proxy.get()
                        proxy_handler = urllib.request.ProxyHandler({'http': addr})
                        opener = urllib.request.build_opener(proxy_handler)
                    try:
                        book_page = opener.open(rqt, timeout=4).read().decode('utf-8')
                        self.sign = False
                        break
                    except:
                        self.sign = True
                        self.q_proxy.task_done()
                        continue
                # Crude BeautifulSoup usage
                soup = BeautifulSoup(book_page, 'html.parser')
                for dd in soup.select('dd'):
                    if dd.select('a'):
                        href = dd.select('a')[0].get('href')
                        q_url.put(href)
                self.q_page.task_done()
# Store each formatted record in the DB
class DB(threading.Thread):
    def __init__(self, q_desc):
        threading.Thread.__init__(self)
        self.q_desc = q_desc

    def run(self):
        while 1:
            desc = self.q_desc.get()
            print(self.name)
            # con = mysql.connector.connect(user='root', password='password', host='127.0.0.1', database='douban')
            print('insert into table values %s' % ('/'.join(desc)))
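            # Sketch only: with a real connection, the insert could be parameterised roughly like this.
            # The table name `book` and its 14 columns are made up here; adjust to the actual schema.
            # cur = con.cursor()
            # cur.execute('INSERT INTO book VALUES (%s{})'.format(', %s' * 13), desc)
            # con.commit()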
            self.q_desc.task_done()
# All three classes above run as multiple threads; cooperating through queue.Queue means locking mostly takes care of itself.
# Before the threads that crawl the book information start, scrape the proxy IPs and put them in the queue.
def get_proxy(proxy_url_lst):
    for each_url in proxy_url_lst:
        rqst = urllib.request.Request(each_url)
        rqst.add_header('User-Agent', 'Mozilla/5.0')
        # The response header here comes back with Content-Encoding: gzip, and the fix found via Google is to
        # decompress it manually (urllib does not decompress gzip by itself).
        # Some other sites also send gzip yet need no decompression -- reason unclear.
        try:
            response = urllib.request.urlopen(rqst)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = BytesIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                each_page = f.read()
            else:
                each_page = response.read()
        except urllib.request.URLError as r:
            print(r.reason)
            continue  # skip this page, otherwise each_page would be undefined below
        proxy_soup = BeautifulSoup(each_page, 'html.parser')
        trs = proxy_soup.select('tbody > tr')
        for tr in trs:
            td = tr.find_all('td')
            proxy_addr = td[0].text + ':' + td[1].text
            q_proxy.put(proxy_addr)
def main():
    proxy_url_list = ['http://www.kuaidaili.com/free/intr/' + str(i) for i in range(1, 30)]
    get_proxy(proxy_url_list)
    page_url = ['http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=' + str(n) for n in range(0, 71985, 15)]
    for each in page_url:
        q_page.put(each)
    # 5 threads for each kind of task
    for i in range(5):
        book = Book(q_page, q_proxy)
        book.setDaemon(True)
        book.start()
        desc = Desc(q_url)
        desc.setDaemon(True)
        desc.start()
        db = DB(q_desc)
        db.setDaemon(True)
        db.start()

if __name__ == '__main__':
    q_url = queue.Queue()    # URL of each individual book
    q_page = queue.Queue()   # URL of each listing page (15 books per page), tag: 小说 (fiction)
    q_desc = queue.Queue()   # formatted description of each book (a list)
    q_proxy = queue.Queue()  # IP:port pairs scraped from the free-proxy site, e.g. '127.0.0.1:8888'
    main()
    q_desc.join()
    print('Done!')
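On the note above about splitting the proxy IP queue in two, a minimal, untested sketch of what I have in mind. The names q_proxy_book, q_proxy_desc and seed_proxy_queues are placeholders of mine: Book would take one queue and the proxy-based Desc (the commented-out version at the top) the other, so the two no longer compete for the same proxies.

import queue

q_proxy_book = queue.Queue()   # proxies reserved for the Book threads
q_proxy_desc = queue.Queue()   # proxies reserved for the proxy-based Desc threads

def seed_proxy_queues(proxy_addrs):
    """Push every scraped 'ip:port' string into both queues."""
    for addr in proxy_addrs:
        q_proxy_book.put(addr)
        q_proxy_desc.put(addr)

# get_proxy() would then collect the addresses into a list and call seed_proxy_queues(),
# and the threads would be started as Book(q_page, q_proxy_book) and Desc(q_url, q_proxy_desc).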