|
发表于 2018-3-14 22:18:02
|
显示全部楼层
有些日子了。我有点忘了。。。。。
当时我这部分代码是这样的(不太好的版本,不能爬所有,会被封):
# Book-information spider for Douban tag listing pages.

# Seconds to wait for a response before giving up; without a timeout
# requests.get() may block forever on a stalled connection.
_REQUEST_TIMEOUT = 10
# The listing URL advances its `start` offset by this many items per page.
_PAGE_SIZE = 20
# Placeholder stored when a text field is absent from the listing.
_MISSING = '暂无'


def _fetch_tag_page(book_tag, page_num, cookies, headers):
    """Download listing page *page_num* (0-based) of *book_tag* and parse it."""
    url = (
        'https://book.douban.com/tag/' + urllib.parse.quote(book_tag)
        + '?start=' + str(page_num * _PAGE_SIZE) + '&type=T'
    )
    res = requests.get(
        url, cookies=cookies, headers=headers, timeout=_REQUEST_TIMEOUT
    )
    return bs4.BeautifulSoup(res.text, 'html.parser')


def _count_pages(soup):
    """Return the page count read from the paginator, or 1 if unavailable."""
    paginator = soup.find('div', attrs={'class': 'paginator'})
    if paginator is None:
        # No paginator on the page: treat the listing as a single page.
        return 1
    links = paginator.findAll('a')
    try:
        # The second-to-last link is read as the highest page number.
        return max(1, int(links[-2].string.strip()))
    except (IndexError, AttributeError, ValueError):
        return 1


def _child_text(parent, name, attrs, default):
    """Return the stripped string of the first matching child, else *default*."""
    if parent is None:
        return default
    node = parent.find(name, attrs=attrs)
    if node is None or node.string is None:
        return default
    return node.string.strip()


def _star_level(rating_box):
    """Derive a star level such as '4.5' from the first classed span.

    The level is taken from the trailing characters of the span's first CSS
    class. '0.0' is returned when the span is missing or when those
    characters are not digits.
    """
    try:
        css_class = rating_box.find('span', attrs={'class': True}).attrs['class'][0]
        if css_class[-1] == '1':
            return css_class[-1]
        whole, tenth = css_class[-2], css_class[-1]
    except (AttributeError, IndexError, KeyError, TypeError):
        return '0.0'
    if not (whole.isdigit() and tenth.isdigit()):
        # The first classed span is not a star indicator.
        return '0.0'
    return whole + '.' + tenth


def _parse_book(book_info):
    """Turn one `subject-item` element into a flat record (list of strings)."""
    # Title and detail-page link come from the same anchor.
    title_link = book_info.find('a', attrs={'title': True})
    book_url = title_link.attrs['href']
    title = title_link.attrs['title']

    # Basic info is one '/'-separated line; the last three fields are kept
    # as publication info and everything before them as author info.
    pub_tag = book_info.find('div', attrs={'class': 'pub'})
    basic_info = pub_tag.get_text().strip() if pub_tag is not None else ''
    basic_info_list = basic_info.split('/')
    author_info = '/'.join(basic_info_list[:-3]) or _MISSING
    pub_info = '/'.join(basic_info_list[-3:]) or _MISSING

    # Rating-related data.
    evaluate_info = book_info.find('div', attrs={'class': 'star clearfix'})
    allstar = _star_level(evaluate_info)
    rating_nums = _child_text(evaluate_info, 'span', {'class': 'rating_nums'}, '0.0')
    people_text = _child_text(evaluate_info, 'span', {'class': 'pl'}, None)
    # Drop the first character and the last four around the count.
    people_num = people_text[1:-4] if people_text is not None else '0'

    # Short description.
    description = _child_text(book_info, 'p', {}, _MISSING)

    return [title, author_info, pub_info, allstar, rating_nums,
            people_num, description, book_url]


def book_spider(book_tag, cookies):
    """Collect book records from every listing page of a Douban tag.

    Args:
        book_tag: Tag name; it is URL-quoted before being put in the URL.
        cookies: Cookies passed through to ``requests.get``.

    Returns:
        A list with one entry per book, each of the form
        ``[title, author_info, pub_info, allstar, rating_nums, people_num,
        description, book_url]``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    books_list = []

    # The first page supplies both the page count and the first batch of
    # books, so it is downloaded only once.
    soup = _fetch_tag_page(book_tag, 0, cookies, headers)
    page_num_max = _count_pages(soup)

    for page_num in range(page_num_max):
        if page_num > 0:
            # Pause between requests to go easier on the server.
            time.sleep(0.5)
            soup = _fetch_tag_page(book_tag, page_num, cookies, headers)
        # Every book on the page.
        for book_info in soup.findAll('li', attrs={'class': 'subject-item'}):
            books_list.append(_parse_book(book_info))
        print('第%d页信息采集完毕,共%d页' % (page_num+1, page_num_max))
    return books_list
复制代码 |
|