|
楼主 |
发表于 2018-6-17 21:20:19
|
显示全部楼层
- from urllib import request
- import re
class Spider():
    """Scrape one listing page and print each entry's name and number.

    The page at ``url`` is downloaded, every ``video-info`` block is
    extracted with regular expressions, and the resulting entries are
    sorted in ascending order of their numeric value before being printed.
    """

    url = "https://www.panda.tv/cate/lol?pdt=1.24.s1.3.2c6qoma1l34"
    # Raw strings so that \s and \S reach the regex engine untouched instead
    # of being parsed as (invalid) string escape sequences.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]+?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]+?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        return str(raw, encoding="utf-8")

    def __analysis(self, htmls):
        """Return one ``{'name': [...], 'number': [...]}`` dict per info block.

        Both values are the raw ``re.findall`` result lists and may be empty
        when a block does not contain the expected markup.
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, html),
                'number': re.findall(Spider.number_pattern, html),
            }
            for html in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Flatten the match lists to plain strings.

        The first match of each field is kept and the name is stripped of
        surrounding whitespace. Entries missing a name or a number are
        skipped instead of raising ``IndexError``.
        """
        return [
            {'name': anchor['name'][0].strip(), 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return a new list of ``anchors`` ordered by ascending numeric value."""
        return sorted(anchors, key=self.__sort1)

    def __sort1(self, anchor):
        """Sort key: the numeric value of ``anchor['number']`` as a float.

        The first integer or decimal found in the text is used, and it is
        multiplied by 10000 when the text contains "万" (ten thousand).
        Text without any digits sorts as ``0.0``.
        """
        text = anchor['number']
        match = re.search(r"\d+(?:\.\d+)?", text)
        number = float(match.group()) if match else 0.0
        if "万" in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print every entry as ``<name>------><number>``, one per line."""
        for anchor in anchors:
            print(f"{anchor['name']}------>{anchor['number']}")

    def go(self):
        """Run the whole pipeline: fetch, parse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
-
- #print(anchors[0])
-
# Script entry: build a Spider and run its full pipeline
# (download the page, parse it, sort the entries, print them).
s = Spider()
s.go()
复制代码 |
|