|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 kerwin_lui 于 2018-5-23 14:58 编辑
项目背景:
跟着小甲鱼的视频进行操作,对小甲鱼的书——《零基础入门学习python》在淘宝上的销量进行统计!!
困惑:
虽然和小甲鱼的代码不完全一样,但我觉得代码实现的功能都是一样的!!
比如:
re.search(r" (love) ", "i love fishc").group(1)
re.findall() → 再用for循环出字符串
运行的结果硬是惊人的不一样,而且我同样的一段的代码,早上运行和下午运行的报错内容也是不一样?
wtf?想死的心都有~~
不知道各位鱼C的大牛,有没有遇到过这个心累的问题?????还是说是我代码或者软件的问题???
以下是我的代码: (我是在Jupyter Notebook 和pycharm 上运行的)
版本一:(该版本是我自认为比较完善的代码,但就是运行不起来)
- # 用基础的爬虫代码,统计淘宝上《零基础入门学习python》这本小甲鱼的书的销量:
- import re
- import json
- import requests
- #得到网页HTML对象
- def Get_url_html(url, pages, keyword):
- payload = {"q": str(keyword), "sort": "sale-desc", "s": str((pages - 1) * 44)}
- headers = {}
- headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
- headers["Referer"] = "https://s.taobao.com/search?q=%E9%9B%B6%E5%9F%BA%E7%A1%80%E5%85%A5%E9%97%A8%E5%AD%A6%E4%B9%A0python&\
- sort=sale-desc&s=0"
- res = requests.get(url, params=payload, headers=headers)
- return res
- #传入HTML文本,找到page_config对应的内容,转化成json格式
- def Get_page_config(url_html):
- page_config = re.search(r"g_page_config =(.*?);\n ", url_html).group(1)
- # print(page_config) 测试bug用
- page_config_json = json.loads(page_config)
- return page_config_json
- #传入json格式,进行数据整理,把自己需要的数据放入新的list中
- def Get_total_item_list(page_config_json):
- auctions_list = page_config_json["mods"]["itemlist"]["data"]["auctions"]
- total_item_list = []
- for each_item in auctions_list:
- each_item_target_dict = {}
- each_item_target_dict.fromkeys(())
- each_item_target_dict["raw_title"] = each_item["raw_title"]
- each_item_target_dict["pic_url"] = each_item["pic_url"]
- each_item_target_dict["view_price"] = each_item["view_price"]
- each_item_target_dict["view_sales"] = each_item["view_sales"]
- each_item_target_dict["comment_count"] = each_item["comment_count"]
- total_item_list.append(each_item_target_dict)
- # print(total_item_list)
- return total_item_list
- #利用新构造的list ,进行数据统计,算出《入门学习python》的
- def Sum_sales_amounts(total_item_list):
- cumulate_each_sales = 0
- global cumulate_pages_sales_money
- cumulate_pages_sales_money = 0
- for each_list in total_item_list:
- if "小甲鱼" in each_list["raw_title"]:
- # if 1:
- each_sales = int(re.search(r"\d+", each_list["view_sales"]).group()) ##每个商店的销量
- print("每个商店的销量为 : ", each_sales)
- each_price = float(re.search(r"\d+\.\d+", each_list["view_price"]).group()) # 每个商店的售价
- cumulate_each_sales += each_sales # 累加这一页所有商店的销量
- each_sales_money = each_sales * each_price # 计算这一页的每一个商店的销售额
- cumulate_pages_sales_money += each_sales_money # 累加这一页所有商店的销售额 (但是每个函数只能实现一个功能,return 1个变量,不考虑打包)
- else:
- continue
- print(cumulate_each_sales)
- print(cumulate_pages_sales_money)
- return cumulate_each_sales
- def main():
- pri_url = "https://s.taobao.com/search"
- keyword = input("请输入关键词 :")
- total_sales_amounts = 0
- total_sales_money = 0
- for i in range(10):
- pages = i + 1
- url_html = Get_url_html(pri_url, pages, keyword) #得到网页HTML对象
- page_config_json = Get_page_config(url_html.text) #传入HTML文本,找到page_config对应的内容,转化成json格式
- total_item_list = Get_total_item_list(page_config_json) #传入json格式,进行数据整理,把自己需要的数据放入新的list中
- pages_sales_amounts = Sum_sales_amounts(total_item_list) #利用新构造的list ,进行数据统计,算出《入门学习python》的
-
- total_sales_amounts += pages_sales_amounts #pages代表每个页面的销售总数 total代表所有页面的销售总数
- print("第%d 页 目前总销量为 :%d" % (pages, total_sales_amounts))
- pages_sales_money = cumulate_pages_sales_money
- total_sales_money += pages_sales_money #pages代表每个页面的销售金额 total代表所有页面的销售金额
- print("第%d 页 目前总销售金额为 :%d" % (pages, total_sales_money))
- if __name__ == "__main__":
- main()
复制代码
以下是报错的内容:
错误一:
AttributeError Traceback (most recent call last)
<ipython-input-12-8941ded1076f> in <module>()
84
85 if __name__ == "__main__":
---> 86 main()
87
88
<ipython-input-12-8941ded1076f> in main()
71 pages = i + 1
72 url_html = Get_url_html(pri_url, pages, keyword) #得到网页HTML对象
---> 73 page_config_json = Get_page_config(url_html.text) #传入HTML文本,找到page_config对应的内容,转化成json格式
74 total_item_list = Get_total_item_list(page_config_json) #传入json格式,进行数据整理,把自己需要的数据放入新的list中
75 pages_sales_amounts = Sum_sales_amounts(total_item_list) #利用新构造的list ,进行数据统计,算出《入门学习python》的
<ipython-input-12-8941ded1076f> in Get_page_config(url_html)
19 #传入HTML文本,找到page_config对应的内容,转化成json格式
20 def Get_page_config(url_html):
---> 21 page_config = re.search(r"g_page_config =(.*?);\n ", url_html).group(1)
22 # print(page_config) 测试bug用
23 page_config_json = json.loads(page_config)
AttributeError: 'NoneType' object has no attribute 'group'
错误二:
TypeError Traceback (most recent call last)
<ipython-input-2-49aeccd1ba5b> in <module>()
95
96 if __name__ == "__main__":
---> 97 main()
98
<ipython-input-2-49aeccd1ba5b> in main()
85
86 page_config_json = Get_page_config(url_html.text)
---> 87 total_item_list = Get_total_item_list(page_config_json)
88 total_sales_amounts = Sum_sales_amounts(total_item_list)
89 xxx += total_sales_amounts
<ipython-input-2-49aeccd1ba5b> in Get_total_item_list(page_config_json)
40
41 def Get_total_item_list(page_config_json):
---> 42 auctions_list = page_config_json["mods"]["itemlist"]["data"]["auctions"]
43 total_item_list = []
44 for each_item in auctions_list:
TypeError: 'int' object is not subscriptable
——————————————————丑陋的分割线————————————————
版本二:
(该版本实际就是把re.search 改成 re.findall 然后在遍历出字符串,就莫名其妙对了,想哭!! )
- import re
- import json
- import requests
- def get_page_config(url_html):
- listxxx = re.findall(r"g_page_config =(.*?);\n ", url_html)
- yy = "0"
- for xx in listxxx:
- yy = xx
- page_config = yy
- # print(page_config)
- page_config_json = json.loads(page_config)
- return page_config_json
- def Get_url_html(url, pages, keyword):
- payload = {"q": str(keyword), "sort": "sale-desc", "s": str((pages - 1) * 44)}
- headers = {}
- headers[
- "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
- headers["Referer"] = "https://s.taobao.com/search?q=%E9%9B%B6%E5%9F%BA%E7%A1%80%E5%85%A5%E9%97%A8%E5%AD%A6%E4%B9%A0python&\
- sort=sale-desc&s=0"
- res = requests.get(url, params=payload, headers=headers)
- return res
- def Get_total_item_list(page_config_json):
- auctions_list = page_config_json["mods"]["itemlist"]["data"]["auctions"]
- total_item_list = []
- for each_item in auctions_list:
- each_item_target_dict = {}
- each_item_target_dict.fromkeys(())
- each_item_target_dict["raw_title"] = each_item["raw_title"]
- each_item_target_dict["pic_url"] = each_item["pic_url"]
- each_item_target_dict["view_price"] = each_item["view_price"]
- each_item_target_dict["view_sales"] = each_item["view_sales"]
- each_item_target_dict["comment_count"] = each_item["comment_count"]
- total_item_list.append(each_item_target_dict)
- # print(total_item_list)
- return total_item_list
- def Sum_sales_amounts(total_item_list):
- cumulate_each_sales = 0
- cumulate_each_sales_money = 0
- for each_list in total_item_list:
- if "小甲鱼" in each_list["raw_title"]:
- # if 1:
- each_sales = int(re.search(r"\d+", each_list["view_sales"]).group()) ##每个商店的销量
- print("每个商店的销量为 : ", each_sales)
- each_price = float(re.search(r"\d+\.\d+", each_list["view_price"]).group()) # 每个商店的售价
- cumulate_each_sales += each_sales # 累加所有商店的销量
- each_sales_money = each_sales * each_price # 计算每一个商店的销售额
- cumulate_each_sales_money += each_sales_money # 累加所有商店的销售额 (但是每个函数只能实现一个功能,return 1个变量,不考虑打包)
- else:
- continue
- print(cumulate_each_sales)
- return cumulate_each_sales
- def main():
- pri_url = "https://s.taobao.com/search"
- keyword = input("请输入关键词 :")
- xxx = 0
- for i in range(10):
- pages = i + 1
- url_html = Get_url_html(pri_url, pages, keyword)
- page_config_json = get_page_config(url_html.text)
- total_item_list = Get_total_item_list(page_config_json)
- total_sales_amounts = Sum_sales_amounts(total_item_list)
- xxx += total_sales_amounts
- print("第%d 页 目前总销量为 :%d" % (pages, xxx))
- if __name__ == "__main__":
- main()
复制代码
|
|