My spider keeps throwing the error below. It looks like a JSON parsing problem, but I've been digging into it for ages without finding the cause. Could anyone take a look?

Traceback (most recent call last):
- File "F:/Python爬虫/街拍/spider.py", line 66, in <module>
- main()
- File "F:/Python爬虫/街拍/spider.py", line 62, in main
- result=parse_page_detail(html,each)
- File "F:/Python爬虫/街拍/spider.py", line 42, in parse_page_detail
- data=json.loads(result.group(1))
- File "C:\Users\Administrator\AppData\Local\Programs\Python\Python35\lib\json\__init__.py", line 319, in loads
- return _default_decoder.decode(s)
- File "C:\Users\Administrator\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 339, in decode
- obj, end = self.raw_decode(s, idx=_w(s, 0).end())
- File "C:\Users\Administrator\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 355, in raw_decode
- obj, end = self.scan_once(s, idx)
- json.decoder.JSONDecodeError: Invalid \uXXXX escape: line 1 column 4086 (char 4085)
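For context: json.loads raises exactly this error whenever the input contains a backslash-u escape that is not followed by four hex digits. A minimal repro (the data here is made up, purely to show the failure mode):

import json

json.loads(r'{"url": "http:\u002F\u002Fexample.com"}')  # OK: \u002F is a valid JSON escape for '/'
json.loads(r'{"url": "bad \uZZZZ escape"}')              # raises "Invalid \uXXXX escape"

So whatever result.group(1) captured must contain a stray \u around character 4085 that is not a valid JSON escape.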
Here is the source code:
import json
import re
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from config import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Fetch the index (search results) page
def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)  # encode the dict as a URL query string
    response = requests.get(url)
    return response.text

# Parse the index page
def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for each in data.get('data'):
            yield each.get('article_url')  # yield each article URL found in the data

def get_page_detail(url):
    response = requests.get(url)
    return response.text

def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    r = re.compile('var gallery =.*?(.*?);', re.S)
    result = re.search(r, html)  # regex to pull "gallery" out of the page's JS, which holds the image-set URLs
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            for each in sub_images:  # loop over the image-set URLs
                image_url = each.get('url')
            return {
                'title': title,
                'url': url,
                'image_url': image_url
            }

'''def save_mongodb(result):
    if db[MONGO_TABLE].insert(result):
        print('Saved to MongoDB successfully', result)
        return True
    return False'''

def main():
    html = get_page_index(0, '街拍')
    for each in parse_page_index(html):
        html = get_page_detail(each)
        if html:
            result = parse_page_detail(html, each)
            # save_mongodb(result)
            print(result)

if __name__ == '__main__':
    main()
Print out result.group(1) and you'll see why.
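Building on that reply: here is a quick way to see both what the pattern actually captured and the exact spot where json.loads chokes. This is just a debugging sketch, assuming html holds the article page source as in parse_page_detail. Note that with two lazy quantifiers in 'var gallery =.*?(.*?);', group(1) starts right after the '=' and runs to the first ';', so it may contain more (or less) than the JSON object you expect.

import json
import re

r = re.compile('var gallery =.*?(.*?);', re.S)
result = re.search(r, html)  # html: the article page source, as in parse_page_detail
if result:
    captured = result.group(1)
    print(repr(captured[:200]))  # inspect how the capture begins
    try:
        data = json.loads(captured)
    except json.JSONDecodeError as e:
        # e.pos is the character offset from the traceback (char 4085 here);
        # print the text around it to see the offending \u sequence
        print('failed at char', e.pos, '->', repr(captured[max(0, e.pos - 40):e.pos + 40]))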