|
发表于 2017-12-12 09:26:45
|
显示全部楼层
本帖最后由 jerryxjr1220 于 2017-12-18 09:41 编辑
- import requests
- from bs4 import BeautifulSoup
- import threading
- import os
- import re
- import pdfkit
- import lxml
- from requests.exceptions import RequestException
def get_index_page(timeout=10):
    """Fetch the Zhihu Daily front page and return its HTML as text.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Returns:
        The response body decoded as text, or an empty string when the
        request fails (connection error, timeout, ...), so that callers
        parsing the result simply find no stories.
    """
    # Browser-like headers; the original author added them to work around
    # an HTTP 304 response from the site.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'daily.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3269.3 Safari/537.36',
    }
    url = "http://daily.zhihu.com"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException as exc:
        # Report and degrade to "no content" instead of crashing the run.
        print("get_index_page: request failed: {0}".format(exc))
        return ""
    return response.text
def get_detail_content():
    """Yield one dict per story found on the Zhihu Daily front page.

    Each yielded dict has the keys:
        'title': the story title with every non-word character removed,
            so it can be used as a file name;
        'image': the value of the thumbnail's ``src`` attribute;
        'href':  the value of the story link's ``href`` attribute.

    The title and image are read from the direct children of each story
    link, so a link that lacks a ``<span>`` or ``<img>`` is skipped rather
    than shifting the pairing of every later story.
    """
    html = get_index_page()
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.select(
        'body > div.main-content > div > div.main-content-wrap '
        '> div > div > div > div > a'
    )
    for anchor in anchors:
        # recursive=False mirrors the original "a > span" / "a > img"
        # direct-child selectors.
        span = anchor.find('span', recursive=False)
        img = anchor.find('img', recursive=False)
        if span is None or img is None:
            continue
        yield {
            'title': re.sub(r'\W', "", span.get_text()),
            'image': img.get('src'),
            'href': anchor.get('href'),
        }
    # NOTE(review): indentation was lost in the source; the separator is
    # assumed to be printed once, after the last story has been consumed.
    print("================================")
def timer_work():
    """Run one scrape-and-save pass, then schedule the next one.

    Increments the global run counter ``num``, stores a fresh story
    generator in the global ``data_G`` and calls ``save_all()``. A failure
    during saving is reported but does not prevent the next pass from
    being scheduled.
    """
    global num, data_G
    num = num + 1
    data_G = get_detail_content()
    try:
        save_all()
    except Exception as exc:
        # Boundary of the periodic job: report the failure and keep the
        # schedule alive instead of stopping forever on one bad run.
        print("timer_work: run {0} failed: {1!r}".format(num, exc))
    # Re-arm: run again in 1000 seconds.
    t = threading.Timer(1000, timer_work)
    t.start()
def save_all():
    """Save the image and the PDF of every story in the global ``data_G``.

    Creates the ``zhihu_daily`` folder under the current working directory
    when it does not exist yet (printing "NO" in that case, "OK"
    otherwise), processes each story, then prints "Done".
    """
    target_dir = os.getcwd() + '/zhihu_daily'
    if os.path.isdir(target_dir):
        print('OK')
    else:
        print("NO")
        os.mkdir(target_dir)
    for story in data_G:
        # Thumbnail first, then the detail page rendered as a PDF.
        images_save(story['title'], story['image'])
        PDF_save(story['href'], story['title'])
    print("Done")
def PDF_save(href, title,
             path_wk=r'D:\Users\Administrator\AppData\Local\Programs\Python\Python36\Lib\site-packages\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Render a story's detail page to ``zhihu_daily/<title>.pdf``.

    Args:
        href: Site-relative path of the story; appended to
            ``https://daily.zhihu.com``.
        title: File name (without extension) for the PDF.
        path_wk: Location of the wkhtmltopdf executable. pdfkit needs an
            explicit path when it cannot find the binary on its own (a
            pip-installed wkhtmltopdf is not picked up automatically); the
            default is the original author's install location.

    Nothing is rendered when the PDF already exists. Errors reported by
    wkhtmltopdf are printed and otherwise ignored.
    """
    url = 'https://daily.zhihu.com' + href
    file_path = "{0}/{1}/{2}.{3}".format(os.getcwd(), 'zhihu_daily', title, 'pdf')
    if os.path.exists(file_path):
        print("TP2:文件存在")
        return
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    try:
        pdfkit.from_url(url, file_path, configuration=config)
    except OSError as exc:
        # wkhtmltopdf exits with "code 1 due to network error:
        # ContentNotFoundError" when the page's CSS references external
        # resources (fonts, images, iframes) it cannot load. The original
        # author chose to ignore this; keep doing so, but say what happened
        # and let unrelated exceptions (e.g. KeyboardInterrupt) through.
        print("PDF_save: wkhtmltopdf reported an error for {0}: {1}".format(url, exc))
def images_save(title, image, timeout=10):
    """Download a story thumbnail to ``zhihu_daily/<title>.jpg``.

    Args:
        title: File name (without extension); expected to be already
            stripped of characters that are invalid in file names.
        image: URL of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Nothing is downloaded when the file already exists. A failed download
    (network error, timeout, HTTP error status, unusable URL) is reported
    and no file is written, so the next run can retry it.
    """
    file_path = "{0}/{1}/{2}.{3}".format(os.getcwd(), 'zhihu_daily', title, 'jpg')
    # Check first so an existing image does not cost a download.
    if os.path.exists(file_path):
        return
    try:
        img = requests.get(image, timeout=timeout)
        # Do not store an HTTP error page under a .jpg name.
        img.raise_for_status()
    except RequestException as exc:
        print("images_save: could not download {0}: {1}".format(image, exc))
        return
    with open(file_path, 'wb') as f:
        f.write(img.content)
def main():
    """Start the scraper.

    Runs the first pass immediately; ``timer_work`` then re-arms itself
    with a 1000-second timer for every following pass.
    """
    timer_work()
# Module-level state shared by timer_work() and save_all().
# data_G holds the stories of the current pass; timer_work() replaces it
# with a fresh generator on every run.
data_G = {}
# Number of passes started so far. Defined at module level (not only under
# the __main__ guard) so timer_work() also works when this file is imported.
num = 0

if __name__ == '__main__':
    main()
复制代码 |
-
评分
-
查看全部评分
|