|
100鱼币
代码如下,点击给我爬!!!程序就会卡死,本来想把输出结果放入到Text组件中,结果程序卡死,很尴尬,求解决方案
- from tkinter import *
- import urllib.request
- import urllib.error
- import os
- import http.client
- import time
- import re
- import random
- import math
- class App(Frame):
- def __init__(self,mw):
- self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
- self.enctype = 'utf-8'
- self.proxies = []
- Frame.__init__(self,mw)
- #初始默认值字典
- self.var = dict()
- #抓取地址----------------------------------------------------------------------------
- self.frmURL = Frame(mw) #抓取地址框架
-
- self.lblURL = Label(self.frmURL,text="抓取地址:") #抓取地址标签
- self.lblURL.grid(row=0,column=0,padx=10)
- var = StringVar()
- var.set(r"http://image.baidu.com/search/flip?tn=baiduimage&word=mai%20kuraki&pn=")
- self.var["url1"] = var
- self.entURL1 = Entry(self.frmURL,width=80,textvariable=self.var["url1"]) #地址1输入框
- self.entURL1.grid(row=0,column=1)
- var = StringVar()
- var.set("0")
- self.var["url2"] = var
- self.entURL2 = Entry(self.frmURL,width=6,textvariable=self.var["url2"]) #地址2输入框
- self.entURL2.grid(row=0,column=2)
- var = StringVar()
- var.set("")
- self.var["url3"] = var
- self.entURL3 = Entry(self.frmURL,textvariable=self.var["url3"]) #地址3输入框
- self.entURL3.grid(row=0,column=3)
-
- self.lblStep = Label(self.frmURL,text="递增的量:") #递增量标签
- self.lblStep.grid(row=0,column=4,padx=10)
- var = StringVar()
- var.set("1")
- self.var["step"] = var
- self.entStep = Entry(self.frmURL,width=6,textvariable=self.var["step"]) #递增量输入框
- self.entStep.grid(row=0,column=5)
-
- self.frmURL.grid(sticky=W+E,row=0,column=0,pady=5)
- #-------------------------------------------------------------------------------------
-
- #代理地址--------------------------以及times-------------------------------------------
- self.frmProxy = Frame(mw) #代理框架
- self.lblProxy = Label(self.frmProxy,text="代理地址:") #代理标签
- self.lblProxy.grid(row=0,column=0,padx=10)
- var = StringVar()
- var.set(r"http://www.xicidaili.com/")
- self.var["proxyurl"] = var
- self.entProxyURL = Entry(self.frmProxy,width=80,textvariable=self.var["proxyurl"]) #代理地址输入框
- self.entProxyURL.grid(row=0,column=1)
- var = IntVar()
- var.set(1)
- self.var["http"] = var
- self.cbtHttp = Checkbutton(self.frmProxy,text="HTTP",variable = self.var["http"]) #HTTP开关
- self.cbtHttp.grid(row=0,column=2,padx=10)
- var = IntVar()
- var.set(0)
- self.var["https"] = var
- self.cbtHttps = Checkbutton(self.frmProxy,text="HTTPS",variable = self.var["https"]) #HTTPS开关
- self.cbtHttps.grid(row=0,column=3,padx=10)
- self.lblTimes = Label(self.frmProxy,text="最大失败次数:") #最大失败次数标签
- self.lblTimes.grid(row=0,column=4,padx=10)
- var = StringVar()
- var.set("5")
- self.var["times"] = var
- self.entTimes = Entry(self.frmProxy,textvariable = self.var["times"],width=6) #最大失败次数
- self.entTimes.grid(row=0,column=5,padx=10)
-
- self.frmProxy.grid(sticky=W+E,row=1,column=0,pady=5)
- #-------------------------------------------------------------------------------------
- #正则表达式区域--------------------------------------------------------------------------
- self.frmre = Frame(mw) #正则表达式框架
-
- self.frmreURL = LabelFrame(self.frmre,text="网页正则表达式",padx=10) #网页正则表达式
- self.var["reurl"] = r'''"objURL":"(http://.+?\.(?:jpg|jpeg|gif))"'''
- self.txtURL = Text(self.frmreURL,width=60,height=10)
- self.txtURL.pack()
- self.txtURL.insert(INSERT,self.var["reurl"])
- self.frmreURL.grid(sticky=W,row=0,column=0,padx=10)
- self.frmreProxy = LabelFrame(self.frmre,text="代理正则表达式",padx=10) #代理正则表达式
- self.var["reproxy"] = r'''<tr\sclass[^>]*?>\s*?
- <td>.+</td>\s*?
- <td>(.*)?</td>\s*?
- <td>(.*)?</td>\s*?
- <td>(.*)?</td>\s*?
- <td>(.*)?</td>\s*?
- <td>(.*)?</td>\s*?
- <td>(.*)?</td>\s*?
- </tr>'''
- self.txtProxy = Text(self.frmreProxy,width=60,height=10)
- self.txtProxy.pack()
- self.txtProxy.insert(INSERT,self.var["reproxy"])
- self.frmreProxy.grid(sticky=E,row=0,column=1,padx=10)
-
- self.frmre.grid(sticky=W+E,row=2,column=0,pady=5)
- #--------------------------------------------------------------------------------------
- #文件路径区域-----------------------------------------------------------------------------
- self.frmPath = Frame(mw) #文件路径框架
- self.lblPath = Label(self.frmPath,text="保存路径:") #标签
- self.lblPath.pack(side=LEFT,padx=10)
- var = StringVar()
- var.set(r"G:\图片\仓木麻衣")
- self.var["path"] = var
- self.entPath = Entry(self.frmPath,textvariable=self.var["path"],width=125) #保存路径
- self.entPath.pack(side=RIGHT,padx=10)
-
- self.frmPath.grid(sticky=W+E,row=3,column=0,pady=5)
- #---------------------------------------------------------------------------------------
- #按钮区域---------------------------------------------------------------------------------
- self.frmButton = Frame(mw) #按钮框架
- self.btnSpide = Button(self.frmButton,text="给我爬!!!",command=self.spide) #开始爬
- self.btnSpide.pack(side=LEFT,padx=10,ipadx=60,ipady=5)
- self.btnExit = Button(self.frmButton,text="退出",command=quit)
- self.btnExit.pack(side=RIGHT,padx=10,ipadx=60,ipady=5)
- self.frmButton.grid(sticky=W+E,row=4,column=0,pady=5)
- #----------------------------------------------------------------------------------------
- def spide(self):
- self.get_proxy() #取得代理
- self.create_localhost() #生成本地直接访问proxies
- self.download() #开始下载
- def get_proxy(self): #从代理页面提取代理IP及端口
- proxy_url = self.var["proxyurl"].get()
- req = urllib.request.Request(proxy_url,None,self.headers)
- response = self.get_result(req)
- html = response.read().decode('utf-8')
- p = re.compile(self.var["reproxy"],re.VERBOSE)
- proxy_list = p.findall(html)
- for each_proxy in proxy_list[1:]:
- if self.var["http"].get() == 1 and each_proxy[4] == 'HTTP' or self.var["https"].get() == 1 and each_proxy[4] == 'HTTPS':
- self.proxies.append(each_proxy[0]+':'+each_proxy[1])
- def get_result(self,req_or_url,is_retrieve=False,filename = None): #取得网页页面
- max_error_times = int(self.var["times"].get())
- error_time = 0
- while True:
- try:
- if error_time == max_error_times:
- print('失败次数达%d次......放弃操作' % max_error_times)
- return None
- error_time += 1
- if is_retrieve:
- return urllib.request.urlretrieve(req_or_url,filename)
- else:
- return urllib.request.urlopen(req_or_url)
- except urllib.error.URLError as e:
- if hasattr(e,'code'):
- print(e.code,e.reason)
- self.change_proxy()
- continue
- elif hasattr(e,'reason'):
- print(e)
- self.change_proxy()
- continue
- except (ConnectionResetError,http.client.BadStatusLine) as e:
- print(e)
- self.change_proxy()
- continue
- except TimeoutError as e:
- print(e)
- print('服务器长时间无响应,自动切换代理.....')
- self.change_proxy()
- continue
- def change_proxy(self): #切换代理
- proxy = random.choice(self.proxies)
- if proxy == None:
- proxy_support = urllib.request.ProxyHandler({})
- else:
- proxy_support = urllib.request.ProxyHandler({'http':proxy})
- opener = urllib.request.build_opener(proxy_support)
- opener.addheaders = [('User-Agent',self.headers['User-Agent'])]
- urllib.request.install_opener(opener)
- print('智能切换代理:%s' % ('本机' if proxy==None else proxy))
- def create_localhost(self): #生成本地直接访问的proxies
- number = int((math.sqrt(5)-1)/2 * len(self.proxies))
- for x in range(number):
- self.proxies.append(None)
- def download(self): #下载图片
- save_path = self.var["path"].get()
- for pic_url in self.get_pic():
- file_name = os.path.split(pic_url)[1]
- if not os.path.isdir(save_path): #目录不存在就创建
- os.makedirs(save_path)
- #如果文件已存在则跳过
- if os.path.exists(save_path+'\\'+file_name):
- print('文件%s已存在...' % file_name)
- continue
- self.get_result(pic_url,True,save_path+'\\'+file_name)
- print('本次成功下载编号%s! %s' % (self.var['url2'].get() , pic_url))
- self.var['url2'].set(str(int(self.var['url2'].get())+int(self.var['step'].get())))
- def get_pic(self): #生成器,返回一个图片链接
- url1 = self.var['url1'].get()
- url2 = self.var['url2'].get()
- url3 = self.var['url3'].get()
- while True:
- url = url1+url2+url3
- req = urllib.request.Request(url,None,self.headers)
- response = self.get_result(req)
- if response == None:
- print('获取页面失败.....')
- quit
- html = response.read().decode(self.enctype)
- pic = re.compile(self.var['reurl'],re.VERBOSE)
- for pic in pic.finditer(html):
- yield pic.group(1)
- time.sleep(5)
- def main():
- root = Tk()
- root.title("爬虫4------Margular制作")
- root.iconbitmap("spider.ico")
- root.resizable(True,True)
- app1 = App(mw=root)
- app1.mainloop()
-
- if __name__ == '__main__':
- main()
复制代码 |
最佳答案
查看完整内容
代码太长,也没看。你试试在程序爬完之前,把所有的返回值都写在变量里面,不要写到tk的组件里面。因为图形化的列表框什么的,超频繁的读取写入会卡死的。这样反而拖慢电脑速度。另外一个就是,卡死也可能跟电脑配置有关呢。
|