|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 神奇的yxq 于 2017-12-18 15:50 编辑
为了学习bs4模块,所以将mmspider“高清重置”了一下
前置解释:internet 为自制模块,里面有两个类 Browse 和 Download,前者提供 Browse.get(url) 抓取页面,后者负责下载 srcqueue 下载队列中的任务,感兴趣的可以下载附件
mylogging 为自制模块,里面三个类分别是 Logging 记录到输出(控制台),LoggingText 记录到文本文件,LoggingDB 记录到数据库,都有一个方法 log.put(需要记录的信息, 记录的位置, 记录等级),感兴趣的可以下载附件
附件:
- # 需求:下载http://www.mzitu.com/网站的妹子图片
- """
- 思路:
- 1、访问图集首页 如:http://www.mzitu.com/page/页码数/;
- 2、获取每个图集的详情页面;
- 3、访问每个图集的详情页面 如:http://www.mzitu.com/106245/页码数/;
- 4、获取 每页的图片地址;
- 5、将每个图集的所有图片的地址打包给downloader下载;
- 6、拼接第二个首页地址,返回第一步;
- 7、爬取到达到指定的页数或爬取完所有页数 爬虫停止爬取;
- """
- from bs4 import BeautifulSoup
- import internet
- import mylogging
- import queue
def searchimglist(content):
    """Parse a gallery list page and collect the gallery links.

    :param content: raw HTML of a list page
    :return: list of (gallery title, gallery url) tuples
    """
    soup = BeautifulSoup(content, "lxml")
    results = []
    for li in soup.find_all("li"):
        # Gallery entries are the <li> nodes with exactly 4 children;
        # everything else (nav items etc.) is skipped.
        if len(li) != 4:
            continue
        anchors = li.find_all("a")
        link = anchors[1]  # second <a> carries both the title text and href
        results.append((link.text, link.attrs["href"]))
    return results
def searchimgurl(content):
    """Assemble the full list of image src urls for one gallery.

    The detail page exposes the first image plus the max page number;
    the other image urls differ only in a two-digit counter right before
    the file extension, so they are derived from the first src.

    :param content: raw HTML of a gallery detail page
    :return: list of image src urls
    """
    soup = BeautifulSoup(content, "lxml")
    # Gallery title, used to locate the first image via its alt text.
    title = soup.find_all("h2", {"class": "main-title"})[0].text
    first_src = soup.find_all("img", {"alt": title})[0].attrs["src"]
    # In the pagenavi block the last <span> is "next page"; the one before
    # it holds the max page number, which equals the image count.
    maxpage = soup.find_all("div", {"class": "pagenavi"})[0].find_all("span")[-2].text
    # Split "...NN.jpg" into prefix / extension and re-insert the counter.
    prefix, ext = first_src[:-6], first_src[-4:]
    return [prefix + str(page).zfill(2) + ext
            for page in range(1, int(maxpage) + 1)]
def main():
    """Entry point: prompt for a page count, then crawl and download.

    :return: None
    """
    rawurl = "http://www.mzitu.com/page/"  # list-page url prefix
    downloadPath = "MMImg"

    # Ask until we get a usable integer page count (at most 150).
    while True:
        maxpage = input("请输入爬虫爬取的最大页数:")
        try:
            maxpage = int(maxpage)
        except ValueError:
            print("输入错误!请重新输入")
            continue
        if maxpage > 150:
            print("太多了,身体吃不消的!")
            continue
        print("爬取任务:%s 页!爬虫开始爬取" % maxpage)
        break

    log = mylogging.Logging(name="MMSpider")
    srcqueue = queue.Queue(30)

    browser = internet.Browse()  # HTTP fetcher from the home-made module
    browser.__log__ = log

    # Three downloader threads draining srcqueue — seriously, don't spawn
    # too many of these.
    downloaders = [
        internet.Download(srcqueue, downloadPath=downloadPath,
                          downloader=2, downloadWaitTime=1)
        for _ in range(3)
    ]
    for worker in downloaders:
        worker.__log__ = log
        worker.start()

    for page in range(1, maxpage + 1):
        page = str(page).zfill(2)  # zero-pad single digits, as the site expects
        url = rawurl + page + "/"  # assemble the list-page url
        print("正在访问第%s页,url=%s" % (page, url))
        content = browser.get(url, decoding=False)
        if not content:  # fetch failed — report and move on
            print(content)
            continue
        for title, imgurl in searchimglist(content):
            print("正在访问[%s]的详情页,url=%s" % (title, imgurl))
            detail = browser.get(imgurl, decoding=False)
            if not detail:  # fetch failed — report and move on
                print(detail)
                continue
            # Data package handed to the downloaders; appears to be
            # [title, extension, page url, src, src, ...] as consumed by
            # internet.Download — verify against that module.
            package = [title, "jpg", imgurl]
            package.extend(searchimgurl(detail))
            srcqueue.put(package)
        # NOTE(review): task_done() is normally the consumer's call; this
        # producer-side call mirrors the original code — confirm against
        # internet.Download before changing it.
        srcqueue.task_done()

    for worker in downloaders:
        worker.join()
    print("MMSpider 爬取完毕!")


main()
复制代码
以上是 直接下载版 ,后期为了学习mysql,所以将之改造成数据采集版:
- # 需求:采集http://www.mzitu.com/网站的妹子图片的地址到数据库
- """
- 思路:
- 1、访问图集首页 如:http://www.mzitu.com/page/页码数/;
- 2、获取每个图集的详情页面;
- 3、访问每个图集的详情页面 如:http://www.mzitu.com/106245/页码数/,并保存到数据库;
- 4、获取 每页的图片地址;
- 5、将每个图集的所有图片的地址保存到mysql数据库;
- 6、拼接第二个首页地址,返回第一步;
- 7、爬取到达到指定的页数或爬取完所有页数 爬虫停止爬取;
- """
- from bs4 import BeautifulSoup
- import internet
- import pymysql
- import mylogging
- import urllib.parse
def searchImgList(content):
    # Body elided in the original post; per the surrounding text this is
    # the same parser as searchimglist() from the download version above
    # (presumably returns [(gallery title, gallery url), ...]) — TODO restore.
    ...
def searchImgUrl(content):
    # Body elided in the original post; per the surrounding text this is
    # the same src assembler as searchimgurl() from the download version
    # above (presumably returns a list of image src urls) — TODO restore.
    ...
def getSelectResult(cur, cmd):
    """Execute a SELECT statement and return every fetched row.

    :param cur: an open DB-API cursor
    :param cmd: SQL text to execute
    :return: all rows from cur.fetchall()
    """
    cur.execute(cmd)
    return cur.fetchall()
def main():
    """Crawl mzitu list pages and store gallery/image urls in MySQL.

    Connects to the `mmspiderdb` database (creating it and its two tables
    on first use), then walks up to `maxpage` list pages, inserting each
    gallery into title_table and each image src into img_table while
    skipping rows that already exist.

    :return: None
    """
    rawurl = "http://www.mzitu.com/page/"  # list-page url prefix
    maxpage = 150
    mmSpiderDbInfo = {
        "host": "localhost",
        "port": 3306,
        "user": "root",
        "passwd": "789987110",
        "charset": "utf8",
        "db": "mmspiderdb"
    }
    host = mmSpiderDbInfo["host"]
    port = mmSpiderDbInfo["port"]
    user = mmSpiderDbInfo["user"]
    passwd = mmSpiderDbInfo["passwd"]
    database = mmSpiderDbInfo["db"]

    log = mylogging.Logging()
    b = internet.Browse()
    b.__log__ = log

    try:
        db = pymysql.connect(host=host, port=port,
                             user=user, passwd=passwd,
                             db=database, charset="utf8")
    except pymysql.err.InternalError:
        # Database does not exist yet: connect server-only, then create
        # and select it.
        try:
            db = pymysql.connect(host=host, port=port,
                                 user=user, passwd=passwd,
                                 charset="utf8")
        except Exception as e:
            log.put(e, "MMSpider.main() Create DataBase", "warning")
            exit()
        cur = db.cursor(cursor=pymysql.cursors.DictCursor)
        cur.execute("CREATE DATABASE mmspiderdb")
        cur.execute("USE mmspiderdb")
    else:
        cur = db.cursor(cursor=pymysql.cursors.DictCursor)

    # IF NOT EXISTS replaces the old try/except-pass, which silently
    # swallowed *every* pymysql.Error, not just "table exists".
    cur.execute("CREATE TABLE IF NOT EXISTS title_table ("
                "title_id INT PRIMARY KEY AUTO_INCREMENT, "
                "title CHAR(80), url CHAR(100)) "
                "CHARACTER SET utf8;")
    cur.execute("CREATE TABLE IF NOT EXISTS img_table ("
                "img_id INT PRIMARY KEY AUTO_INCREMENT, "
                "name CHAR(80), src CHAR(100), title_id INT) "
                "CHARACTER SET utf8;")

    for i in range(1, maxpage + 1):
        i = str(i).zfill(2)  # zero-pad single digits, as the site expects
        url = rawurl + i + "/"  # assemble the list-page url
        print("正在访问第%s页,url=%s" % (i, url))
        content = b.get(url, decoding=False)
        if not content:  # fetch failed — log and move on
            log.put(content, "访问列表")
            continue
        imglist = searchImgList(content)  # all (title, url) pairs on this page
        for (title, url) in imglist:
            # title/url come from a scraped page: always bind them as
            # parameters, never splice into the SQL text (injection and
            # quote-escaping bugs).
            cur.execute("SELECT title_id, url FROM title_table "
                        "WHERE title = %s;", (title,))
            result = cur.fetchall()
            if not result:
                try:
                    cur.execute("INSERT INTO title_table (title, url) "
                                "VALUES (%s, %s)", (title, url))
                    db.commit()
                except pymysql.Error as e:
                    log.put(e, "MMspider 插入标题", "abnormal")
                    continue
            print("正在访问[%s]的详情页,url=%s" % (title, url))
            content = b.get(url, decoding=False)
            if not content:  # fetch failed — log and move on
                log.put(content, "MMspider 访问详情页", "abnormal")
                continue
            srclist = searchImgUrl(content)
            cur.execute("SELECT title_id FROM title_table "
                        "WHERE title = %s", (title,))
            result = cur.fetchall()
            title_id = result[0]["title_id"]
            # Existing srcs for this gallery, as a set for O(1) dedup
            # checks instead of scanning a list of row-dicts per src.
            cur.execute("SELECT src FROM img_table "
                        "WHERE title_id = %s", (title_id,))
            known = {row["src"] for row in cur.fetchall()}
            for src in srclist:
                if src in known:
                    continue  # already stored — skip duplicates
                # File name = last path segment of the src url.
                name = urllib.parse.urlparse(src)[2].split("/")[-1]
                try:
                    cur.execute("INSERT INTO img_table (name, src, title_id) "
                                "VALUES (%s, %s, %s)", (name, src, title_id))
                    db.commit()
                except Exception as e:
                    log.put(e, "MMSpider", "warning")
                    continue

    db.close()
    log.close()
    print("MMSpider 爬取完毕!")


if __name__ == "__main__":
    main()
复制代码
附带一个爬取妹子自拍的爬虫:
import internet
from bs4 import BeautifulSoup

# Crawler for the self-shot ("zipai") section: one downloader thread,
# walking every comment page and queueing all <img> srcs it finds.
b = internet.Browse()
d = internet.Download(downloadPath="zipai", downloader=2, downloadWaitTime=2)
d.start()

maxpage = 317
for page in range(1, maxpage + 1):
    url = "http://www.mzitu.com/zipai/comment-page-%s/#comments" % page
    print("正在访问第 %d 页|url=%s" % (page, url))
    soup = BeautifulSoup(b.get(url, decoding=False), "lxml")
    # Data package handed to the downloader: appears to be
    # [save dir, extension, page url, src, src, ...] — verify against
    # internet.Download.
    batch = [".", "jpg", url]
    for img in soup.find_all("img"):
        batch.append(img.attrs["src"])
    d.srcQueue.put(batch)
    # NOTE(review): task_done() is normally the consumer's call; kept
    # here to mirror the original code.
    d.srcQueue.task_done()
复制代码
|
|