A typhoon is raging outside. In the afternoon I wrote a crawler for Renren photo albums, and in the evening, with nothing better to do, I wrote one for Qzone (QQ空间) blogs as well. By default it only crawls the Qzone of the QQ number you supply: fill in a QQ number in main.py, or wrap the call in a loop to handle many QQ numbers (a sketch of that follows main.py below). Images inside the posts are ignored for now; pulling them down would be easy enough (see the sketch after the full source). I'll polish it when I have time.
# -*- coding: utf-8 -*-
# Filename: main.py
# Author: 华亮
#

from QQ import QQ

if __name__ == "__main__":
    # First argument is the QQ number, second is the output filename
    QQ.DownloadBlog("414112390", "blog.txt")
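As noted above, crawling several accounts is just a loop around the same call. A minimal sketch of a batch main.py, assuming you list the QQ numbers by hand (the second number below is a placeholder, not a real account):

# -*- coding: utf-8 -*-
# Hypothetical batch variant of main.py: one output file per QQ number.
from QQ import QQ

if __name__ == "__main__":
    qq_numbers = ["414112390", "123456789"]  # placeholder QQ numbers
    for number in qq_numbers:
        # Write each account's posts into its own file, e.g. blog_414112390.txt
        QQ.DownloadBlog(number, "blog_%s.txt" % number)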
# -*- coding: utf-8 -*-
# Filename: QQ.py
# Author: 华亮
#

import urllib2
import re
from HTMLParser import HTMLParser


# Parses a Qzone blog-list page and collects (title, url) pairs.
class QQBlogList(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_key_div = False
        self.in_ul = False
        self.in_li = False
        self.in_a = False
        self.blogList = []
        self.lasturl = ""

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "div" and attrs.get("class") == "bloglist":
            self.in_key_div = True
        elif self.in_key_div:
            if tag == "ul":
                self.in_ul = True
            elif self.in_ul and tag == "li":
                self.in_li = True
            elif self.in_li and tag == "a" and "href" in attrs:
                self.in_a = True
                self.lasturl = attrs["href"]

    def handle_data(self, data):
        # Text inside an <a> within the blog list is a post title.
        if self.in_a:
            self.blogList.append((data, self.lasturl))

    def handle_endtag(self, tag):
        if self.in_key_div and tag == "div":
            self.in_key_div = False
        elif self.in_ul and tag == "ul":
            self.in_ul = False
        elif self.in_li and tag == "li":
            self.in_li = False
        elif self.in_a and tag == "a":
            self.in_a = False

class QQ:
    """
    QQ
    Author: 华亮
    Description: automatically download Qzone blog posts.
    """

    @staticmethod
    def DownloadBlog(qq, filename=None):
        print "Start"
        blogurl = "http://qz.qq.com/%s/bloglist?page=0" % qq
        QQ.__Download(blogurl, filename)
        print "End"

    @staticmethod
    def __Download(starturl, filename):
        url = starturl

        # Reuse one opener so session cookies persist across requests.
        cookieHandler = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookieHandler)

        # Walk every page of the blog list, collecting (title, url) pairs.
        blogs = []
        while True:
            req = urllib2.Request(url)
            result = opener.open(req)
            text = result.read()

            parser = QQBlogList()
            parser.feed(text)
            parser.close()
            blogs.extend(parser.blogList)

            # Follow the "next page" link ("下一页" means "next page").
            nextpagePattern = re.compile(r'<a href="(.*?)" title="下一页" class="bt_next"><span>下一页</span></a>')
            nextpage = nextpagePattern.search(text)
            if nextpage:
                url = nextpage.group(1)
            else:
                break

        if not filename:
            filename = "blog.txt"
        output = open(filename, "w")

        # Download each post and keep only the entry body.
        blogContentPattern = re.compile(r'<div class="entry_content">(.*?)</div>', re.S)
        for title, url in blogs:
            print "Downloading", title
            req = urllib2.Request(url)
            result = opener.open(req)
            output.write("\n" + title + "\n")
            ret = blogContentPattern.search(result.read())
            if ret:
                # Turn paragraph tags into line breaks; other markup is left as-is.
                output.write(ret.group(1).replace("<p>", "\n"))
        output.close()
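
For the images the crawler currently skips: one quick-and-dirty route is to pull every <img> src out of the entry body with a regex and fetch each file. A minimal sketch along those lines; the helper name, the folder layout, and the assumption that src attributes are absolute, double-quoted URLs are all mine, not part of QQ.py:

# -*- coding: utf-8 -*-
# Hypothetical helper: download the images referenced in a post's HTML.
import os
import re
import urllib

def download_images(html, folder="images"):
    # Fetch every <img src="http..."> in html into folder; return local paths.
    if not os.path.isdir(folder):
        os.makedirs(folder)
    saved = []
    # Naive pattern: assumes double-quoted, absolute src attributes.
    for i, src in enumerate(re.findall(r'<img[^>]+src="(http[^"]+)"', html)):
        name = os.path.basename(src.split("?")[0]) or "image"
        path = os.path.join(folder, "%04d_%s" % (i, name))
        urllib.urlretrieve(src, path)  # plain GET: no cookies, no retries
        saved.append(path)
    return saved

Hooked into __Download, it would be called on ret.group(1) right before the content is written out.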