A typhoon is raging outside. In the afternoon I wrote a crawler for Renren photo albums, and in the evening, with nothing better to do, I wrote one for Qzone (QQ空间) blogs as well. By default it only crawls the Qzone of the QQ number you supply: fill in a QQ number in main.py, or wrap the call in a loop to handle many QQ numbers (a sketch of that follows main.py below). Images inside the posts are ignored for now; pulling them down would be easy enough (see the sketch after the full source). I'll polish it when I have time.
# -*- coding: utf-8 -*-
# Filename: main.py
# Author: 华亮
#

from QQ import QQ

if __name__ == "__main__":
    # First argument is the QQ number, second is the output filename
    QQ.DownloadBlog("414112390", "blog.txt")
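As noted above, crawling several accounts is just a loop around the same call. A minimal sketch of a batch main.py, assuming you list the QQ numbers by hand (the second number below is a placeholder, not a real account):

# -*- coding: utf-8 -*-
# Hypothetical batch variant of main.py: one output file per QQ number.
from QQ import QQ

if __name__ == "__main__":
    qq_numbers = ["414112390", "123456789"]  # placeholder QQ numbers
    for number in qq_numbers:
        # Write each account's posts into its own file, e.g. blog_414112390.txt
        QQ.DownloadBlog(number, "blog_%s.txt" % number)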
# -*- coding: utf-8 -*-
# Filename: QQ.py
# Author: 华亮
#

import urllib2
import re
from HTMLParser import HTMLParser


# Parses a Qzone blog-list page and collects (title, url) pairs.
class QQBlogList(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_key_div = False
        self.in_ul = False
        self.in_li = False
        self.in_a = False
        self.blogList = []
        self.lasturl = ""

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "div" and attrs.get("class") == "bloglist":
            self.in_key_div = True
        elif self.in_key_div:
            if tag == "ul":
                self.in_ul = True
            elif self.in_ul and tag == "li":
                self.in_li = True
            elif self.in_li and tag == "a" and "href" in attrs:
                self.in_a = True
                self.lasturl = attrs["href"]

    def handle_data(self, data):
        # Text inside an <a> within the blog list is a post title.
        if self.in_a:
            self.blogList.append((data, self.lasturl))

    def handle_endtag(self, tag):
        if self.in_key_div and tag == "div":
            self.in_key_div = False
        elif self.in_ul and tag == "ul":
            self.in_ul = False
        elif self.in_li and tag == "li":
            self.in_li = False
        elif self.in_a and tag == "a":
            self.in_a = False

class QQ:
    """
    QQ
    Author: 华亮
    Description: automatically download Qzone blog posts.
    """

    @staticmethod
    def DownloadBlog(qq, filename=None):
        print "Start"
        blogurl = "http://qz.qq.com/%s/bloglist?page=0" % qq
        QQ.__Download(blogurl, filename)
        print "End"

    @staticmethod
    def __Download(starturl, filename):
        url = starturl

        # Reuse one opener so session cookies persist across requests.
        cookieHandler = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookieHandler)

        # Walk every page of the blog list, collecting (title, url) pairs.
        blogs = []
        while True:
            req = urllib2.Request(url)
            result = opener.open(req)
            text = result.read()

            parser = QQBlogList()
            parser.feed(text)
            parser.close()
            blogs.extend(parser.blogList)

            # Follow the "next page" link ("下一页" means "next page").
            nextpagePattern = re.compile(r'<a href="(.*?)" title="下一页" class="bt_next"><span>下一页</span></a>')
            nextpage = nextpagePattern.search(text)
            if nextpage:
                url = nextpage.group(1)
            else:
                break

        if not filename:
            filename = "blog.txt"
        output = open(filename, "w")

        # Download each post and keep only the entry body.
        blogContentPattern = re.compile(r'<div class="entry_content">(.*?)</div>', re.S)
        for title, url in blogs:
            print "Downloading", title
            req = urllib2.Request(url)
            result = opener.open(req)
            output.write("\n" + title + "\n")
            ret = blogContentPattern.search(result.read())
            if ret:
                # Turn paragraph tags into line breaks; other markup is left as-is.
                output.write(ret.group(1).replace("<p>", "\n"))
        output.close()
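
For the images the crawler currently skips: one quick-and-dirty route is to pull every <img> src out of the entry body with a regex and fetch each file. A minimal sketch along those lines; the helper name, the folder layout, and the assumption that src attributes are absolute, double-quoted URLs are all mine, not part of QQ.py:

# -*- coding: utf-8 -*-
# Hypothetical helper: download the images referenced in a post's HTML.
import os
import re
import urllib

def download_images(html, folder="images"):
    # Fetch every <img src="http..."> in html into folder; return local paths.
    if not os.path.isdir(folder):
        os.makedirs(folder)
    saved = []
    # Naive pattern: assumes double-quoted, absolute src attributes.
    for i, src in enumerate(re.findall(r'<img[^>]+src="(http[^"]+)"', html)):
        name = os.path.basename(src.split("?")[0]) or "image"
        path = os.path.join(folder, "%04d_%s" % (i, name))
        urllib.urlretrieve(src, path)  # plain GET: no cookies, no retries
        saved.append(path)
    return saved

Hooked into __Download, it would be called on ret.group(1) right before the content is written out.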