爬取古诗文

注意文件编码的处理,以及正则表达式的使用。下面的脚本将会从古诗文网获取杜甫的所有诗文,并保存到 poems.txt 文件内。

#coding=utf-8
import urllib
import re
import sys

def getTitle(poem):
    """Return the title text found in one poem's HTML fragment.

    The first title anchor (a ``<p><a style="font-size:18px ...`` element
    closed by ``</b></a></p>``) is located, and the text between the
    ``"_blank"><b>`` marker and that closing tag sequence is returned.

    Raises IndexError when the fragment contains no title anchor.
    """
    opening_marker = r'"_blank"><b>'
    closing_tags = '</b></a></p>'
    anchor_re = re.compile(r'<p><a style="font-size:18px[\s\S]*?</b></a></p>')

    # Only the first anchor matters; indexing an empty result raises IndexError.
    anchor = anchor_re.findall(poem)[0]

    # NOTE: when the marker is absent, find() yields -1 and the slice simply
    # starts at a fixed offset inside the anchor instead of failing.
    begin = anchor.find(opening_marker) + len(opening_marker)
    end = len(anchor) - len(closing_tags)
    return anchor[begin:end]

def getContent(poem):
    """Return the plain poem text from one poem's HTML fragment.

    The first ``<div class="contson" id="contson...">`` element (up to its
    first ``</div>``) is located, its opening tag is removed, and then a
    fixed set of markup tokens, newlines and spaces is stripped out.

    Raises IndexError when the fragment contains no such div.
    """
    # The id suffix is matched lazily so the opening tag ends at the FIRST
    # `">`. A greedy `.*` would run to the last `">` on the same line and
    # treat any poem text sharing that line as part of the tag.
    block_re = re.compile(
        r'(<div class="contson" id="contson.*?">)[\s\S]*?</div>')
    match = block_re.search(poem)
    if match is None:
        raise IndexError("no contson <div> found in poem fragment")

    opening_tag = match.group(1)
    text = match.group(0).replace(opening_tag, "")

    # Order matters: "<br />" contains a space, so tags must be removed
    # before bare spaces are.
    removable = (
        "<p>", "</p>",
        "<br />",
        "</textarea>", "<textarea>",
        "<strong>", "</strong>",
        "<div>", "</div>",
        "\n",
        " ",
    )
    for token in removable:
        text = text.replace(token, "")
    return text

def getPoem(html):
    """Split a page of HTML into one fragment per poem.

    Each fragment starts at a title anchor (``<p><a style="font-size:18px``)
    and runs through the first ``</div>`` that closes the following
    ``<div class="contson" id="contson...">`` element.

    Returns a list of fragment strings; the list is empty when the page
    contains no poems.
    """
    # The id suffix is matched lazily: a greedy `.*` extends to the last
    # `">` on the line, which merges several poems into a single fragment
    # whenever they share a line (e.g. minified HTML).
    poem_re = re.compile(
        r'<p><a style="font-size:18px[\s\S]*?'
        r'<div class="contson" id="contson.*?">[\s\S]*?</div>')
    return poem_re.findall(html)

def getHtml(url):
    """Fetch *url* and return the raw response body.

    The response object is always closed, even when reading fails, so that
    repeated calls do not accumulate open connections.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


if __name__=="__main__":

    f = open('poems.txt','w')
    f.write('\xEF\xBB\xBF') #设置utf-8文件编码
    for i in range(1,131):
        html = getHtml("http://so.gushiwen.org/search.aspx?type=author&page=" + str(i) + "&value=%E6%9D%9C%E7%94%AB")
        list = getPoem(html)
        print u"\r正在爬取...  (第", i,u"/ 130 页)",
        sys.stdout.flush()
        print >> f,("---- 第" + str(i) + "页 ----")
        for a in list:
            print >> f, ("《" + getTitle(a) + "》")
            print >> f, getContent(a)
            print >> f, ""
    f.close()
    print u"\r文件已保存至 poems.txt",
    sys.stdout.flush()