OSChina 的 RSS 不是全文输出的,阅读起来不方便,所以写了下面这个脚本:用 Python 抓取 OSChina 最新博客并生成全文 RSS。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
# -*- coding: utf-8 -*-
"""Scrape the newest OSChina blog posts and publish them as a full-text RSS feed.

The script was written for Python 2; Python-2-only syntax is avoided so the
module can also be imported under Python 3.
"""
from bs4 import BeautifulSoup
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 keeps Request/urlopen in urllib.request with the same call shape.
    import urllib.request as urllib2
import datetime
import time
import PyRSS2Gen
from email.utils import formatdate  # lower-case module name works on 2.5+ and 3
import re
import sys
import os

if sys.version_info[0] == 2:
    # Python 2 only: switch implicit str<->unicode conversions from ASCII to
    # UTF-8. reload() is required because site.py removes setdefaultencoding.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class RssSpider(object):
    """Collect recent blog posts from a listing page into a PyRSS2Gen feed.

    Typical use: ``getcontent()`` to fill the feed, then ``SaveRssFile()``
    to write it to disk.
    """

    # The original code read self.baseurl without ever assigning it.
    # NOTE(review): presumably the OSChina blog index (the page that carries
    # the 'RecentBlogs' element) -- confirm the URL.
    DEFAULT_BASEURL = 'http://www.oschina.net/blog'

    USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36')

    # "YYYY?MM?DD hh?mm" where '?' is any single non-space separator.
    TIMESTAMP_RE = re.compile(r'(\d{4})\S(\d{2})\S(\d{2})\s(\d{2})\S(\d{2})')

    def __init__(self, baseurl=DEFAULT_BASEURL):
        """Create an empty feed.

        Args:
            baseurl: URL of the page listing recent blog posts; also used as
                the channel link, which PyRSS2Gen.RSS2 requires.
        """
        self.baseurl = baseurl
        now = datetime.datetime.now()
        # NOTE(review): PyRSS2Gen renders datetimes as GMT; now() is local
        # time, as in the original -- confirm whether UTC is wanted here.
        self.myrss = PyRSS2Gen.RSS2(
            title='OSChina',
            link=baseurl,
            description=str(datetime.date.today()),
            pubDate=now,
            lastBuildDate=now,
            items=[],
        )
        self.xmlpath = r'/var/www/myrss/oschina.xml'

    def useragent(self, url):
        """Fetch *url* with a browser-like User-Agent and return the raw body."""
        i_headers = {'User-Agent': self.USER_AGENT}
        req = urllib2.Request(url, headers=i_headers)
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    @classmethod
    def _parse_timestamp(cls, text):
        """Return the first timestamp found in *text* as a datetime.

        Falls back to the current time when no timestamp is present or the
        matched digits do not form a valid date.
        """
        match = cls.TIMESTAMP_RE.search(text)
        if match:
            try:
                return datetime.datetime(*[int(part) for part in match.groups()])
            except ValueError:
                pass  # e.g. month 13: fall through to the default below
        return datetime.datetime.now()

    def enterpage(self, url):
        """Download one blog post and return it as a PyRSS2Gen.RSSItem.

        The publication time is taken from the 'BlogStat' div and the item
        body from the 'BlogContent' div of the page.
        """
        soup = BeautifulSoup(self.useragent(url))

        stat_div = soup.find('div', {'class': 'BlogStat'})
        # Tag.decode() yields text on both Python 2 and 3, so the non-space
        # separators in the pattern also match multi-byte characters.
        stat_text = stat_div.decode() if stat_div is not None else ''
        # A datetime (not a raw string) lets PyRSS2Gen emit an RFC 822 date.
        # NOTE(review): the page time is presumably local time; see the GMT
        # note in __init__.
        published = self._parse_timestamp(stat_text)

        title_tag = soup.title
        ititle = title_tag.string if title_tag is not None else None
        content_div = soup.find('div', {'class': 'BlogContent'})

        return PyRSS2Gen.RSSItem(
            title=ititle if ititle else url,
            link=url,
            # Avoid publishing the literal text "None" when the div is absent.
            description=str(content_div) if content_div is not None else '',
            pubDate=published,
        )

    def getcontent(self):
        """Fetch the listing page and append one feed item per linked post.

        Raises:
            RuntimeError: if the listing page has no 'RecentBlogs' div.
        """
        soup = BeautifulSoup(self.useragent(self.baseurl))
        recent = soup.find('div', {'id': 'RecentBlogs'})
        if recent is None:
            raise RuntimeError(
                "no <div id='RecentBlogs'> found at %s" % self.baseurl)

        for li in recent.findAll('li'):
            div = li.find('div')
            if div is None:
                continue
            alink = div.find('a')
            if alink is None:
                continue
            link = alink.get('href')
            if not link:
                continue
            print(link)
            try:
                item = self.enterpage(link)
            except IOError as exc:
                # urllib's URLError/HTTPError derive from IOError; one
                # unreachable post should not discard the rest of the feed.
                sys.stderr.write('skipping %s: %s\n' % (link, exc))
                continue
            self.myrss.items.append(item)

    def SaveRssFile(self, filename):
        """Write the feed as UTF-8 XML.

        Args:
            filename: file name resolved against the directory of
                ``self.xmlpath`` (an absolute path is used as given). The
                original ignored this argument and always wrote to
                ``self.xmlpath``; passing 'oschina.xml' still yields that
                same path. An empty value falls back to ``self.xmlpath``.
        """
        if filename:
            target = os.path.join(os.path.dirname(self.xmlpath), filename)
        else:
            target = self.xmlpath

        finallxml = self.myrss.to_xml(encoding='utf-8')
        if not isinstance(finallxml, bytes):
            # Text result (Python 3): encode to match the XML declaration.
            finallxml = finallxml.encode('utf-8')
        # 'wb' truncates any previous feed and writes the bytes untouched.
        with open(target, 'wb') as out:
            out.write(finallxml)


if __name__ == '__main__':
    rssSpider = RssSpider()
    rssSpider.getcontent()
    rssSpider.SaveRssFile('oschina.xml')
以上所述就是本文的全部内容了,希望大家能够喜欢。