summaryrefslogtreecommitdiffstats
path: root/autorss.py
blob: b0a2754d780b61eb49e8b686b964c4625ab04d22 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/python2
"""Find RSS feed from site's LINK tag"""

__author__ = "Mark Pilgrim ([email protected])"
__copyright__ = "Copyright 2002, Mark Pilgrim"
__license__ = "Python"

try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass
import urllib, urlparse
from sgmllib import SGMLParser

BUFFERSIZE = 1024

class LinkParser(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.href = ''
        
    def do_link(self, attrs):
        if not ('rel', 'alternate') in attrs: return
        if not ('type', 'application/rss+xml') in attrs: return
        hreflist = [e[1] for e in attrs if e[0]=='href']
        if hreflist:
            self.href = hreflist[0]
        self.setnomoretags()
    
    def end_head(self, attrs):
        self.setnomoretags()
    start_body = end_head

def getRSSLinkFromHTMLSource(htmlSource):
    """Return the RSS autodiscovery link found in a string of HTML.

    htmlSource -- HTML text to scan.

    Returns the LINK tag's href exactly as the parser recorded it (it is not
    resolved against any base URL), or '' when no link was found or the
    source could not be parsed.
    """
    try:
        parser = LinkParser()
        parser.feed(htmlSource)
    except Exception:
        # Best-effort lookup: any parse failure means "no link".  Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        return ''
    return parser.href
    
def getRSSLink(url):
    """Fetch a page and return the absolute URL of its RSS autodiscovery link.

    url -- address of the HTML page to download.

    The page is read in BUFFERSIZE-byte chunks, and reading stops as soon as
    the parser has switched off tag processing or the stream is exhausted.

    Returns the LINK tag's href joined against ``url``, or '' if the page
    could not be fetched or parsed.

    NOTE(review): when the page contains no matching LINK tag the parser's
    href is '', and urljoin(url, '') yields ``url`` itself, so the page's own
    URL is returned in that case.  Confirm whether callers rely on this
    before changing it.
    """
    try:
        usock = urllib.urlopen(url)
        try:
            parser = LinkParser()
            while not parser.nomoretags:
                chunk = usock.read(BUFFERSIZE)
                # Only an empty read means end of stream; a short read may
                # simply be all the data that has arrived so far.
                if not chunk:
                    break
                parser.feed(chunk)
        finally:
            # Release the connection even if reading or parsing failed.
            usock.close()
        return urlparse.urljoin(url, parser.href)
    except Exception:
        # Best-effort lookup: network and parse errors mean "no link".
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return ''

if __name__ == '__main__':
    # Command-line use: print the RSS link discovered for the given URL.
    import sys
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an IndexError traceback.
        sys.stderr.write('usage: %s URL\n' % sys.argv[0])
        sys.exit(2)
    print(getRSSLink(sys.argv[1]))