Sgmlop Patterns

July 11, 2002 | Fredrik Lundh

Here’s a code snippet that extracts A HREF anchors from a webpage:

import sgmlop
import urllib

class anchor_parser:
    """sgmlop parser target that collects the HREF values of A start tags.

    After parsing, the collected values are available in ``anchors``.
    """

    def __init__(self):
        # href attribute values, in the order the start tags were reported
        self.anchors = []

    def finish_starttag(self, tag, attrs):
        """Handle a start tag event; remember the href of every "a" tag.

        tag -- the element name reported by the parser
        attrs -- iterable of (name, value) attribute pairs
        """
        if tag == "a":
            for k, v in attrs:
                if k == "href":
                    self.anchors.append(v)

def sgmlop_parse(target, data):
    """Helper to feed parser events for *data* into *target*.

    target -- object whose handler methods (e.g. finish_starttag) receive
        the events
    data -- the complete document text to parse

    Returns *target*, so callers can read whatever it collected.
    """
    parser = sgmlop.SGMLParser()
    parser.register(target)  # without this, no events reach the target
    parser.feed(data)  # the whole document is passed in a single call
    parser.close() # we're done
    return target

def getpage(page):
    """Helper to fetch an entire web page.

    page -- URL of the page to fetch

    Returns the complete body as read from the opened URL.
    """
    stream = urllib.urlopen(page)
    try:
        return stream.read()
    finally:
        # release the connection even if the read fails
        stream.close()

def getanchors(page):
    """Return the list of anchors collected from the page at URL *page*.

    Fetches the page with getpage(), runs it through sgmlop_parse() with a
    fresh anchor_parser as the target, and returns that target's anchors.
    """
    collector = anchor_parser()
    html = getpage(page)
    return sgmlop_parse(collector, html).anchors

# Demo: fetch a page and print the anchors found on it.
# NOTE(review): the URL argument is an empty string -- presumably the
# address was lost from this snippet; supply a real page URL before running.
print getanchors("")

