We're back after a server migration that caused effbot.org to fall over a bit harder than expected. Expect some glitches.

Generating SeeAlso Indexes for the Python Library Reference

December 10, 2005 | Fredrik Lundh

Here’s a simple script that uses the global module index to generate a seealso index for the Python library reference.

Note that it queries python.org for all reference pages, so you shouldn’t run it more often than needed. If you just want a sample file to play with, you can use this copy.

Required components (elementtree and elementtidy) can be found at the effbot.org downloads site.

import os, re, urllib, urlparse

from elementtidy.TidyHTMLTreeBuilder import parse
from elementtree.ElementTree import dump

# Global module index of the Python library reference.  This page is fetched
# and parsed by the script, and the links found in it are resolved against it.
URI = "http://docs.python.org/modindex.html"

# helpers

class NS:
    """Build namespace-qualified names by attribute access.

    An instance wraps a prefix string *uri*; reading any other attribute
    returns ``uri + attribute_name``.  For example, with
    ``ns = NS("{http://example.org}")``, ``ns.table`` evaluates to
    ``"{http://example.org}table"``.
    """

    def __init__(self, uri):
        # Prefix that gets prepended to every attribute name looked up.
        self.uri = uri

    def __getattr__(self, name):
        """Return the prefix followed by *name*.

        Raises AttributeError for special (``__dunder__``) names and for
        ``uri`` itself, so that protocol probes such as
        ``getattr(obj, "__deepcopy__", None)`` see "not defined" instead of
        a non-callable string, and so that a missing ``uri`` (e.g. on a
        bare instance during copy/unpickle) cannot recurse forever.
        """
        # __getattr__ only runs when normal lookup fails, so reaching here
        # with name == "uri" means the instance has no prefix set yet.
        if name == "uri" or (name[:2] == "__" and name[-2:] == "__"):
            raise AttributeError(name)
        return self.uri + name

# Tag-name factory for the XHTML namespace: XHTML.table evaluates to
# "{http://www.w3.org/1999/xhtml}table".
XHTML = NS("{http://www.w3.org/1999/xhtml}")

def innertext(elem):
    """Return all text contained inside *elem*, in document order.

    The result is the element's own text followed, for each child, by the
    child's inner text and the child's tail.  The tail of *elem* itself
    (text after its closing tag) is not part of its content and is left
    out.  Missing text/tail values count as empty strings.
    """
    # Walk direct children and recurse, rather than using a whole-subtree
    # iterator: such an iterator also yields *elem* itself, which would
    # count elem.text twice and pull in elem.tail.
    parts = [elem.text or ""]
    for child in elem:
        parts.append(innertext(child))
        parts.append(child.tail or "")
    return "".join(parts)

def gettitle(href, default):
    """Return the title of the page at *href*, or *default* if not found.

    Only the first 1024 bytes of the page are read and searched.  The
    title must start with at least one digit, dot or whitespace character
    (a section number such as "3.14 "); that leading run is dropped and
    the rest of the <title> contents is returned.  If no such title is
    found in the bytes read, *default* is returned.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    page = urllib.urlopen(href)
    try:
        s = page.read(1024)
    finally:
        # release the connection even if the read fails
        page.close()
    # raw string so the \d and \s escapes are passed to the regex engine as-is
    m = re.search(r"<title>[\d\s.]+(.*)</title>", s)
    if m:
        return m.group(1)
    return default

# get going

# Fetch the module index and parse it into an XHTML element tree.
index_page = urllib.urlopen(URI)
try:
    tree = parse(index_page)
finally:
    # release the connection even if parsing fails
    index_page.close()

# the index is in the second table on the page
tables = tree.findall(".//" + XHTML.table)

# NOTE(review): values below are interpolated into the XML as-is, without
# any escaping -- confirm the hrefs/titles never contain quotes or markup.
print("<seealso target-domain='python'>")

print(" <info xmlns:dc='http://purl.org/dc/elements/1.1/'>")
print("  <dc:title>Python Library Reference</dc:title>")
print("  <dc:identifier>http://docs.python.org/index.html</dc:identifier>")
print("  <dc:language>en</dc:language>")
print(" </info>")

# Emit one <item> per link in the index table.
for a in tables[1].findall(".//" + XHTML.a):
    link = a.get("href")
    text = innertext(a)
    words = text.split()
    if not link or not words:
        # Not a usable entry: without an href, urljoin would just hand back
        # the index URL itself, and without text there is no target name.
        continue
    href = urlparse.urljoin(URI, link)
    # the first word of the link text is used as the lookup target
    target = words[0]
    print(" <item href='%s'>" % href)
    # one extra request per entry; falls back to the link text as title
    print("  <title>%s</title>" % gettitle(href, text))
    print("  <target>%s</target>" % target)
    print(" </item>")

print("</seealso>")