#!/usr/bin/env python
"""Print every ``application/*+xml`` media type listed on the IANA registry page.

Downloads the IANA "application" media-type page, walks the rows of its HTML
tables and prints one ``application/<subtype>`` line to stdout for each
subtype whose name ends in ``+xml``.

Targets Python 2 with BeautifulSoup 3 (``urllib.urlopen`` and the
``BeautifulSoup`` module are the Python 2 / BS3 spellings).
"""

from contextlib import closing
from re import match
from urllib import urlopen

from BeautifulSoup import BeautifulSoup

url = 'http://www.iana.org/assignments/media-types/application/'

# The Python 2 urllib response object is not a context manager, so wrap it
# in closing() to release the connection even if read() raises.
with closing(urlopen(url)) as response:
    html = response.read()

soup = BeautifulSoup(html)

for rows in soup.findAll('tr'):
    cells = rows.findAll('td')
    # Only rows with exactly three cells are considered; anything else
    # (header rows, layout rows) is skipped.
    if len(cells) != 3:
        continue

    # The second cell is used as the subtype name; it can be an anchor or
    # just plain text.
    # NOTE(review): assumes the registry table keeps the name in column 2.
    mime_type = cells[1].string
    if not mime_type:
        # find() looks only inside this cell. The previous findNext() call
        # searched the rest of the document, so a cell without an anchor
        # silently picked up a link from a later cell or row.
        anchor = cells[1].find('a')
        mime_type = anchor.string if anchor is not None else None
    if not mime_type:
        # No usable text in this cell; skip the row instead of crashing
        # on None.strip().
        continue

    mime_type = mime_type.strip()
    # Raw string keeps the regex escape "\+" out of Python's own
    # string-escape handling.
    if match(r'.*\+xml$', mime_type):
        # Single-argument print(...) emits the same text under the
        # Python 2 print statement.
        print("application/%s" % mime_type)