Wednesday, December 31, 2008

Using SGMLParser With IronPython

{

Mark Pilgrim's excellent Dive Into Python has a section on using SGMLParser. Having seen nothing similar (and imagining its many uses!), I thought I'd give it a whirl in IronPython. A good proof of concept, I figured, would be creating a database out of link-heavy sites. Since I visit Arts & Letters Daily every so often, and the closet intellectual in me likes to hang onto what I find there, I thought I'd target it:

import urllib2
import sgmllib
from sgmllib import SGMLParser

import clr
clr.AddReference("System.Data")
clr.AddReference("System.Net")
from System import *
from System.Data import *
from System.Net import *

class AlReader(SGMLParser):
    """Collect the hyperlinks of an HTML page, grouped by category.

    Two kinds of ``<a>`` tag are recognised:

    * ``<a name="...">`` starts a new category; its ``name`` value is used
      as the category of every link that follows.
    * ``<a href="...">`` is a link.  Its category, visible text and URL are
      recorded once the anchor ends.

    Attributes populated while parsing:
        links:   list of ``(category, text, url)`` tuples, one per link.
        urls:    list of link URLs, index-aligned with ``pieces``.
        pieces:  list of ``"category|text"`` strings, one per link.
        counter: number of links recorded so far.
        prePend: the category currently in effect.
        track:   1 while inside an ``<a href>`` element, otherwise 0.
    """

    def reset(self):
        """Reset the parser and discard everything collected so far."""
        SGMLParser.reset(self)
        self.urls = []
        self.pieces = []
        self.links = []
        self.track = 0
        self.prePend = "No Category"
        self.counter = 0
        # State of the link that is currently open (only valid while track).
        self._url = None
        self._text = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        attrs is the list of ``(name, value)`` attribute pairs supplied by
        the parser.
        """
        # Anchors cannot nest, so a new <a> implicitly ends a link whose
        # closing tag is missing.
        self._finish_link()

        href = [v for k, v in attrs if k == "href"]
        key = [v for k, v in attrs if k == "name"]
        if href:
            # Exactly one URL per anchor keeps urls/pieces/links aligned
            # even if the tag carries a duplicated href attribute.
            self._url = href[0]
            self._text = []
            self.track = 1
        elif key:
            # Use the value of the name attribute itself, wherever it
            # appears in the tag, rather than whichever attribute is first.
            self.prePend = key[0]

    def handle_data(self, text):
        """Accumulate text that appears inside an open link."""
        # The parser may deliver a link's text in several calls (for
        # example around an entity such as &amp;) or in none at all (an
        # image-only link), so chunks are buffered until the anchor ends
        # instead of being treated as one record each.
        if self.track:
            self._text.append(text)

    def end_a(self):
        """Handle a closing ``</a>`` tag."""
        self._finish_link()

    def _finish_link(self):
        """Record the open link, if any, and leave link-tracking mode."""
        if not self.track:
            return
        self.track = 0
        text = "".join(self._text)
        self._text = []
        self.pieces.append("|".join([self.prePend, text]))
        self.urls.append(self._url)
        self.links.append((self.prePend, text, self._url))
        self.counter = self.counter + 1

    def _records(self):
        """Return all ``(category, text, url)`` tuples seen so far.

        A link whose anchor is still open (the closing tag has not been
        parsed yet) is included without changing the parser state.
        """
        records = list(self.links)
        if self.track:
            records.append((self.prePend, "".join(self._text), self._url))
        return records

    def get_links(self):
        """Return every link as a ``"category|text|url"`` string."""
        return ["|".join(record) for record in self._records()]

    def get_link_datatable(self):
        """Return the links as a DataTable with Category, Site and Url columns."""
        d = DataTable()
        d.Columns.Add(DataColumn("Category", Type.GetType("System.String")))
        d.Columns.Add(DataColumn("Site", Type.GetType("System.String")))
        d.Columns.Add(DataColumn("Url", Type.GetType("System.String")))

        # Fill the rows from the tuples rather than by splitting the
        # "|"-joined strings, so a "|" inside link text or a URL cannot
        # break the three-way assignment.
        for category, site, url in self._records():
            newRow = d.NewRow()
            newRow["Category"] = category
            newRow["Site"] = site
            newRow["Url"] = url
            d.Rows.Add(newRow)

        return d

# Download the Arts & Letters Daily front page.
response = urllib2.urlopen("http://www.aldaily.com")
try:
    page = response.read()
finally:
    # Release the HTTP connection even if the read fails.
    response.close()

# Parse the page and collect its links.
a = AlReader()
a.feed(page)
# feed() may hold back an incomplete trailing chunk; close() forces the
# parser to process whatever is still buffered.
a.close()
linkdata = a.get_link_datatable()
# write it out to prove we got it.
ds = DataSet()
ds.Tables.Add(linkdata)
ds.WriteXml("c:\\temp\\arts and letters links.xml")


If you find this interesting, do make sure you look at Pilgrim's chapter on HTML Processing.



}

No comments: