import xml.sax
import xml.sax.handler
"""ifarchivexml:
This module parses the Master-Index.xml file that is available at
<http://www.ifarchive.org/indexes/Master-Index.xml>.
You can use this module like this:

    import ifarchivexml
    (root, dirs, files) = ifarchivexml.parse('Master-Index.xml')

root is an IFDir object representing the root directory ('if-archive').
dirs is a dictionary mapping directory names ('if-archive/games', for
example) to IFDir objects. files is a dictionary mapping file pathname
('if-archive/games/playgame.FAQ', for example) to IFFile objects.
You can display the contents of either an IFDir or IFFile object with
the obj.dump() method.
"""
# Parser states stored in IFAParser.context; they record which element
# the SAX handler is currently inside.
(
    CONTEXT_NONE,      # outside any <directory> or <file> entry
    CONTEXT_DIR,       # inside a top-level <directory> entry
    CONTEXT_FILE,      # inside a <file> entry
    CONTEXT_DIRLINK,   # inside a <symlink type="dir"> of a file entry
    CONTEXT_FILELINK,  # inside any other <symlink> of a file entry
) = range(5)
class IFDir:
    """A directory entry from the IF Archive master index.

    A new instance only has empty ``subdirs`` and ``files`` lists. The
    remaining attributes read by this class (``name``, ``xdir``,
    ``parent``, ``parentobj``, ``subdircount``, ``filecount``) must be
    assigned from outside; in this module IFAParser does that while it
    parses the index. ``description`` stays None unless one is assigned.
    """

    description = None

    def __init__(self):
        self.subdirs = []
        self.files = []

    def __repr__(self):
        return "<IFDir '" + self.name + "'>"

    def dump(self):
        """Print every field of this directory to standard output.

        Raises AttributeError if one of the externally assigned
        attributes has not been set yet.
        """

        def show(*items):
            # Join the items with single spaces, as the Python 2 print
            # statement did, but via a call form that also runs on
            # Python 3.
            print(' '.join(['%s' % (item,) for item in items]))

        show('name: ', self.name)
        show('xdir: ', self.xdir)
        show('parent: ', self.parent, '(' + str(self.parentobj) + ')')
        show('subdircount:', self.subdircount)
        show('filecount: ', self.filecount)
        if self.description is not None:
            show('description:')
            show(self.description)
        show('subdirs:')
        for subdir in self.subdirs:
            show(' ', str(subdir))
        show('files:')
        for entry in self.files:
            show(' ', str(entry))
class IFFile:
    """A file entry from the IF Archive master index.

    ``size``, ``date``, ``md5``, ``rawdate``, ``symlink`` and
    ``description`` default to None. The other attributes read by this
    class (``path``, ``name``, ``directory``, ``directoryobj``,
    ``orderindex`` and, for symlinks, ``symlinkname``/``symlinkxdir`` or
    ``symlinkpath``) must be assigned from outside; in this module
    IFAParser does that while it parses the index.
    """

    size = None
    date = None
    md5 = None
    rawdate = None
    symlink = None
    description = None

    def __repr__(self):
        return "<IFFile '" + self.path + "'>"

    def dump(self):
        """Print every field of this file to standard output.

        Raises AttributeError if one of the externally assigned
        attributes has not been set yet.
        """

        def show(*items):
            # Join the items with single spaces, as the Python 2 print
            # statement did, but via a call form that also runs on
            # Python 3.
            print(' '.join(['%s' % (item,) for item in items]))

        show('path: ', self.path)
        show('name: ', self.name)
        show('directory: ', self.directory,
             '(' + str(self.directoryobj) + ')')
        if self.symlink == 'dir':
            show('symlink to dir:')
            show(' name: ', self.symlinkname)
            show(' xdir: ', self.symlinkxdir)
        if self.symlink == 'file':
            show('symlink to file:')
            show(' path: ', self.symlinkpath)
        show('size: ', self.size)
        show('date: ', self.date)
        show('rawdate:', self.rawdate)
        show('md5: ', self.md5)
        show('orderindex:', self.orderindex)
        if self.description is not None:
            show('description:')
            show(self.description)
class IFAParser(xml.sax.handler.ContentHandler):
    """SAX content handler that builds IFDir and IFFile objects.

    While the document is parsed, completed directories are stored in
    ``self.directories`` keyed by their name and completed files in
    ``self.files`` keyed by their path. Each file also receives an
    ``orderindex`` counting the files in the order they were completed.
    When the ``ifarchive`` element closes, the objects are linked to
    their parents (see ifarchive_end).

    ``self.elements`` maps an element name to a ``(start, end)`` pair of
    handlers; elements that are not listed are ignored. ``self.context``
    holds one of the CONTEXT_* constants and decides which object the
    text of a leaf element is stored on.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.grabbeddata = ''
        self.curdir = None
        # Initialised so it can always be read, even before the first
        # <file> element has been seen.
        self.curfile = None
        self.directories = {}
        self.files = {}
        self.orderindex = 0
        self.context = CONTEXT_NONE
        self.elements = {
            'ifarchive': (self.ignore_start, self.ifarchive_end),
            'directory': (self.directory_start, self.directory_end),
            'file': (self.file_start, self.file_end),
            'name': (self.grabdata_start, self.name_end),
            'xdir': (self.grabdata_start, self.xdir_end),
            'filecount': (self.grabdata_start, self.filecount_end),
            'subdircount': (self.grabdata_start, self.subdircount_end),
            'parent': (self.grabdata_start, self.parent_end),
            'path': (self.grabdata_start, self.path_end),
            'size': (self.grabdata_start, self.size_end),
            'date': (self.grabdata_start, self.date_end),
            'rawdate': (self.grabdata_start, self.rawdate_end),
            'md5': (self.grabdata_start, self.md5_end),
            'description': (self.grabdata_start, self.description_end),
            'symlink': (self.symlink_start, self.symlink_end),
        }

    # -- SAX callbacks -----------------------------------------------------

    def characters(self, data):
        """Append a chunk of character data to the text buffer."""
        self.grabbeddata += data

    def startElement(self, name, attrs):
        """Call the start handler registered for *name*, if there is one."""
        handlers = self.elements.get(name)
        if handlers is None:
            return
        handlers[0](attrs)

    def endElement(self, name):
        """Call the end handler registered for *name*, if there is one."""
        handlers = self.elements.get(name)
        if handlers is None:
            return
        handlers[1]()

    # -- generic handlers --------------------------------------------------
    # The start handlers keep their original parameter name ``dict`` so
    # that the method signatures stay unchanged.

    def ignore_start(self, dict):
        """Start handler that does nothing."""
        pass

    def ignore_end(self):
        """End handler that does nothing."""
        pass

    def grabdata_start(self, dict):
        """Start handler that empties the text buffer."""
        self.grabbeddata = ''

    def grabdata(self):
        """Return the buffered text and empty the buffer."""
        dat = self.grabbeddata
        self.grabbeddata = ''
        return dat

    def _store(self, fields, convert=None):
        """Store the buffered text on the current directory or file.

        *fields* maps a CONTEXT_* value to the attribute name to set in
        that context. If the current context is not listed, nothing
        happens and the buffer is left untouched. In CONTEXT_DIR the
        attribute is set on ``self.curdir``, in every other context on
        ``self.curfile``. *convert*, if given, is applied to the text
        before it is stored.
        """
        attr = fields.get(self.context)
        if attr is None:
            return
        data = self.grabdata()
        if self.context == CONTEXT_DIR:
            target = self.curdir
        else:
            target = self.curfile
        if target is not None:
            if convert is not None:
                data = convert(data)
            setattr(target, attr, data)

    # -- structural elements -----------------------------------------------

    def directory_start(self, dict):
        """Begin a directory entry, or the directory name of a file.

        Outside any entry a new IFDir is started. Inside a file entry the
        element carries the file's directory name, so only the text
        buffer is emptied.
        """
        if self.context == CONTEXT_NONE:
            self.curdir = IFDir()
            self.context = CONTEXT_DIR
        elif self.context == CONTEXT_FILE:
            self.grabdata_start(None)

    def directory_end(self):
        """Finish a directory entry, or store a file's directory name."""
        if self.context == CONTEXT_DIR:
            self.directories[self.curdir.name] = self.curdir
            self.curdir = None
            self.context = CONTEXT_NONE
        else:
            self._store({CONTEXT_FILE: 'directory'})

    def file_start(self, dict):
        """Begin a file entry when not already inside another entry."""
        if self.context == CONTEXT_NONE:
            self.curfile = IFFile()
            self.context = CONTEXT_FILE

    def file_end(self):
        """Finish a file entry: number it and register it by its path."""
        if self.context == CONTEXT_FILE:
            path = self.curfile.path
            self.curfile.orderindex = self.orderindex
            self.orderindex += 1
            self.files[path] = self.curfile
            self.curfile = None
            self.context = CONTEXT_NONE

    def symlink_start(self, dict):
        """Enter a symlink inside a file entry.

        A ``type`` attribute equal to 'dir' marks a directory link; any
        other value is treated as a file link. Raises KeyError if the
        attribute is missing.
        """
        if self.context == CONTEXT_FILE:
            if dict['type'] == 'dir':
                self.context = CONTEXT_DIRLINK
                self.curfile.symlink = 'dir'
            else:
                self.context = CONTEXT_FILELINK
                self.curfile.symlink = 'file'

    def symlink_end(self):
        """Leave a symlink and return to the enclosing file entry."""
        if self.context in (CONTEXT_DIRLINK, CONTEXT_FILELINK):
            self.context = CONTEXT_FILE

    # -- leaf elements -----------------------------------------------------

    def name_end(self):
        self._store({
            CONTEXT_DIR: 'name',
            CONTEXT_FILE: 'name',
            CONTEXT_DIRLINK: 'symlinkname',
        })

    def parent_end(self):
        self._store({CONTEXT_DIR: 'parent'})

    def xdir_end(self):
        self._store({
            CONTEXT_DIR: 'xdir',
            CONTEXT_DIRLINK: 'symlinkxdir',
        })

    def subdircount_end(self):
        self._store({CONTEXT_DIR: 'subdircount'}, int)

    def filecount_end(self):
        self._store({CONTEXT_DIR: 'filecount'}, int)

    def path_end(self):
        self._store({
            CONTEXT_FILE: 'path',
            CONTEXT_FILELINK: 'symlinkpath',
        })

    def size_end(self):
        self._store({CONTEXT_FILE: 'size'}, int)

    def date_end(self):
        self._store({CONTEXT_FILE: 'date'})

    def rawdate_end(self):
        self._store({CONTEXT_FILE: 'rawdate'}, int)

    def md5_end(self):
        self._store({CONTEXT_FILE: 'md5'})

    def description_end(self):
        self._store({
            CONTEXT_DIR: 'description',
            CONTEXT_FILE: 'description',
        })

    # -- final linking -----------------------------------------------------

    def ifarchive_end(self):
        """Link every directory and file to its parent directory object.

        A directory whose ``parent`` is the empty string gets
        ``parentobj = None``; every other directory is appended to its
        parent's ``subdirs``. Every file is appended to the ``files`` of
        its directory. Raises KeyError if a referenced directory was not
        seen in the document.
        """
        for directory in self.directories.values():
            parent = directory.parent
            if parent == '':
                directory.parentobj = None
            else:
                directory.parentobj = self.directories[parent]
                directory.parentobj.subdirs.append(directory)
        for entry in self.files.values():
            entry.directoryobj = self.directories[entry.directory]
            entry.directoryobj.files.append(entry)
def parse(filename):
    """Parse a Master-Index.xml file and return its contents.

    Returns a tuple ``(rootdir, directories, files)``: the IFDir stored
    under the name 'if-archive', the dictionary of all directories and
    the dictionary of all files collected by IFAParser.

    Raises KeyError if the document contains no directory named
    'if-archive'. Errors from opening the file or from the XML parser
    are passed through.
    """
    parser = IFAParser()
    # Binary mode hands the raw bytes to the XML parser, which decodes
    # them according to the document's own encoding declaration. The
    # with-block closes the file even when parsing fails.
    with open(filename, 'rb') as fl:
        xml.sax.parse(fl, parser)
    rootdir = parser.directories['if-archive']
    return (rootdir, parser.directories, parser.files)