File contents
import string
import urllib
from xml.sax.xmlreader import AttributesImpl
class GEDCOMParser:
    """A GEDCOM parser that reports a file's structure as SAX-style events.

    Every non-blank GEDCOM line (``level [@xref@] tag [value]``) becomes one
    element named after the upper-cased tag.  Elements are nested according
    to the line's level number and the whole document is wrapped in a
    synthetic ``GEDCOM`` root element.  A cross-reference id is reported as
    an ``ID`` attribute, a pointer value (``@F1@``) as an ``IDREF``
    attribute, and a ``@#D...@`` calendar escape on a ``DATE`` line as a
    ``CALENDAR`` attribute.

    The handler must provide ``startDocument``, ``endDocument``,
    ``startElement``, ``endElement``, ``characters`` and
    ``ignorableWhitespace``.
    """

    def __init__(self, handler=None):
        """Remember *handler*, the object that receives the parse events.

        The handler may also be assigned to ``self.handler`` afterwards, but
        it has to be set before ``parse`` or ``parseFile`` is called.
        """
        self.handler = handler

    def parse(self, systemId):
        """Parse a document from a system identifier.

        *systemId* is opened with ``urlopen``; the resulting stream is fed
        to ``parseFile`` and closed afterwards.  While parsing is in
        progress the identifier is available as ``self.sysID``.
        """
        # Imported locally so the method works on Python 3 (urllib.request)
        # as well as on Python 2 (urllib) without touching module imports.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        self.sysID = systemId
        try:
            stream = urlopen(systemId)
            try:
                self.parseFile(stream)
            finally:
                stream.close()
        finally:
            del self.sysID

    def parseFile(self, fileobj):
        """Parse a document from a file-like object.

        *fileobj* only needs a ``readline`` method.  Lines may be text or
        bytes; bytes are decoded as UTF-8 with undecodable bytes replaced.
        Blank lines are skipped and are not counted in ``self.lineno``.
        """
        self._stack = []
        self.lineno = 0
        self.inlined = 0
        self.handler.startDocument()
        self._processElement(-1, 'GEDCOM', AttributesImpl({}))
        while True:
            line = fileobj.readline()
            if not line:
                # EOF.
                break
            # True only for Python 3 bytes objects (on Python 2 ``bytes`` is
            # ``str``, so byte strings are passed through unchanged there).
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            line = line.strip()
            if line:
                self._processLine(line)
        # Close all opened elements, including the GEDCOM root.
        self._processElement(-1)
        self.handler.endDocument()

    def _processLine(self, line):
        """Split one stripped, non-empty GEDCOM line and emit its element.

        Raises ValueError when the line does not start with an integer
        level followed by a space, or when a cross-reference id is not
        terminated by ``@``.
        """
        self.lineno = lineno = self.lineno + 1
        try:
            delim_pos = line.index(' ')
            level = int(line[:delim_pos])
        except ValueError:
            raise ValueError(
                'GEDCOM line %d does not start with "<level> ": %r'
                % (lineno, line))
        attrs = {}
        # The line was stripped, so a character always follows the space.
        if line[delim_pos + 1] == '@':
            # This line has an xref_id.
            try:
                end_xref_id = line.index('@', delim_pos + 2)
            except ValueError:
                raise ValueError(
                    'GEDCOM line %d has an unterminated xref id: %r'
                    % (lineno, line))
            attrs['ID'] = line[delim_pos + 2:end_xref_id]
            # Continue after the closing '@'; the tag starts one further on.
            delim_pos = end_xref_id + 1
        end_tag = line.find(' ', delim_pos + 1)
        line_value = None
        if end_tag < 0:
            # The tag runs to the end of the line; there is no value.
            tag = line[delim_pos + 1:].upper()
        else:
            tag = line[delim_pos + 1:end_tag].upper()
            line_value = line[end_tag + 1:].strip()
            # A pointer looks like @XREF@.  '@@' is an escaped at-sign and
            # '@#' starts an escape sequence, so neither is a pointer.  The
            # length check keeps a lone '@' from raising IndexError.
            if (len(line_value) > 1 and line_value[0] == '@'
                    and line_value[1] not in ('@', '#')):
                attrs['IDREF'] = line_value[1:-1]
                line_value = None
            else:
                line_value = self._translateSpecialAttrs(
                    tag, attrs, line_value)
        attrs = AttributesImpl(attrs)
        # Never read in this module; kept from the original, presumably for
        # traceback formatters that display ``__traceback_info__`` locals.
        __traceback_info__ = lineno
        self._processElement(level, tag, attrs, line_value)

    def _processElement(self, level, tag=None, attrs=None, line_value=None):
        """Close elements down to *level* and optionally open a new one.

        Every open element whose level is greater than or equal to *level*
        is ended first.  If *tag* is given, a new element is then started
        and *line_value*, when not None, is emitted as its character data.
        Calling with ``level=-1`` and no tag closes everything, root
        included.
        """
        stack = self._stack
        # The stack also holds the root (level -1), hence the offset of 2.
        cur_level = len(stack) - 2
        h = self.handler
        while cur_level >= level:
            # End previously opened elements.
            old_tag = stack[-1]
            if not self.inlined:
                # Not directly after inline text, so indent the end tag.
                h.ignorableWhitespace(' ' * (cur_level + 1))
            h.endElement(old_tag)
            del stack[-1]
            cur_level = cur_level - 1
            self.inlined = 0
            h.ignorableWhitespace('\n')
        if tag is not None and cur_level < level:
            # Open a new element.
            if not self.inlined:
                h.ignorableWhitespace(' ' * (cur_level + 2))
            stack.append(tag)
            h.startElement(tag, attrs)
            if line_value is not None:
                # Keep the text on the same line as the start tag; the end
                # tag will then follow without indentation.
                self.inlined = 1
                h.characters(line_value)
            else:
                self.inlined = 0
                h.ignorableWhitespace('\n')

    def _translateSpecialAttrs(self, tag, attrs, line_value):
        """Move known GEDCOM escape sequences from the value into *attrs*.

        Currently only the calendar escape of ``DATE`` lines is handled:
        ``@#DJULIAN@ 1 JAN 1700`` stores ``JULIAN`` under ``CALENDAR`` in
        *attrs* (modified in place) and returns ``1 JAN 1700``.  Any other
        value, including an escape without a closing ``@``, is returned
        unchanged.
        """
        if tag == 'DATE' and line_value[:3] == '@#D':
            end_cal = line_value.find('@', 3)
            if end_cal >= 0:
                # Make the date calendar specifier an attribute
                # rather than part of the date text.
                attrs['CALENDAR'] = line_value[3:end_cal]
                line_value = line_value[end_cal + 1:].strip()
        return line_value
if __name__ == '__main__':
    # Translate GEDCOM to GEDML: read the file named by the first
    # command-line argument (or stdin when none is given) and write the
    # resulting XML to stdout.
    import sys
    from xml.sax.saxutils import XMLGenerator
    p = GEDCOMParser(XMLGenerator(sys.stdout))
    if len(sys.argv) > 1:
        # File specified; the with-block closes it even if parsing fails.
        with open(sys.argv[1]) as f:
            p.parseFile(f)
    else:
        # Parse stdin
        p.parseFile(sys.stdin)