import string
import urllib
from xml.sax.xmlreader import AttributesImpl


class GEDCOMParser:
    """A GEDCOM file format parser based on saxlib.Parser.

    Each GEDCOM line is turned into SAX-style events on ``handler``: the
    tag becomes an element, an xref id becomes an ``ID`` attribute, a
    pointer value becomes an ``IDREF`` attribute, and any other value
    becomes character data.  Everything is wrapped in a synthetic
    ``GEDCOM`` root element.

    ``handler`` must provide ``startDocument``, ``endDocument``,
    ``startElement``, ``endElement``, ``characters`` and
    ``ignorableWhitespace``.
    """

    def __init__(self, handler=None):
        self.handler = handler

    def parse(self, systemId):
        """Parse a document from a system identifier.

        ``systemId`` is handed to ``urlopen``.  While parsing is in
        progress it is available as ``self.sysID``; the attribute is
        removed again afterwards, whether or not parsing succeeded.
        """
        # urlopen moved to urllib.request in Python 3; support both.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen

        self.sysID = systemId
        try:
            stream = urlopen(systemId)
            try:
                self.parseFile(stream)
            finally:
                stream.close()
        finally:
            del self.sysID

    def parseFile(self, fileobj):
        """Parse a document from a file-like object.

        Only ``fileobj.readline()`` is used.  Lines consisting solely of
        whitespace are skipped.
        """
        self._stack = []
        self.lineno = 0
        self.inlined = 0
        self.handler.startDocument()
        self._processElement(-1, 'GEDCOM', AttributesImpl({}))
        while True:
            line = fileobj.readline()
            if not line:
                # EOF.
                break
            if isinstance(line, bytes) and not isinstance(line, str):
                # Binary stream (e.g. urlopen on Python 3).
                # NOTE(review): UTF-8 is an assumed default; GEDCOM files
                # may declare another character set -- confirm.
                line = line.decode('utf-8', 'replace')
            line = line.strip()
            if line:
                self._processLine(line)
        # Close all opened elements.
        self._processElement(-1)
        self.handler.endDocument()

    def _processLine(self, line):
        """Split one stripped, non-empty line and emit its element.

        The expected shape is ``LEVEL [@XREF@] TAG [VALUE]``.  A line
        without a space, a non-numeric level or an unterminated xref id
        raises ``ValueError``.
        """
        # Counts processed (non-blank) lines, not physical lines.
        self.lineno = lineno = self.lineno + 1
        delim_pos = line.index(' ')
        level = int(line[:delim_pos])
        attrs = {}
        if line[delim_pos + 1] == '@':
            # This line has an xref_id.
            end_xref_id = line.index('@', delim_pos + 2)
            attrs['ID'] = line[delim_pos + 2:end_xref_id]
            delim_pos = end_xref_id + 1
        end_tag = line.find(' ', delim_pos + 1)
        line_value = None
        if end_tag < 0:
            # No value: the tag runs to the end of the line.
            tag = line[delim_pos + 1:].upper()
        else:
            tag = line[delim_pos + 1:end_tag].upper()
            line_value = line[end_tag + 1:].strip()
            # '@@' and '@#' open escape sequences, not pointers.  The
            # length guard keeps a lone '@' from raising IndexError; it
            # is passed through as ordinary text instead.
            if (len(line_value) > 1 and line_value[0] == '@'
                    and line_value[1] not in ('@', '#')):
                # line_value is a line_pointer.
                attrs['IDREF'] = line_value[1:-1]
                line_value = None
            else:
                line_value = self._translateSpecialAttrs(
                    tag, attrs, line_value)
        attrs = AttributesImpl(attrs)
        # Kept from the original: leaves the line number in a local for
        # traceback formatters that honour __traceback_info__.
        __traceback_info__ = lineno
        self._processElement(level, tag, attrs, line_value)

    def _processElement(self, level, tag=None, attrs=None, line_value=None):
        """Close elements at or below ``level``, then optionally open one.

        The synthetic root sits at level -1, so an element's level is its
        stack index minus one.  With ``tag`` left as None only the
        closing half runs; ``level=-1`` then closes everything, root
        included.  ``self.inlined`` records whether the most recently
        opened element received character data, in which case its end
        tag is written on the same line without indentation.
        """
        stack = self._stack
        cur_level = len(stack) - 2
        h = self.handler
        while cur_level >= level:
            # End previously opened elements.
            old_tag = stack.pop()
            if not self.inlined:
                h.ignorableWhitespace(' ' * (cur_level + 1))
            h.endElement(old_tag)
            cur_level = cur_level - 1
            self.inlined = 0
            h.ignorableWhitespace('\n')
        if tag is not None and cur_level < level:
            # Open a new element.
            if not self.inlined:
                h.ignorableWhitespace(' ' * (cur_level + 2))
            stack.append(tag)
            h.startElement(tag, attrs)
            if line_value is not None:
                self.inlined = 1
                h.characters(line_value)
            else:
                self.inlined = 0
                h.ignorableWhitespace('\n')

    def _translateSpecialAttrs(self, tag, attrs, line_value):
        """Move known GEDCOM escape sequences out of the value.

        For a ``DATE`` value starting with ``@#D``, the text up to the
        next ``@`` is stored in ``attrs['CALENDAR']`` and the stripped
        remainder is returned.  Any other value, including an escape
        with no closing ``@``, is returned unchanged.
        """
        if tag == 'DATE' and line_value[:3] == '@#D':
            end_cal = line_value.find('@', 3)
            # Without a terminator find() returns -1, which would slice
            # garbage out of the value; leave such input alone.
            if end_cal >= 0:
                attrs['CALENDAR'] = line_value[3:end_cal]
                line_value = line_value[end_cal + 1:].strip()
        return line_value


if __name__ == '__main__':
    # Translate GEDCOM to GEDML
    import sys
    from xml.sax.saxutils import XMLGenerator

    p = GEDCOMParser(XMLGenerator(sys.stdout))
    if len(sys.argv) > 1:
        # File specified
        with open(sys.argv[1]) as f:
            p.parseFile(f)
    else:
        # Parse stdin
        p.parseFile(sys.stdin)