# GEDCOMParser.py
#
# Source page: Home > Members > hathawsh > GEDCOMParser.py (file view)
# File details: size 4 K, file type text/x-python
#
# File contents:

import string
import urllib
from xml.sax.xmlreader import AttributesImpl

class GEDCOMParser:
    '''A GEDCOM file format parser based on saxlib.Parser.

    Each non-blank input line becomes one element named after the line's
    tag (upper-cased).  Elements are nested according to the level number
    that starts each line, and the whole document is wrapped in a
    synthetic ``GEDCOM`` root element.  Results are reported to
    ``handler`` through these calls: ``startDocument``, ``endDocument``,
    ``startElement``, ``endElement``, ``characters`` and
    ``ignorableWhitespace``.

    Attributes generated from the GEDCOM line syntax:

    * ``ID``       -- the cross-reference id of a line (``0 @I1@ INDI``).
    * ``IDREF``    -- a pointer value (``1 FAMC @F1@``).
    * ``CALENDAR`` -- the ``@#D...@`` calendar escape of a DATE value.
    '''

    def __init__(self, handler=None):
        """Remember the object that will receive the parse events.

        ``handler`` may be omitted here, but it must be set before
        ``parseFile`` is called because events are sent to it directly.
        """
        self.handler = handler

    def parse(self, systemId):
        """Parse a document from a system identifier.

        ``systemId`` is passed to ``urllib.urlopen``.  While parsing is in
        progress it is available as ``self.sysID``; the attribute is
        removed again afterwards, even if parsing fails.
        """
        # The original body referred to an undefined name ``sysID`` and
        # therefore always raised NameError.
        self.sysID = systemId
        try:
            # NOTE(review): ``urllib.urlopen`` is the Python 2 spelling; a
            # Python 3 port needs ``urllib.request.urlopen`` plus a
            # decision on how to decode the bytes it returns.
            stream = urllib.urlopen(systemId)
            try:
                self.parseFile(stream)
            finally:
                # Release the connection/file handle we opened ourselves.
                stream.close()
        finally:
            del self.sysID

    def parseFile(self, fileobj):
        """Parse a document from a file-like object.

        ``fileobj`` only needs a ``readline()`` method that returns an
        empty string at end of input.  Blank lines are skipped.  The
        caller keeps ownership of ``fileobj``; it is not closed here.
        """
        self._stack = []    # tags of the currently open elements
        self.lineno = 0     # number of non-blank lines processed so far
        self.inlined = 0    # true right after text was written inline
        self.handler.startDocument()
        # Level -1 is reserved for the synthetic root element.
        self._processElement(-1, 'GEDCOM', AttributesImpl({}))
        while 1:
            line = fileobj.readline()
            if not line:
                # EOF.
                break
            line = line.strip()
            if line:
                self._processLine(line)
        # Close all opened elements, including the root.
        self._processElement(-1)
        self.handler.endDocument()

    def _processLine(self, line):
        """Split one stripped, non-empty GEDCOM line and emit its element.

        The expected layout is ``level [@xref_id@] tag [value]``.  Raises
        ValueError when the line contains no space, when the level is not
        an integer, or when an xref id is not terminated by ``@``.
        """
        self.lineno = lineno = self.lineno + 1
        delim_pos = line.index(' ')
        level = int(line[:delim_pos])
        attrs = {}

        if line[delim_pos + 1] == '@':
            # This line has an xref_id.
            end_xref_id = line.index('@', delim_pos + 2)
            attrs['ID'] = line[delim_pos + 2:end_xref_id]
            # Continue scanning for the tag after the closing '@'.
            delim_pos = end_xref_id + 1

        end_tag = line.find(' ', delim_pos + 1)
        line_value = None
        if end_tag < 0:
            # Nothing follows the tag.
            tag = line[delim_pos + 1:].upper()
        else:
            tag = line[delim_pos + 1:end_tag].upper()
            line_value = line[end_tag + 1:].strip()
            # '@@' and '@#' start an escape rather than a pointer.  The
            # length guard keeps a value consisting of a single '@' from
            # raising IndexError; such a value is passed on as text.
            if (len(line_value) > 1 and line_value[0] == '@'
                    and line_value[1] not in ('@', '#')):
                # line_value is a line_pointer: drop the enclosing '@'s.
                attrs['IDREF'] = line_value[1:-1]
                line_value = None
            else:
                line_value = self._translateSpecialAttrs(
                    tag, attrs, line_value)

        attrs = AttributesImpl(attrs)
        # Presumably read by Zope-style traceback formatters so that errors
        # raised below can be tied to an input line -- TODO confirm.
        __traceback_info__ = lineno
        self._processElement(level, tag, attrs, line_value)

    def _processElement(self, level, tag=None, attrs=None, line_value=None):
        """Close elements at ``level`` or deeper, then optionally open one.

        With ``tag`` left as None only the closing step is performed.
        ``line_value``, when given, is emitted as character data directly
        after the start tag.  Indentation and newlines are reported via
        ``ignorableWhitespace``.
        """
        stack = self._stack
        # The root element sits at level -1, so a stack of n open elements
        # means the innermost one is at level n - 2.
        cur_level = len(stack) - 2
        h = self.handler
        while cur_level >= level:
            # End previously opened elements.
            old_tag = stack.pop()
            if not self.inlined:
                # The end tag starts a line of its own, so indent it.
                h.ignorableWhitespace(' ' * (cur_level + 1))
            h.endElement(old_tag)
            cur_level = cur_level - 1
            self.inlined = 0
            h.ignorableWhitespace('\n')
        if tag is not None and cur_level < level:
            # Open a new element.
            if not self.inlined:
                h.ignorableWhitespace(' ' * (cur_level + 2))
            stack.append(tag)
            h.startElement(tag, attrs)
            if line_value is not None:
                # Keep the text and the later end tag on the same line.
                self.inlined = 1
                h.characters(line_value)
            else:
                self.inlined = 0
                h.ignorableWhitespace('\n')

    def _translateSpecialAttrs(self, tag, attrs, line_value):
        """Move known GEDCOM escape sequences from the value into ``attrs``.

        Currently only the calendar escape of DATE values is handled:
        ``@#DJULIAN@ 1 JAN 1700`` stores ``CALENDAR='JULIAN'`` in
        ``attrs`` and returns ``'1 JAN 1700'``.  Any other value is
        returned unchanged, including a calendar escape that lacks its
        closing ``@``.
        """
        if tag == 'DATE' and line_value[:3] == '@#D':
            end_cal = line_value.find('@', 3)
            if end_cal >= 0:
                # Make the date calendar specifier an attribute
                # rather than part of the date text.
                attrs['CALENDAR'] = line_value[3:end_cal]
                line_value = line_value[end_cal + 1:].strip()
        return line_value


if __name__ == '__main__':
    # Translate GEDCOM to GEDML: read the file named by the first
    # command-line argument (or stdin when none is given) and write the
    # resulting XML to stdout.
    import sys
    from xml.sax.saxutils import XMLGenerator

    parser = GEDCOMParser(XMLGenerator(sys.stdout))
    if len(sys.argv) > 1:
        # File specified
        source = open(sys.argv[1])
        try:
            parser.parseFile(source)
        finally:
            # We opened this file, so we close it, even if parsing fails.
            source.close()
    else:
        # Parse stdin; it is not ours to close.
        parser.parseFile(sys.stdin)