From df10fc12e3d2e87cedaccd7c56044d3eacd77052 Mon Sep 17 00:00:00 2001 From: Matthias Jobst Date: Sat, 15 Jul 2017 08:28:01 +0200 Subject: [PATCH 1/2] Upgrade to python 3.6 --- gedcom.py | 22 +++++++++++----------- test.py | 12 ++++++------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/gedcom.py b/gedcom.py index 9a1e40a..4038e5d 100644 --- a/gedcom.py +++ b/gedcom.py @@ -80,7 +80,7 @@ def __parse(self,file): def __parse_line(self,number,line): # each line should have: Level SP (Pointer SP)? Tag (SP Value)? (SP)? NL # parse the line - parts = string.split(line) + parts = str.split(line) place = 0 l = self.__level(number,parts,place) place += 1 @@ -162,12 +162,12 @@ def __value(self,number,parts,place): while place < len(parts): vlist.append(parts[place]) place += 1 - v = string.join(vlist) + v = ' '.join(vlist) return v def __error(self,number,text): error = "Gedcom format error on line " + str(number) + ': ' + text - raise GedcomParseError, error + raise GedcomParseError(error) def __count(self): # Count number of individuals @@ -179,7 +179,7 @@ def __count(self): def __print(self): for e in self.element_list: - print string.join([str(e.level()),e.pointer(),e.tag(),e.value()]) + print (str.join([str(e.level()),e.pointer(),e.tag(),e.value()])) class GedcomParseError(Exception): @@ -189,7 +189,7 @@ def __init__(self, value): self.value = value def __str__(self): - return `self.value` + return self.value class Element: """Gedcom element @@ -435,9 +435,9 @@ def name(self): # some older Gedcom files don't use child tags but instead # place the name in the value of the NAME tag if e.value() != "": - name = string.split(e.value(),'/') - first = string.strip(name[0]) - last = string.strip(name[1]) + name = str.split(e.value(),'/') + first = str.strip(name[0]) + last = str.strip(name[1]) else: for c in e.children(): if c.tag() == "GIVN": @@ -470,7 +470,7 @@ def birth_year(self): if e.tag() == "BIRT": for c in e.children(): if c.tag() == "DATE": - datel = 
string.split(c.value()) + datel = str.split(c.value()) date = datel[len(datel)-1] if date == "": return -1 @@ -503,7 +503,7 @@ def death_year(self): if e.tag() == "DEAT": for c in e.children(): if c.tag() == "DATE": - datel = string.split(c.value()) + datel = str.split(c.value()) date = datel[len(datel)-1] if date == "": return -1 @@ -561,7 +561,7 @@ def marriage_years(self): if g.tag() == "MARR": for h in g.children(): if h.tag() == "DATE": - datel = string.split(h.value()) + datel = str.split(h.value()) date = datel[len(datel)-1] try: dates.append(int(date)) diff --git a/test.py b/test.py index 2caeecb..5f599fd 100644 --- a/test.py +++ b/test.py @@ -92,17 +92,17 @@ def run(self,info='names',surname='A',given='Maria', def print_header(self,header): """Print a header.""" print - print "="*70 - print header - print "="*70 + print ("="*70) + print (header) + print ("="*70) def print_record(self,e): """Print an element.""" if self.info == 'names': (first,last) = e.name() - print first, last + print (first, last) elif self.info == 'gedcom': - print e.get_individual() + print (e.get_individual()) def surname(self,match): @@ -208,7 +208,7 @@ def missing(self): if e.value().startswith('@'): f = self.g.element_dict().get(e.value(),None) if f == None: - print e.value() + print (e.value()) if __name__ == "__main__": t = Test() From 02cf8580d1e2a45cef2b45649fabc0eb2bb5137b Mon Sep 17 00:00:00 2001 From: Matthias Jobst Date: Sun, 16 Jul 2017 17:36:13 +0200 Subject: [PATCH 2/2] Some refactoring to handle format and encoding issues Gedcom takes an iterator of lines instead of a file directly. This allows the caller to apply an encoding when reading the file. date.py is a new module that provides a splitDate function to handle the special cases encountered when splitting dates. Added Query class to split concerns between storage and query. 
--- date.py | 15 +++++++++++ gedcom.py | 58 ++++++++++++++++++++++------------------ test.py | 80 ++++++++++++++++++++++++++++++++++--------------------- 3 files changed, 97 insertions(+), 56 deletions(-) create mode 100644 date.py diff --git a/date.py b/date.py new file mode 100644 index 0000000..3840ee7 --- /dev/null +++ b/date.py @@ -0,0 +1,15 @@ +import datetime + +def splitDate(date): + """Split the date and return the year + Add your conversion rules as needed""" + # See here for a suitable format string: https://docs.python.org/3/library/datetime.html?highlight=datetime#strftime-strptime-behavior + retdate = datetime.datetime(1,1,1) + try: + retdate = datetime.datetime.strptime(date, "%d.%m.%Y") + except ValueError: + try: + retdate = datetime.datetime.strptime(date, "%Y") + except ValueError: + pass + return retdate.year diff --git a/gedcom.py b/gedcom.py index 4038e5d..519304e 100644 --- a/gedcom.py +++ b/gedcom.py @@ -24,7 +24,7 @@ __all__ = ["Gedcom", "Element", "GedcomParseError"] # Global imports -import string +from date import splitDate class Gedcom: """Gedcom parser @@ -37,7 +37,7 @@ class Gedcom: """ - def __init__(self,file): + def __init__(self,lines): """Initialize a Gedcom parser. You must supply a Gedcom file.""" self.__element_list = [] self.__element_dict = {} @@ -45,7 +45,7 @@ def __init__(self,file): self.__current_level = -1 self.__current_element = self.__element_top self.__individuals = 0 - self.__parse(file) + self.__parse(lines) def element_list(self): """Return a list of all the elements in the Gedcom file. The @@ -64,12 +64,10 @@ def element_dict(self): # Private methods - def __parse(self,file): - # open file + def __parse(self,lines): # go through the lines - f = open(file) number = 1 - for line in f.readlines(): + for line in lines: # Skip over some junk that Rootsmagic puts in gedcom files. 
if number == 1 and ord(line[0]) == 239: line = line[3:] @@ -173,7 +171,8 @@ def __count(self): # Count number of individuals self.__individuals = 0 for e in self.__element_list: - if e.individual(): + q = Query(e) + if q.individual(): self.__individuals += 1 @@ -250,6 +249,10 @@ def value(self): """Return the value of this element.""" return self.__value + def dict(self): + """Return the dictionary of this element.""" + return self.__dict + def children(self): """Return the child elements of this element.""" return self.__children @@ -266,9 +269,15 @@ def add_parent(self,element): """Add a parent element to this element.""" self.__parent = element +class Query(object): + """Query GEDCOM Element tree""" + + def __init__(self,element): + self.element = element + def individual(self): """Check if this element is an individual.""" - return self.tag() == "INDI" + return self.element.tag() == "INDI" # criteria matching @@ -417,9 +426,9 @@ def marriage_range_match(self,year1,year2): def families(self): """Return a list of all of the family elements of a person.""" results = [] - for e in self.children(): + for e in self.element.children(): if e.tag() == "FAMS": - f = self.__dict.get(e.value(),None) + f = e.dict().get(e.value(),None) if f != None: results.append(f) return results @@ -430,7 +439,7 @@ def name(self): last = "" if not self.individual(): return (first,last) - for e in self.children(): + for e in self.element.children(): if e.tag() == "NAME": # some older Gedcom files don't use child tags but instead # place the name in the value of the NAME tag @@ -466,12 +475,11 @@ def birth_year(self): date = "" if not self.individual(): return date - for e in self.children(): + for e in self.element.children(): if e.tag() == "BIRT": for c in e.children(): if c.tag() == "DATE": - datel = str.split(c.value()) - date = datel[len(datel)-1] + date = splitDate(c.value()) if date == "": return -1 try: @@ -499,12 +507,11 @@ def death_year(self): date = "" if not self.individual(): 
return date - for e in self.children(): + for e in self.element.children(): if e.tag() == "DEAT": for c in e.children(): if c.tag() == "DATE": - datel = str.split(c.value()) - date = datel[len(datel)-1] + date = splitDate(c.value()) if date == "": return -1 try: @@ -532,7 +539,7 @@ def marriage(self): return (date,place) for e in self.children(): if e.tag() == "FAMS": - f = self.__dict.get(e.value(),None) + f = e.dict().get(e.value(),None) if f == None: return (date,place) for g in f.children(): @@ -552,17 +559,16 @@ def marriage_years(self): dates = [] if not self.individual(): return dates - for e in self.children(): + for e in self.element.children(): if e.tag() == "FAMS": - f = self.__dict.get(e.value(),None) + f = e.dict().get(e.value(),None) if f == None: return dates for g in f.children(): if g.tag() == "MARR": for h in g.children(): if h.tag() == "DATE": - datel = str.split(h.value()) - date = datel[len(datel)-1] + date = splitDate(h.value()) try: dates.append(int(date)) except: @@ -578,10 +584,10 @@ def get_individual(self): def get_family(self): """Return this element any all elements in its families.""" - result = [self] - for e in self.children(): + result = [self.element] + for e in self.element.children(): if e.tag() == "HUSB" or e.tag() == "WIFE" or e.tag() == "CHIL": - f = self.__dict.get(e.value()) + f = e.dict().get(e.value()) if f != None: result.append(f) return result diff --git a/test.py b/test.py index 5f599fd..cee10fe 100644 --- a/test.py +++ b/test.py @@ -19,10 +19,16 @@ # Global imports import optparse +import codecs # Local imports import gedcom +# Local functions +def getLinesFromFile(file, encoding): + f = codecs.open(file,encoding=encoding) + return f.readlines() + class Test: """Test driver for the Gedcom parser.""" @@ -30,7 +36,7 @@ def __init__(self): """ Initialize test class.""" self.parse_options() self.filename = self.options.filename - self.g = gedcom.Gedcom(self.filename) + self.g = 
gedcom.Gedcom(getLinesFromFile(self.filename, encoding='utf-8')) self.info = 'names' def parse_options(self): @@ -45,11 +51,11 @@ def parse_options(self): (self.options,self.args) = parser.parse_args() def run(self,info='names',surname='A',given='Maria', - birth=1857,birth_start=1850,birth_end=1860, - death=1857,death_start=1850,death_end=1860, - marriage=1857,marriage_start=1850,marriage_end=1860, - family_surname='Nicotra',family_given='Maria', - criteria='surname=N:birthrange=1820-1840:deathrange=1900-1910'): + birth=1968,birth_start=1940,birth_end=1960, + death=1922,death_start=1900,death_end=1950, + marriage=1805,marriage_start=1850,marriage_end=1900, + family_surname='Steuer',family_given='Maria', + criteria='surname=J:birthrange=1800-1910:deathrange=1900-2010'): """Run a standard series of tests. Keyword arguments: @@ -99,8 +105,12 @@ def print_header(self,header): def print_record(self,e): """Print an element.""" if self.info == 'names': - (first,last) = e.name() - print (first, last) + q = gedcom.Query(e) + (first,last) = q.name() + try: + print (first, last) + except UnicodeEncodeError: + pass elif self.info == 'gedcom': print (e.get_individual()) @@ -109,64 +119,72 @@ def surname(self,match): """Show matching records for a surname substring.""" self.print_header('Surname - %s' % (match)) for e in self.g.element_list(): - if e.individual(): - if e.surname_match(match): + q = gedcom.Query(e) + if q.individual(): + if q.surname_match(match): self.print_record(e) def given(self,match): """Show matching records for a given name substring.""" self.print_header('Given - %s' % (match)) for e in self.g.element_list(): - if e.individual(): - if e.given_match(match): + q = gedcom.Query(e) + if q.individual(): + if q.given_match(match): self.print_record(e) def birth(self,year): """Show matching records for a birth year.""" self.print_header('Born %d' % (year)) for e in self.g.element_list(): - if e.individual(): - if e.birth_year_match(year): + q = gedcom.Query(e) 
+ if q.individual(): + if q.birth_year_match(year): self.print_record(e) def birth_range(self,year1,year2): """Show matching records for a birth year range.""" self.print_header('Born %d - %d' % (year1,year2)) for e in self.g.element_list(): - if e.individual(): - if e.birth_range_match(year1,year2): + q = gedcom.Query(e) + if q.individual(): + if q.birth_range_match(year1,year2): self.print_record(e) def death(self,year): """Show matching records for a death year.""" self.print_header('Died %d' %(year)) for e in self.g.element_list(): - if e.individual(): - if e.death_year_match(year): + q = gedcom.Query(e) + if q.individual(): + if q.death_year_match(year): self.print_record(e) def death_range(self,year1,year2): """Show matching records for a death year range.""" self.print_header('Died %d - %d' % (year1,year2)) for e in self.g.element_list(): - if e.individual(): - if e.death_range_match(year1,year2): + q = gedcom.Query(e) + if q.individual(): + if q.death_range_match(year1,year2): self.print_record(e) def marriage(self,year): """Show matching records for a marriage year.""" self.print_header('Married %d' % (year)) for e in self.g.element_list(): - if e.individual(): - if e.marriage_year_match(year): + q = gedcom.Query(e) + if q.individual(): + if q.marriage_year_match(year): self.print_record(e) def marriage_range(self,year1,year2): """Show matching records for a marriage year range.""" self.print_header('Married %d - %d' % (year1,year2)) for e in self.g.element_list(): - if e.individual(): - if e.marriage_range_match(year1,year2): + q = gedcom.Query(e) + if q.individual(): + if q.marriage_range_match(year1,year2): self.print_record(e) def family_of(self,surname,given): @@ -176,10 +194,11 @@ def family_of(self,surname,given): """ self.print_header('Family of %s %s' % (given,surname)) for e in self.g.element_list(): - if e.individual(): - if e.surname_match(surname) and e.given_match(given): - for f in e.families(): - for e in f.get_family(): + q = 
gedcom.Query(e) + if q.individual(): + if q.surname_match(surname) and q.given_match(given): + for f in q.families(): + for e in gedcom.Query(f).get_family(): self.print_record(e) def criteria_match(self,criteria): @@ -194,8 +213,9 @@ def criteria_match(self,criteria): """ self.print_header('Criteria Matching: %s' % (criteria)) for e in self.g.element_list(): - if e.individual(): - if e.criteria_match(criteria): + q = gedcom.Query(e) + if q.individual(): + if q.criteria_match(criteria): self.print_record(e) def missing(self):