From 9a8f7f69bd6f9780f1964bdafd0ab670cadad55f Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Fri, 1 Apr 2022 14:52:54 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Handle=20multiple=20mods:role?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- qurator/modstool/modstool.py | 46 +++++++++++++++---------- qurator/modstool/tests/test_modstool.py | 38 ++++++++++++++++++++ 2 files changed, 65 insertions(+), 19 deletions(-) diff --git a/qurator/modstool/modstool.py b/qurator/modstool/modstool.py index e122922..cb0c424 100755 --- a/qurator/modstool/modstool.py +++ b/qurator/modstool/modstool.py @@ -165,6 +165,25 @@ class TagGroup: warnings.warn('Changed scriptTerm authority to lower case') return self + def merge_sub_tags_to_set(self): + value = {} + + sub_dicts = [mods_to_dict(e) for e in self.group] + sub_tags = {k for d in sub_dicts for k in d.keys()} + for sub_tag in sub_tags: + s = set() + for d in sub_dicts: + v = d.get(sub_tag) + if v: + # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a + # German language document. + if isinstance(v, set): + s.update(v) + else: + s.add(v) + value[sub_tag] = s + return value + def sorted_groupby(iterable, key=None): """ @@ -303,21 +322,8 @@ def mods_to_dict(mods, raise_errors=True): k = 'genre-{}'.format(authority) if authority is not None else 'genre' value[k] = {e.text for e in group if e.attrib.get('authority') == authority} elif tag == '{http://www.loc.gov/mods/v3}language': - # Make languageTerm/scriptTerm sets - sub_dicts = [mods_to_dict(e) for e in group] - sub_tags = {k for d in sub_dicts for k in d.keys()} - for sub_tag in sub_tags: - s = set() - for d in sub_dicts: - v = d.get(sub_tag) - if v: - # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a - # German language document. - if isinstance(v, set): - s.update(v) - else: - s.add(v) - value['language_{}'.format(sub_tag)] = s + value["language"] = TagGroup(tag, group) \ + .merge_sub_tags_to_set() elif tag == '{http://www.loc.gov/mods/v3}languageTerm': value['languageTerm'] = TagGroup(tag, group) \ .has_attributes({'authority': 'iso639-2b', 'type': 'code'}) \ @@ -333,11 +339,13 @@ def mods_to_dict(mods, raise_errors=True): for n, e in enumerate(group): value['name{}'.format(n)] = mods_to_dict(e, raise_errors) elif tag == '{http://www.loc.gov/mods/v3}role': - value['role'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) + value["role"] = TagGroup(tag, group) \ + .has_no_attributes() \ + .merge_sub_tags_to_set() elif tag == '{http://www.loc.gov/mods/v3}roleTerm': value['roleTerm'] = TagGroup(tag, group) \ - .is_singleton().has_attributes({'authority': 'marcrelator', 'type': 'code'}) \ - .text() + .has_attributes({'authority': 'marcrelator', 'type': 'code'}) \ + .text_set() elif tag == '{http://www.loc.gov/mods/v3}namePart': for e in group: if not e.attrib.get('type'): @@ -412,7 +420,7 @@ def mets_to_dict(mets, raise_errors=True): raise ValueError('Unknown tag "{}"'.format(tag)) else: pass - + return value diff --git a/qurator/modstool/tests/test_modstool.py b/qurator/modstool/tests/test_modstool.py index 10ec3b6..b13586a 100644 --- a/qurator/modstool/tests/test_modstool.py +++ b/qurator/modstool/tests/test_modstool.py @@ -33,6 +33,44 @@ def test_multitple_language_languageTerm(): """) assert d['language_languageTerm'] == {'ger', 'lat'} +def test_role_roleTerm(): + d = dict_fromstring(""" + + + Wurm, Mary + Mary + 078789583 + Wurm + + cmp + + + + """) + assert d['name0_role_roleTerm'] == {'cmp'} + +def test_multiple_role_roleTerm(): + """ + Multiple mods:role/mods:roleTerm should be merged into one column. + """ + d = dict_fromstring(""" + + + Wurm, Mary + Mary + 078789583 + Wurm + + cmp + + + aut + + + + """) + assert d['name0_role_roleTerm'] == {'cmp', 'aut'} + def test_scriptTerm(): """ Same language using different scripts have one mods:language, with multiple scriptTerms inside.