🎨 Rename qurator.modstool to qurator.mods4pandas

2025-08-16 21:19:52 +02:00 · 2022-07-04 19:11:10 +02:00 · 2022-07-04 19:11:10 +02:00 · 3c2e59f0ed
commit 3c2e59f0ed
parent 1d2c5e2d10
36 changed files with 6 additions and 6 deletions
--- a/qurator/mods4pandas/lib.py
+++ b/qurator/mods4pandas/lib.py
@ -0,0 +1,296 @@
+from itertools import groupby
+import re
+import warnings
+from typing import List, Sequence, MutableMapping
+
+import numpy as np
+from lxml import etree as ET
+
+
+# Names exported by ``from <this module> import *``: only the namespace map.
+__all__ = ["ns"]
+
+
+# Prefix -> namespace URI map. It is passed as the namespaces argument of
+# element.find() in TagGroup.fix_event_type() to resolve the "mods:" prefix.
+ns = {
+    'mets': 'http://www.loc.gov/METS/',
+    'mods': 'http://www.loc.gov/mods/v3',
+    # NOTE(review): _to_dict() in this file matches the ALTO v2 namespace as
+    # ".../alto/ns-v2#" (with a trailing "#"); the value below has none.
+    # Confirm which spelling the users of ns["alto"] need.
+    "alto": "http://www.loc.gov/standards/alto/ns-v2"
+}
+
+
+
+class TagGroup:
+    """
+    Helper class to simplify the parsing and checking of MODS metadata.
+
+    A TagGroup wraps the list of XML elements that share one tag. Most methods
+    return a TagGroup (usually ``self``), so checks, fixes and extraction can be
+    chained, for example::
+
+        TagGroup(tag, group).is_singleton().has_no_attributes().text()
+
+    The ``fix_*`` methods modify the wrapped elements in place and emit a
+    warning (via ``warnings.warn``) for every change they make.
+    """
+
+    def __init__(self, tag, group: List[ET.Element]):
+        """Wrap the elements in ``group``; ``tag`` is only used in messages."""
+        self.tag = tag
+        self.group = group
+
+    def __str__(self):
+        """Return the serialized XML of all elements, joined by newlines."""
+        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
+
+    def is_singleton(self) -> 'TagGroup':
+        """
+        Check that the group contains exactly one element.
+
+        Raises ValueError otherwise. Note that the message reads "More than one
+        instance" even if the group is empty.
+        """
+        if len(self.group) != 1:
+            raise ValueError('More than one instance: {}'.format(self))
+        return self
+
+    def has_no_attributes(self) -> 'TagGroup':
+        """Check that no element of the group has any attribute."""
+        return self.has_attributes({})
+
+    def has_attributes(self, attrib) -> 'TagGroup':
+        """
+        Check that the attributes of every element equal one of the given dicts.
+
+        ``attrib`` is either a single attribute dict or a sequence of allowed
+        attribute dicts. Raises ValueError if an element matches none of them.
+        """
+        # A single dict is not a Sequence, so wrap it into a one-element list.
+        if not isinstance(attrib, Sequence):
+            attrib = [attrib]
+        if not all(e.attrib in attrib for e in self.group):
+            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
+        return self
+
+    def ignore_attributes(self) -> 'TagGroup':
+        """Do nothing; documents at the call site that attributes are ignored."""
+        # This serves as documentation for now.
+        return self
+
+    def sort(self, key=None, reverse=False) -> 'TagGroup':
+        """Sort the elements of the group; arguments are passed to sorted()."""
+        self.group = sorted(self.group, key=key, reverse=reverse)
+        return self
+
+    def text(self, separator='\n') -> str:
+        """
+        Return the concatenated text of all elements.
+
+        The separator is inserted before an element whenever the text collected
+        so far is non-empty; elements without text contribute nothing.
+        """
+        t = ''
+        for e in self.group:
+            if t != '':
+                t += separator
+            if e.text:
+                t += e.text
+        return t
+
+    def text_set(self) -> set:
+        """Return the set of the elements' texts (None for elements without text)."""
+        return {e.text for e in self.group}
+
+    def descend(self, raise_errors):
+        """
+        Convert the single element of this group using _to_dict().
+
+        Raises ValueError if the group is not a singleton.
+        """
+        return _to_dict(self.is_singleton().group[0], raise_errors)
+
+    def filter(self, cond, warn=None) -> 'TagGroup':
+        """
+        Return a new TagGroup with the elements for which ``cond(e)`` is true.
+
+        If ``warn`` is given, a warning containing it is emitted for every
+        element that is filtered out.
+        """
+        new_group = []
+        for e in self.group:
+            if cond(e):
+                new_group.append(e)
+            elif warn:
+                warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
+        return TagGroup(self.tag, new_group)
+
+    def force_singleton(self, warn=True) -> 'TagGroup':
+        """
+        Reduce the group to its first element.
+
+        Returns ``self`` if the group already has exactly one element, otherwise
+        a new TagGroup with at most the first element (optionally warning).
+        """
+        if len(self.group) == 1:
+            return self
+        if warn:
+            warnings.warn('Forced single instance of {}'.format(self.tag))
+        return TagGroup(self.tag, self.group[:1])
+
+    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
+    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
+
+    def fix_date(self) -> 'TagGroup':
+        """
+        Normalize the date elements of the group towards iso8601, in place.
+
+        - The encoding 'w3cdtf' is renamed to 'iso8601'.
+        - Texts matching RE_ISO8601_DATE get the encoding 'iso8601' if missing.
+        - Texts matching RE_GERMAN_DATE (dd.mm.yyyy) are rewritten to yyyy-mm-dd.
+        - Everything else, including elements without text, is kept unchanged
+          and only reported with a warning.
+        """
+
+        for e in self.group:
+            if e.attrib.get('encoding') == 'w3cdtf':
+                # This should be 'iso8601' according to MODS-AP 2.3.1
+                warnings.warn('Changed w3cdtf encoding to iso8601')
+                e.attrib['encoding'] = 'iso8601'
+
+        for e in self.group:
+            # An empty element has text None, which re.match() would reject
+            # with a TypeError; treat it like a non-matching (empty) string.
+            text = e.text or ''
+
+            if re.match(self.RE_ISO8601_DATE, text):
+                if e.attrib.get('encoding') != 'iso8601':
+                    warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
+                    e.attrib['encoding'] = 'iso8601'
+                continue
+
+            m = re.match(self.RE_GERMAN_DATE, text)
+            if m:
+                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
+                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
+                e.attrib['encoding'] = 'iso8601'
+            else:
+                warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
+
+        # Notes:
+        # - There are dates with the misspelled qualifier 'aproximate'
+        # - Rough periods are sometimes given either by:
+        #   - years like '19xx'
+        #   - or 'approximate' date ranges with point="start"/"end" attributes set
+        #     (this could be correct according to MODS-AP 2.3.1)
+        # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
+        # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"
+
+        return self
+
+    def fix_event_type(self) -> 'TagGroup':
+        """
+        Set a missing eventType attribute for some special cases, in place.
+
+        Elements that already have an eventType, or that match none of the
+        special cases, are left unchanged.
+        """
+        # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
+        # Fix this for special cases.
+
+        for e in self.group:
+            if e.attrib.get('eventType') is not None:
+                continue
+
+            try:
+                if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
+                        e.find('mods:edition', ns).text == '[Electronic ed.]':
+                    e.attrib['eventType'] = 'digitization'
+                    warnings.warn('Fixed eventType for electronic ed.')
+                    continue
+            except AttributeError:
+                # publisher/edition element or its text is missing: not this case.
+                pass
+
+            if e.find('mods:dateIssued', ns) is not None:
+                e.attrib['eventType'] = 'publication'
+                warnings.warn('Fixed eventType for an issued origin')
+            elif e.find('mods:dateCreated', ns) is not None:
+                e.attrib['eventType'] = 'production'
+                warnings.warn('Fixed eventType for a created origin')
+        return self
+
+    def fix_script_term(self) -> 'TagGroup':
+        """
+        Lower-case the authority 'ISO15924' to 'iso15924', in place.
+
+        Elements with another or without an authority attribute are unchanged.
+        """
+        for e in self.group:
+            # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
+            # Use .get() so that an element without the attribute does not raise a KeyError.
+            if e.attrib.get('authority') == 'ISO15924':
+                e.attrib['authority'] = 'iso15924'
+                warnings.warn('Changed scriptTerm authority to lower case')
+        return self
+
+    def merge_sub_tags_to_set(self) -> dict:
+        """
+        Convert every element with mods_to_dict() and merge the results.
+
+        Returns a dict that maps each key found in any of the per-element dicts
+        to the set of all truthy values found for that key.
+        """
+        from .mods4pandas import mods_to_dict
+        value = {}
+
+        sub_dicts = [mods_to_dict(e) for e in self.group]
+        sub_tags = {k for d in sub_dicts for k in d}
+        for sub_tag in sub_tags:
+            s = set()
+            for d in sub_dicts:
+                v = d.get(sub_tag)
+                if v:
+                    # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
+                    # German language document.
+                    if isinstance(v, set):
+                        s.update(v)
+                    else:
+                        s.add(v)
+            value[sub_tag] = s
+        return value
+
+    def attributes(self) -> dict:
+        """
+        Return a merged dict of all attributes of the tag group.
+
+        Attribute names are reduced to their local name (namespace removed). If
+        several elements carry the same attribute, the last one wins.
+
+        Probably most useful if used on a singleton, for example:
+
+            value['Page'] = TagGroup(tag, group).is_singleton().attributes()
+        """
+        attrib = {}
+        for e in self.group:
+            for a, v in e.attrib.items():
+                a_localname = ET.QName(a).localname
+                attrib[a_localname] = v
+        return attrib
+
+    def subelement_counts(self) -> dict:
+        """
+        Count the elements in the group's subtrees by local tag name.
+
+        Returns a dict with keys of the form '<localname>-count'. The group's
+        elements themselves are included in the counts.
+        """
+        counts = {}
+        for e in self.group:
+            for x in e.iter():
+                # Comments and processing instructions are also yielded by
+                # iter(), but their tag is not a string and cannot be parsed
+                # as a qualified name. Only real elements are counted.
+                if not isinstance(x.tag, str):
+                    continue
+                tag = ET.QName(x.tag).localname
+                key = f"{tag}-count"
+                counts[key] = counts.get(key, 0) + 1
+        return counts
+
+    def xpath_statistics(self, xpath_expr, namespaces) -> dict:
+        """
+        Extract values and calculate statistics
+
+        Extract values using the given XPath expression, convert them to float and return descriptive
+        statistics on the values.
+
+        The result maps '<xpath_expr>-mean', '-median', '-std', '-min' and '-max' to the respective
+        statistic. It is empty if the expression did not yield any value.
+        """
+        values = []
+        for e in self.group:
+            r = e.xpath(xpath_expr, namespaces=namespaces)
+            values += r
+        values = np.array([float(v) for v in values])
+
+        statistics = {}
+        if values.size > 0:
+            statistics[f'{xpath_expr}-mean'] = np.mean(values)
+            statistics[f'{xpath_expr}-median'] = np.median(values)
+            statistics[f'{xpath_expr}-std'] = np.std(values)
+            statistics[f'{xpath_expr}-min'] = np.min(values)
+            statistics[f'{xpath_expr}-max'] = np.max(values)
+        return statistics
+
+    def xpath_count(self, xpath_expr, namespaces) -> dict:
+        """
+        Count all elements matching xpath_expr
+
+        Returns a dict with the single key '<xpath_expr>-count'.
+        """
+        values = []
+        for e in self.group:
+            r = e.xpath(xpath_expr, namespaces=namespaces)
+            values += r
+
+        counts = {f'{xpath_expr}-count': len(values)}
+        return counts
+
+
+
+def sorted_groupby(iterable, key=None):
+    """
+    Group the elements of iterable by key, regardless of their initial order.
+
+    itertools.groupby() only groups adjacent elements, so the iterable is sorted
+    by the same key first. Returns the iterator produced by itertools.groupby().
+    """
+    ordered = sorted(iterable, key=key)
+    return groupby(ordered, key=key)
+
+
+def _to_dict(root, raise_errors):
+    """
+    Convert root with the converter that matches its XML namespace.
+
+    MODS elements go to mods_to_dict(), METS elements to mets_to_dict() and
+    elements in one of the known ALTO namespaces to alto_to_dict(). Raises
+    ValueError for any other namespace.
+    """
+    # Imported at call time rather than at module level
+    # (presumably to avoid circular imports; verify against those modules).
+    from .mods4pandas import mods_to_dict, mets_to_dict
+    from .alto4pandas import alto_to_dict
+
+    alto_namespaces = (
+        "http://schema.ccs-gmbh.com/ALTO",
+        "http://www.loc.gov/standards/alto/",
+        "http://www.loc.gov/standards/alto/ns-v2#",
+        "http://www.loc.gov/standards/alto/ns-v4#",
+    )
+
+    namespace = ET.QName(root.tag).namespace
+    if namespace == "http://www.loc.gov/mods/v3":
+        return mods_to_dict(root, raise_errors)
+    if namespace == "http://www.loc.gov/METS/":
+        return mets_to_dict(root, raise_errors)
+    if namespace in alto_namespaces:
+        return alto_to_dict(root, raise_errors)
+    raise ValueError(f"Unknown namespace {namespace}")
+
+
+def flatten(d: MutableMapping, parent='', separator='_'):
+    """
+    Return a flat dict built from the given nested dict.
+
+    d is expected to map strings to either a nested mapping of the same kind or
+    to any other value. The keys along the path to a value are joined with
+    separator (prefixed by parent, if given) to form the key in the result.
+    """
+    flat = {}
+
+    for key, value in d.items():
+        path = parent + separator + key if parent else key
+
+        if isinstance(value, MutableMapping):
+            # Nested mapping: merge its flattened entries below the current path.
+            flat.update(flatten(value, path, separator=separator))
+        else:
+            flat[path] = value
+
+    return flat
+