From 889d36f0d4b06a4a13c02170e65ecb5f7f84e20b Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 22 Nov 2023 18:11:14 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20page=5Finfo:=20Retrieve=20filenames?=
 =?UTF-8?q?=20+=20structMap=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 qurator/mods4pandas/lib.py         |  8 ++-
 qurator/mods4pandas/mods4pandas.py | 89 +++++++++++++++++++++++++++++-
 2 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/qurator/mods4pandas/lib.py b/qurator/mods4pandas/lib.py
index b611820..f3c6078 100644
--- a/qurator/mods4pandas/lib.py
+++ b/qurator/mods4pandas/lib.py
@@ -13,7 +13,8 @@ __all__ = ["ns"]
 ns = {
     'mets': 'http://www.loc.gov/METS/',
     'mods': 'http://www.loc.gov/mods/v3',
-    "alto": "http://www.loc.gov/standards/alto/ns-v2"
+    "alto": "http://www.loc.gov/standards/alto/ns-v2",
+    "xlink": "http://www.w3.org/1999/xlink",
 }
 
 
@@ -25,9 +26,12 @@ class TagGroup:
         self.tag = tag
         self.group = group
 
-    def __str__(self):
+    def to_xml(self):
         return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
 
+    def __str__(self):
+        return f"TagGroup with content:\n{self.to_xml()}"
+
     def is_singleton(self):
         if len(self.group) != 1:
             raise ValueError('More than one instance: {}'.format(self))
diff --git a/qurator/mods4pandas/mods4pandas.py b/qurator/mods4pandas/mods4pandas.py
index 45fb675..94bddd3 100755
--- a/qurator/mods4pandas/mods4pandas.py
+++ b/qurator/mods4pandas/mods4pandas.py
@@ -252,9 +252,90 @@ def mets_to_dict(mets, raise_errors=True):
                 raise ValueError('Unknown tag "{}"'.format(tag))
             else:
                 pass
-
     return value
 
+def pages_to_dict(mets, raise_errors=True) -> list[dict]:
+    """
+    Collect per-page information from a METS document.
+
+    Returns one dict per child of the physSequence div of structMap[@TYPE="PHYSICAL"], with keys:
+    ppn -- text of the gbv-ppn mods:recordIdentifier, None if there is none
+    ID -- ID attribute of the page div
+    fileGrp_{USE}_file_FLocat_href -- FLocat xlink:href of each file the page points to
+    structmap_LOGICAL_TYPE_{TYPE} -- 1 for each mets:div linked to the page via
+        structLink, and for each mets:div ancestor of such a div
+
+    Raises ValueError on unexpected structure. raise_errors is accepted but not used yet.
+    """
+    def first(node, path):
+        """Return the first XPath match relative to node, or None."""
+        return (node.xpath(path, namespaces=ns) or [None])[0]
+
+    def require(condition, message):
+        """Raise ValueError(message) unless condition is truthy (unlike assert, survives -O)."""
+        if not condition:
+            raise ValueError(message)
+
+    def get_struct_log(*, to_phys):
+        """Get the logical structMap elements that link to the given physical page ID."""
+        # This is all XLink, there might be a more generic way to traverse the links. However, currently,
+        # it suffices to do this the old-fashioned way.
+        sm_links = mets.xpath(f'//mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', namespaces=ns)
+        targets = []
+        for sm_link in sm_links:
+            xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
+            targets.extend(mets.xpath(f'//mets:div[@ID="{xlink_from}"]', namespaces=ns))
+        return targets
+
+    def get_struct_log_parents(div):
+        """Yield the mets:div ancestors of div, innermost first."""
+        cursor = div.getparent()
+        while cursor is not None and cursor.tag == f"{{{ns['mets']}}}div":
+            yield cursor
+            cursor = cursor.getparent()
+
+    # PPN
+    record_id = first(mets, '//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="gbv-ppn"]')
+    ppn = record_id.text if record_id is not None else None
+
+    # Getting per-page/structure information is a bit different
+    structMap_PHYSICAL = first(mets, '//mets:structMap[@TYPE="PHYSICAL"]')
+    # Compare with None explicitly: an lxml element without children is falsy.
+    require(structMap_PHYSICAL is not None, "No structMap[@TYPE='PHYSICAL'] found")
+    require(len(structMap_PHYSICAL) > 0, "structMap[@TYPE='PHYSICAL'] has no child elements")
+    div_physSequence = structMap_PHYSICAL[0]
+    require(div_physSequence.attrib.get("TYPE") == "physSequence", "First div in physical structMap is not a physSequence")
+
+    result = []
+    for page in div_physSequence:
+        # TODO sort by ORDER?
+        require(page.attrib.get("TYPE") == "page", "physSequence child is not a page div")
+        page_dict = {"ppn": ppn, "ID": page.attrib.get("ID")}
+        for fptr in page:
+            require(fptr.tag == "{http://www.loc.gov/METS/}fptr", f"Unexpected element {fptr.tag} in page div")
+            file_id = fptr.attrib.get("FILEID")
+            require(file_id, "fptr without FILEID")
+            file_ = first(mets, f'//mets:file[@ID="{file_id}"]')
+            require(file_ is not None, f"No mets:file with ID {file_id!r} found")
+            fileGrp_USE = file_.getparent().attrib.get("USE")
+            file_FLocat_href = first(file_, 'mets:FLocat/@xlink:href')
+            page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
+
+        struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))
+        # In our documents, there are already links to parent elements, but we want to make
+        # sure and add them. Iterate over a snapshot, as a set must not grow while iterated.
+        for struct_div in list(struct_divs):
+            struct_divs.update(get_struct_log_parents(struct_div))
+
+        for struct_div in struct_divs:
+            type_ = struct_div.attrib.get("TYPE")
+            require(type_, "Logical structMap div without TYPE")
+            page_dict[f"structmap_LOGICAL_TYPE_{type_}"] = 1
+
+        result.append(page_dict)
+
+    return result
+
 
 @click.command()
 @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
@@ -286,6 +367,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
         mods_info = []
+        page_info = []
         logger.info('Processing METS files')
         for mets_file in tqdm(mets_files_real, leave=False):
             try:
@@ -298,6 +380,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
 
                     # MODS
                     d = flatten(mods_to_dict(mods, raise_errors=True))
+
                     # METS
                     d_mets = flatten(mets_to_dict(mets, raise_errors=True))
                     for k, v in d_mets.items():
@@ -305,7 +388,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
                     # "meta"
                     d['mets_file'] = mets_file
 
+                    # METS - per-page
+                    page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
+
                     mods_info.append(d)
+                    page_info.extend(page_info_doc)
 
                     if caught_warnings:
                         # PyCharm thinks caught_warnings is not Iterable: