From a1390699d4273b3a151a0f14398c5f2def0f094d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 26 Nov 2024 16:27:43 +0100
Subject: [PATCH 01/37] =?UTF-8?q?=F0=9F=9A=A7=20Use=20a=20temporary=20sqli?=
 =?UTF-8?q?te=20db?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 51 ++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index ef24d36..e12af4f 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -3,7 +3,9 @@ import csv
 import logging
 import os
 import re
+import sqlite3
 import warnings
+import sys
 from lxml import etree as ET
 from itertools import groupby
 from operator import attrgetter
@@ -394,7 +396,45 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
         else:
             mets_files_real.append(m)
 
+    current_columns = []
+
+    def valid_column_key(k):
+        if re.match("^[a-zA-Z0-9 _-]+$", k):
+            return True
+        else:
+            return False
+
+    def insert_into_db(con, d: Dict):
+        # Create table if necessary
+        if not current_columns:
+            for k in d.keys():
+                assert valid_column_key(k), f"\"{k}\" is not a valid column name"
+                current_columns.append(k)
+            con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})")
+
+        # Add columns if necessary
+        for k in d.keys():
+            if not k in current_columns:
+                assert valid_column_key(k), f"\"{k}\" is not a valid column name"
+                current_columns.append(k)
+                con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"")
+
+        # Insert
+        # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
+        # have use qmark style here.
+        columns = d.keys()
+        con.execute(
+            "INSERT INTO mods_info"
+            f"( {",".join(f"\"{c}\"" for c in columns)} )"
+            "VALUES"
+            f"( {",".join("?" for c in columns)} )",
+            [str(d[c]) for c in columns]
+        )
+
+
     # Process METS files
+    output_file_sqlite3 = output_file + ".sqlite3"
+    con = sqlite3.connect(output_file_sqlite3)
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
         mods_info = []
@@ -423,9 +463,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                     if output_page_info:
                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
 
-                    mods_info.append(d)
-                    if output_page_info:
-                        page_info.extend(page_info_doc)
+                    insert_into_db(con, d)
+                    con.commit()
+                    #TODO
+                    #if output_page_info:
+                    #    page_info.extend(page_info_doc)
 
                     if caught_warnings:
                         # PyCharm thinks caught_warnings is not Iterable:
@@ -433,8 +475,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                         for caught_warning in caught_warnings:
                             csvwriter.writerow([mets_file, caught_warning.message])
             except Exception as e:
-                logger.error('Exception in {}: {}'.format(mets_file, e))
-                #import traceback; traceback.print_exc()
+                logger.exception('Exception in {}'.format(mets_file))
 
     # Convert the mods_info List[Dict] to a pandas DataFrame
     mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")

From b385f273915e05f7274cc51811281313be8b60eb Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Nov 2024 14:43:42 +0100
Subject: [PATCH 02/37] =?UTF-8?q?=F0=9F=9A=A7=20Write=20out=20page=5Finfo?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py         | 43 ++++++++++++++++++++
 src/mods4pandas/mods4pandas.py | 71 +++++++++++-----------------------
 2 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index d2e1f8f..302e4f2 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -2,6 +2,7 @@ from itertools import groupby
 import re
 import warnings
 from typing import List, Sequence, MutableMapping, Dict
+from collections import defaultdict
 
 import pandas as pd
 import numpy as np
@@ -328,3 +329,45 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
 
     df = pd.DataFrame(data=data, index=index, columns=columns)
     return df
+
+
+def valid_column_key(k):
+    if re.match("^[a-zA-Z0-9 _-]+$", k):
+        return True
+    else:
+        return False
+
+current_columns = defaultdict(list)
+
+def insert_into_db(con, table, d: Dict):
+    """Insert the values from the dict into the table, creating columns if necessary"""
+
+    # Create table if necessary
+    if not current_columns[table]:
+        for k in d.keys():
+            assert valid_column_key(k), f"\"{k}\" is not a valid column name"
+            current_columns[table].append(k)
+        con.execute(f"CREATE TABLE {table} ({",".join(f"\"{c}\"" for c in current_columns[table])})")
+
+    # Add columns if necessary
+    for k in d.keys():
+        if not k in current_columns[table]:
+            assert valid_column_key(k), f"\"{k}\" is not a valid column name"
+            current_columns[table].append(k)
+            con.execute(f"ALTER TABLE {table} ADD COLUMN \"{k}\"")
+
+    # Insert
+    # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
+    # have use qmark style here.
+    columns = d.keys()
+    con.execute(
+        f"INSERT INTO {table}"
+        f"( {",".join(f"\"{c}\"" for c in columns)} )"
+        "VALUES"
+        f"( {",".join("?" for c in columns)} )",
+        [str(d[c]) for c in columns]
+    )
+
+def insert_into_db_multiple(con, table, ld: List[Dict]):
+    for d in ld:
+        insert_into_db(con, table, d)
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index e12af4f..aae282d 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -10,13 +10,14 @@ from lxml import etree as ET
 from itertools import groupby
 from operator import attrgetter
 from typing import Dict, List
+from collections import defaultdict
 from collections.abc import MutableMapping, Sequence
 
 import click
 import pandas as pd
 from tqdm import tqdm
 
-from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
+from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple
 
 
 
@@ -396,45 +397,16 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
         else:
             mets_files_real.append(m)
 
-    current_columns = []
-
-    def valid_column_key(k):
-        if re.match("^[a-zA-Z0-9 _-]+$", k):
-            return True
-        else:
-            return False
-
-    def insert_into_db(con, d: Dict):
-        # Create table if necessary
-        if not current_columns:
-            for k in d.keys():
-                assert valid_column_key(k), f"\"{k}\" is not a valid column name"
-                current_columns.append(k)
-            con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})")
-
-        # Add columns if necessary
-        for k in d.keys():
-            if not k in current_columns:
-                assert valid_column_key(k), f"\"{k}\" is not a valid column name"
-                current_columns.append(k)
-                con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"")
-
-        # Insert
-        # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
-        # have use qmark style here.
-        columns = d.keys()
-        con.execute(
-            "INSERT INTO mods_info"
-            f"( {",".join(f"\"{c}\"" for c in columns)} )"
-            "VALUES"
-            f"( {",".join("?" for c in columns)} )",
-            [str(d[c]) for c in columns]
-        )
 
 
     # Process METS files
     output_file_sqlite3 = output_file + ".sqlite3"
     con = sqlite3.connect(output_file_sqlite3)
+
+    if output_page_info:
+        output_page_info_sqlite3 = output_page_info + ".sqlite3"
+        con_page_info = sqlite3.connect(output_page_info_sqlite3)
+
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
         mods_info = []
@@ -463,11 +435,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                     if output_page_info:
                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
 
-                    insert_into_db(con, d)
+                    insert_into_db(con, "mods_info", d)
                     con.commit()
-                    #TODO
-                    #if output_page_info:
-                    #    page_info.extend(page_info_doc)
+                    if output_page_info:
+                        insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
+                        con_page_info.commit()
 
                     if caught_warnings:
                         # PyCharm thinks caught_warnings is not Iterable:
@@ -478,18 +450,21 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                 logger.exception('Exception in {}'.format(mets_file))
 
     # Convert the mods_info List[Dict] to a pandas DataFrame
-    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
-
+    # TODO
+    # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
+ 
     # Save the DataFrame
-    logger.info('Writing DataFrame to {}'.format(output_file))
-    mods_info_df.to_parquet(output_file)
+    # TODO
+    #logger.info('Writing DataFrame to {}'.format(output_file))
+    #mods_info_df.to_parquet(output_file)
 
     # Convert page_info
-    if output_page_info:
-        page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
-        # Save the DataFrame
-        logger.info('Writing DataFrame to {}'.format(output_page_info))
-        page_info_df.to_parquet(output_page_info)
+    # TODO
+    # if output_page_info:
+    #     page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
+    #     # Save the DataFrame
+    #     logger.info('Writing DataFrame to {}'.format(output_page_info))
+    #     page_info_df.to_parquet(output_page_info)
 
 
 def main():

From eeaad036860c5510633d2a664d8d55604b78cfa0 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Nov 2024 16:38:18 +0100
Subject: [PATCH 03/37] =?UTF-8?q?=F0=9F=9A=A7=20Avoid=20nested=20quotes=20?=
 =?UTF-8?q?for=20Python=20<=203.12?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 302e4f2..2f50f43 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -337,6 +337,14 @@ def valid_column_key(k):
     else:
         return False
 
+def column_names_csv(columns):
+    """
+    Format Column names (identifiers) as a comma-separated list.
+
+    This uses double quotes per SQL standard.
+    """
+    return ",".join('"' + c + '"' for c in columns)
+
 current_columns = defaultdict(list)
 
 def insert_into_db(con, table, d: Dict):
@@ -345,16 +353,16 @@ def insert_into_db(con, table, d: Dict):
     # Create table if necessary
     if not current_columns[table]:
         for k in d.keys():
-            assert valid_column_key(k), f"\"{k}\" is not a valid column name"
+            assert valid_column_key(k), f'"{k}" is not a valid column name'
             current_columns[table].append(k)
-        con.execute(f"CREATE TABLE {table} ({",".join(f"\"{c}\"" for c in current_columns[table])})")
+        con.execute(f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})")
 
     # Add columns if necessary
     for k in d.keys():
         if not k in current_columns[table]:
-            assert valid_column_key(k), f"\"{k}\" is not a valid column name"
+            assert valid_column_key(k), f'"{k}" is not a valid column name'
             current_columns[table].append(k)
-            con.execute(f"ALTER TABLE {table} ADD COLUMN \"{k}\"")
+            con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"')
 
     # Insert
     # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
@@ -362,9 +370,9 @@ def insert_into_db(con, table, d: Dict):
     columns = d.keys()
     con.execute(
         f"INSERT INTO {table}"
-        f"( {",".join(f"\"{c}\"" for c in columns)} )"
+        f"( {column_names_csv(columns)} )"
         "VALUES"
-        f"( {",".join("?" for c in columns)} )",
+        f"( {','.join('?' for c in columns)} )",
         [str(d[c]) for c in columns]
     )
 

From 22b62d7a2fbbef453562c9c3738bb1a0c5f6db7c Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Nov 2024 18:48:56 +0100
Subject: [PATCH 04/37] =?UTF-8?q?=F0=9F=90=9B=20Remove=20output=20files=20?=
 =?UTF-8?q?before=20writing=20them=20again?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index aae282d..215acd0 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import contextlib
 import csv
 import logging
 import os
@@ -401,10 +402,14 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
 
     # Process METS files
     output_file_sqlite3 = output_file + ".sqlite3"
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(output_file_sqlite3)
     con = sqlite3.connect(output_file_sqlite3)
 
     if output_page_info:
         output_page_info_sqlite3 = output_page_info + ".sqlite3"
+        with contextlib.suppress(FileNotFoundError):
+            os.remove(output_page_info_sqlite3)
         con_page_info = sqlite3.connect(output_page_info_sqlite3)
 
     with open(output_file + '.warnings.csv', 'w') as csvfile:

From 11d7b012ec3cc2762b1baea77aee0efc949b99a4 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Nov 2024 19:03:35 +0100
Subject: [PATCH 05/37] =?UTF-8?q?=F0=9F=90=9B=20Leave=20tqdm=20progress=20?=
 =?UTF-8?q?bar=20to=20avoid=20confusion=20through=20other=20left-over=20pr?=
 =?UTF-8?q?ogress=20bars?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 215acd0..c3cde97 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -417,7 +417,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
         mods_info = []
         page_info = []
         logger.info('Processing METS files')
-        for mets_file in tqdm(mets_files_real, leave=False):
+        for mets_file in tqdm(mets_files_real, leave=True):
             try:
                 root = ET.parse(mets_file).getroot()
                 mets = root # XXX .find('mets:mets', ns) does not work here

From 939967edc8863d7781d2311631cab3e94c3fc1c3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Nov 2024 19:05:05 +0100
Subject: [PATCH 06/37] =?UTF-8?q?=F0=9F=90=9B=20De-couple=20mods=5Finfo=20?=
 =?UTF-8?q?from=20page=5Finfo?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index c3cde97..e08fe37 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -436,13 +436,13 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                     # "meta"
                     d['mets_file'] = mets_file
 
+                    # Save
+                    insert_into_db(con, "mods_info", d)
+                    con.commit()
+
                     # METS - per-page
                     if output_page_info:
                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
-
-                    insert_into_db(con, "mods_info", d)
-                    con.commit()
-                    if output_page_info:
                         insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
                         con_page_info.commit()
 

From 8ee4c3d0bcb4a65fec343c0317a6df211251f311 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 27 Nov 2024 19:56:36 +0100
Subject: [PATCH 07/37] =?UTF-8?q?=F0=9F=90=9B=20Normalize=20structure=20ty?=
 =?UTF-8?q?pe=20names=20to=20lower=20case?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index e08fe37..b4051f9 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -362,7 +362,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 
         # Populate structure type indicator variables
         for struct_div in struct_divs:
-            type_ = struct_div.attrib.get("TYPE")
+            type_ = (struct_div.attrib.get("TYPE") or "").lower()
             assert type_
             page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1
 

From abb20b8ba9fca2e36d12df0198cb2b2de7830b19 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 28 Nov 2024 14:25:27 +0100
Subject: [PATCH 08/37] =?UTF-8?q?=F0=9F=90=9B=20Add=20multivolume=20type?=
 =?UTF-8?q?=20'multivolume=5Fmanuscript'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index b4051f9..4427f13 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -277,7 +277,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
         # This is expected in a multivolume work or periodical!
         if any(
                 structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
-                for t in ["multivolume_work", "MultivolumeWork", "periodical"]
+                for t in ["multivolume_work", "MultivolumeWork", "multivolume_manuscript", "periodical"]
         ):
             return []
         else:

From 11a04916f38631a2e55192bcb4db9b25df8f384b Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 28 Nov 2024 18:27:39 +0100
Subject: [PATCH 09/37] =?UTF-8?q?=F0=9F=90=9B=20Write=20mods=5Finfo=20Parq?=
 =?UTF-8?q?uet=20file=20again?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 4427f13..4fabb52 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -399,19 +399,24 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
             mets_files_real.append(m)
 
 
-
-    # Process METS files
+    # Prepare output files
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(output_file)
     output_file_sqlite3 = output_file + ".sqlite3"
     with contextlib.suppress(FileNotFoundError):
         os.remove(output_file_sqlite3)
+
+    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
     con = sqlite3.connect(output_file_sqlite3)
 
     if output_page_info:
         output_page_info_sqlite3 = output_page_info + ".sqlite3"
+        logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3))
         with contextlib.suppress(FileNotFoundError):
             os.remove(output_page_info_sqlite3)
         con_page_info = sqlite3.connect(output_page_info_sqlite3)
 
+    # Process METS files
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
         mods_info = []
@@ -454,14 +459,12 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
             except Exception as e:
                 logger.exception('Exception in {}'.format(mets_file))
 
-    # Convert the mods_info List[Dict] to a pandas DataFrame
-    # TODO
-    # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
- 
+    # Convert the mods_info SQL to a pandas DataFrame
+    mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier")
+
     # Save the DataFrame
-    # TODO
-    #logger.info('Writing DataFrame to {}'.format(output_file))
-    #mods_info_df.to_parquet(output_file)
+    logger.info('Writing DataFrame to {}'.format(output_file))
+    mods_info_df.to_parquet(output_file)
 
     # Convert page_info
     # TODO

From 6981efb87cf236f531bb72af026bd18d7c601b4a Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 28 Nov 2024 18:32:40 +0100
Subject: [PATCH 10/37] =?UTF-8?q?=F0=9F=90=9B=20Write=20page=5Finfo=20Parq?=
 =?UTF-8?q?uet=20file=20again?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 4fabb52..46ebfc8 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -461,18 +461,15 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
 
     # Convert the mods_info SQL to a pandas DataFrame
     mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier")
-
-    # Save the DataFrame
     logger.info('Writing DataFrame to {}'.format(output_file))
     mods_info_df.to_parquet(output_file)
 
-    # Convert page_info
-    # TODO
-    # if output_page_info:
-    #     page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
-    #     # Save the DataFrame
-    #     logger.info('Writing DataFrame to {}'.format(output_page_info))
-    #     page_info_df.to_parquet(output_page_info)
+    if output_page_info:
+          # Convert page_info SQL to a pandas DataFrama
+          page_info_df = pd.read_sql_query("SELECT * FROM page_info", con_page_info, index_col=["ppn", "ID"])
+          # Save the DataFrame
+          logger.info('Writing DataFrame to {}'.format(output_page_info))
+          page_info_df.to_parquet(output_page_info)
 
 
 def main():

From ca8f165955f36292958ce3a752b12469f6cdc824 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 28 Nov 2024 20:05:55 +0100
Subject: [PATCH 11/37] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20redundant=20comme?=
 =?UTF-8?q?nt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 46ebfc8..4479844 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -467,7 +467,6 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
     if output_page_info:
           # Convert page_info SQL to a pandas DataFrama
           page_info_df = pd.read_sql_query("SELECT * FROM page_info", con_page_info, index_col=["ppn", "ID"])
-          # Save the DataFrame
           logger.info('Writing DataFrame to {}'.format(output_page_info))
           page_info_df.to_parquet(output_page_info)
 

From 39f7d8646a8433fa11472216b5fe27550a820f45 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Fri, 29 Nov 2024 15:53:00 +0100
Subject: [PATCH 12/37] =?UTF-8?q?=F0=9F=9A=A7=20Use=20temporary=20SQLite?=
 =?UTF-8?q?=20DB=20for=20alto4pandas,=20too?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/alto4pandas.py | 53 +++++++++++++++++-----------------
 src/mods4pandas/lib.py         |  2 +-
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 1508150..8dde40a 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -5,6 +5,8 @@ import os
 import re
 import warnings
 import sys
+import contextlib
+import sqlite3
 from xml.dom.expatbuilder import Namespaces
 from lxml import etree as ET
 from itertools import groupby
@@ -17,7 +19,7 @@ import pandas as pd
 import numpy as np
 from tqdm import tqdm
 
-from .lib import TagGroup, sorted_groupby, flatten, ns
+from .lib import TagGroup, sorted_groupby, flatten, ns, insert_into_db
 
 
 logger = logging.getLogger('alto4pandas')
@@ -121,18 +123,19 @@ def walk(m):
 
 @click.command()
 @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
-              default='alto_info_df.pkl', show_default=True)
-@click.option('--output-csv', type=click.Path(), help='Output CSV file')
-@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
-def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
+@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
+              default='alto_info_df.parquet', show_default=True)
+def process(alto_files: List[str], output_file: str):
     """
     A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
 
     INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads
     all files in the directory.
 
-    alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
+    alto4pandas writes multiple output files:
+    - A Parquet DataFrame
+    - A SQLite database
+    - and a CSV file with all conversion warnings.
     """
 
     # Extend file list if directories are given
@@ -141,10 +144,19 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
         for x in walk(m):
             alto_files_real.append(x)
 
+    # Prepare output files
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(output_file)
+    output_file_sqlite3 = output_file + ".sqlite3"
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(output_file_sqlite3)
+
+    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
+    con = sqlite3.connect(output_file_sqlite3)
+
     # Process ALTO files
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
-        alto_info = []
         logger.info('Processing ALTO files')
         for alto_file in tqdm(alto_files_real, leave=False):
             try:
@@ -160,7 +172,9 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
                     d['alto_file'] = alto_file
                     d['alto_xmlns'] = ET.QName(alto).namespace
 
-                    alto_info.append(d)
+                    # Save
+                    insert_into_db(con, "alto_info", d)
+                    con.commit()
 
                     if caught_warnings:
                         # PyCharm thinks caught_warnings is not Iterable:
@@ -171,25 +185,10 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
                 logger.error('Exception in {}: {}'.format(alto_file, e))
                 import traceback; traceback.print_exc()
 
-    # Convert the alto_info List[Dict] to a pandas DataFrame
-    columns = []
-    for m in alto_info:
-        for c in m.keys():
-            if c not in columns:
-                columns.append(c)
-    data = [[m.get(c) for c in columns] for m in alto_info]
-    index = [m['alto_file'] for m in alto_info] # TODO use ppn + page?
-    alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
-
-    # Pickle the DataFrame
+    # Convert the alto_info SQL to a pandas DataFrame
+    alto_info_df = pd.read_sql_query("SELECT * FROM alto_info", con, index_col="alto_file")
     logger.info('Writing DataFrame to {}'.format(output_file))
-    alto_info_df.to_pickle(output_file)
-    if output_csv:
-        logger.info('Writing CSV to {}'.format(output_csv))
-        alto_info_df.to_csv(output_csv)
-    if output_xlsx:
-        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
-        alto_info_df.to_excel(output_xlsx)
+    alto_info_df.to_parquet(output_file)
 
 
 def main():
diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 2f50f43..11c88a3 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -332,7 +332,7 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
 
 
 def valid_column_key(k):
-    if re.match("^[a-zA-Z0-9 _-]+$", k):
+    if re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k):
         return True
     else:
         return False

From 6af4a6f67128f83585f29710bfb7c5a62411a6ed Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 3 Dec 2024 17:02:12 +0100
Subject: [PATCH 13/37] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20unused/obsolete?=
 =?UTF-8?q?=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py         | 30 ------------------------------
 src/mods4pandas/mods4pandas.py |  2 +-
 2 files changed, 1 insertion(+), 31 deletions(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 11c88a3..8a65901 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -301,36 +301,6 @@ def flatten(d: MutableMapping, parent='', separator='_'):
     return dict(items)
 
 
-def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
-    """
-    Convert the given list of dicts to a Pandas DataFrame.
-
-    The keys of the dicts make the columns.
-    """
-
-    # Build columns from keys
-    columns = []
-    for m in data_list:
-        for c in m.keys():
-            if c not in columns:
-                columns.append(c)
-
-    # Build data table
-    data = [[m.get(c) for c in columns] for m in data_list]
-
-    # Build index
-    if isinstance(index_column, str):
-        index = [m[index_column] for m in data_list]
-    elif isinstance(index_column, tuple):
-        index = [[m[c] for m in data_list] for c in index_column]
-        index = pd.MultiIndex.from_arrays(index, names=index_column)
-    else:
-        raise ValueError(f"index_column must")
-
-    df = pd.DataFrame(data=data, index=index, columns=columns)
-    return df
-
-
 def valid_column_key(k):
     if re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k):
         return True
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 4479844..ee1f45d 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -18,7 +18,7 @@ import click
 import pandas as pd
 from tqdm import tqdm
 
-from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple
+from .lib import sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple
 
 
 

From ef4eeac7e249856439756b054b175470c338604f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 3 Dec 2024 17:02:24 +0100
Subject: [PATCH 14/37] =?UTF-8?q?=F0=9F=A7=B9=20Remove=20unused/obsolete?=
 =?UTF-8?q?=20code?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/mods4pandas.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index ee1f45d..30d7c22 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -419,8 +419,6 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
     # Process METS files
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
-        mods_info = []
-        page_info = []
         logger.info('Processing METS files')
         for mets_file in tqdm(mets_files_real, leave=True):
             try:

From 4d6e1f4ff4e164c1319aaaae51cb2b366d30fb96 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 3 Dec 2024 17:24:24 +0100
Subject: [PATCH 15/37] =?UTF-8?q?=F0=9F=90=9B=20Add=20missing=20tag=20alto?=
 =?UTF-8?q?:fileIdentifier?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/alto4pandas.py                |   2 +
 .../data/alto/PPN1844793923/00000017.xml      | 663 ++++++++++++++++++
 2 files changed, 665 insertions(+)
 create mode 100644 src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml

diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 8dde40a..8c8f934 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -76,6 +76,8 @@ def alto_to_dict(alto, raise_errors=True):
             value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
         elif localname == 'fileName':
             value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'fileIdentifier':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
 
         elif localname == 'Layout':
             value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
diff --git a/src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml b/src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml
new file mode 100644
index 0000000..7f658fa
--- /dev/null
+++ b/src/mods4pandas/tests/data/alto/PPN1844793923/00000017.xml
@@ -0,0 +1,663 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--Angefertigt im Rahmen der Citizen Science-Werkstatt "Frauen* im Fokus" der Staatsbibliothek zu Berlin und der Universität Potsdam (2023-2024)-->
+<!--Veröffentlicht unter CC0 1.0 Universal Licence (https://creativecommons.org/publicdomain/zero/1.0/)-->
+<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#"
+      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-2.xsd">
+   <Description>
+      <MeasurementUnit>pixel</MeasurementUnit>
+      <sourceImageInformation>
+         <fileName>16_b079a_default.jpg</fileName>
+         <fileIdentifier>https://content.staatsbibliothek-berlin.de/dc/1844793923-0017/full/full/0/default.jpg</fileIdentifier>
+      </sourceImageInformation>
+   </Description>
+   <Tags>
+      <OtherTag ID="BT1" LABEL="Title" DESCRIPTION="block type Title"/>
+      <OtherTag ID="BT2" LABEL="Main" DESCRIPTION="block type Main"/>
+      <OtherTag ID="BT3" LABEL="Commentary" DESCRIPTION="block type Commentary"/>
+      <OtherTag ID="BT4" LABEL="Illustration" DESCRIPTION="block type Illustration"/>
+      <OtherTag ID="BT615" LABEL="text" DESCRIPTION="block type text"/>
+      <OtherTag ID="LT357" LABEL="default" DESCRIPTION="line type default"/>
+   </Tags>
+   <Layout>
+      <Page WIDTH="1400"
+            HEIGHT="2256"
+            PHYSICAL_IMG_NR="16"
+            ID="eSc_dummypage_">
+         <PrintSpace HPOS="0" VPOS="0" WIDTH="1400" HEIGHT="2256">
+            <TextBlock HPOS="620"
+                       VPOS="110"
+                       WIDTH="54"
+                       HEIGHT="44"
+                       ID="eSc_textblock_badd9679"
+                       TAGREFS="BT615">
+               <Shape>
+                  <Polygon POINTS="621 110 659 110 674 134 654 154 620 149 621 110"/>
+               </Shape>
+               <TextLine ID="eSc_line_8138ac12"
+                         TAGREFS="LT357"
+                         BASELINE="619 142 669 142"
+                         HPOS="619"
+                         VPOS="109"
+                         WIDTH="50"
+                         HEIGHT="55">
+                  <Shape>
+                     <Polygon POINTS="656 110 644 110 631 110 625 109 619 109 619 142 619 160 636 164 637 164 639 164 669 152 669 142 668 135 656 110"/>
+                  </Shape>
+                  <String CONTENT="11"
+                          HPOS="619"
+                          VPOS="109"
+                          WIDTH="50"
+                          HEIGHT="55"
+                          WC="0.9999991655349731"/>
+               </TextLine>
+            </TextBlock>
+            <TextBlock HPOS="65"
+                       VPOS="200"
+                       WIDTH="1164"
+                       HEIGHT="1809"
+                       ID="eSc_textblock_848130e8"
+                       TAGREFS="BT615">
+               <Shape>
+                  <Polygon POINTS="66 200 1194 200 1219 215 1229 1803 1214 2009 65 2004 66 200"/>
+               </Shape>
+               <TextLine ID="eSc_line_224db9b6"
+                         TAGREFS="LT357"
+                         BASELINE="62 235 1210 235"
+                         HPOS="62"
+                         VPOS="198"
+                         WIDTH="1148"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="1203 205 1190 200 1178 200 1174 200 1145 205 1144 205 1104 200 1102 200 1090 200 1077 200 1075 200 1062 205 1061 206 1029 206 1027 205 1020 200 1015 200 1002 200 991 200 958 204 947 205 945 204 938 200 927 200 914 200 902 200 889 200 877 200 864 200 852 200 839 200 837 200 830 203 828 204 819 203 798 200 789 200 777 200 765 200 758 203 748 206 674 206 666 201 663 200 651 200 637 200 626 200 614 200 601 200 589 200 576 200 564 200 551 200 538 200 526 200 513 200 501 200 488 200 476 200 473 200 419 205 406 200 401 200 388 200 376 200 363 200 353 200 297 205 266 200 263 200 250 200 238 200 225 200 215 200 144 205 88 200 87 200 75 200 73 198 62 198 62 235 62 246 80 255 81 255 82 255 185 248 266 255 268 256 437 256 437 255 438 255 439 255 451 248 523 255 525 256 579 256 579 255 580 255 581 255 591 248 639 255 640 255 641 255 670 248 708 255 709 255 763 248 792 255 793 255 794 255 795 255 818 249 843 255 844 255 846 256 991 256 991 255 992 255 1022 248 1039 254 1040 254 1041 254 1077 248 1170 254 1171 254 1210 246 1210 235 1210 210 1203 205"/>
+                  </Shape>
+                  <String CONTENT="ihrigen völlig gleiche. Es iſt das eine Wahrheit, die wir viel⸗"
+                          HPOS="62"
+                          VPOS="198"
+                          WIDTH="1148"
+                          HEIGHT="58"
+                          WC="0.9932242121015277"/>
+               </TextLine>
+               <TextLine ID="eSc_line_a0d143de"
+                         TAGREFS="LT357"
+                         BASELINE="63 287 1208 287"
+                         HPOS="63"
+                         VPOS="249"
+                         WIDTH="1145"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="63 287 63 304 209 298 219 304 221 305 223 305 224 305 225 307 350 307 350 305 388 303 446 298 453 302 456 304 457 304 458 304 531 305 532 305 533 305 543 302 553 298 587 302 627 305 629 305 686 300 723 298 763 300 847 305 848 305 849 305 851 305 859 299 863 298 906 298 909 299 916 304 917 304 918 304 1007 305 1008 305 1064 298 1069 298 1074 298 1124 305 1125 305 1208 297 1208 287 1208 259 1174 249 1173 249 1171 249 1129 259 1128 259 1126 259 1100 250 1099 250 1097 250 1051 258 983 249 982 249 981 249 957 258 952 259 946 258 922 249 921 249 919 249 918 249 903 256 898 260 854 260 833 256 788 249 787 249 715 255 665 259 632 254 599 249 597 249 596 249 585 254 571 259 522 253 507 251 506 251 505 251 503 251 502 253 490 260 428 260 416 251 414 251 413 251 412 251 408 251 382 258 352 251 339 249 338 249 322 251 265 259 239 250 235 249 234 249 233 249 226 250 186 259 169 249 167 249 166 249 165 249 164 249 162 249 137 259 122 249 121 249 120 249 63 249 63 287"/>
+                  </Shape>
+                  <String CONTENT="fach in Deutſchland auch erkennen müſſen. So hat die Er⸗"
+                          HPOS="63"
+                          VPOS="249"
+                          WIDTH="1145"
+                          HEIGHT="58"
+                          WC="0.9997389763593674"/>
+               </TextLine>
+               <TextLine ID="eSc_line_8e9f4e0d"
+                         TAGREFS="LT357"
+                         BASELINE="63 338 1211 340"
+                         HPOS="62"
+                         VPOS="302"
+                         WIDTH="1149"
+                         HEIGHT="52">
+                  <Shape>
+                     <Polygon POINTS="63 338 62 353 1209 354 1211 340 1211 313 1185 303 1184 303 1183 303 1013 310 981 312 977 310 966 303 965 303 963 303 919 303 918 303 917 303 892 309 881 312 868 309 839 303 838 303 837 303 819 309 810 312 802 308 787 303 785 303 784 303 783 303 773 308 765 312 758 308 747 303 745 303 744 303 723 308 705 312 698 308 691 303 690 303 689 303 688 303 686 303 676 307 668 310 656 307 637 302 636 302 635 302 599 307 569 310 416 304 360 303 359 303 358 303 355 304 340 310 309 304 298 302 297 302 295 302 292 303 278 312 146 312 63 302 63 338"/>
+                  </Shape>
+                  <String CONTENT="fahrung z. B. keineswegs gelehrt, daß die bei uns übliche"
+                          HPOS="62"
+                          VPOS="302"
+                          WIDTH="1149"
+                          HEIGHT="52"
+                          WC="0.9990628215304592"/>
+               </TextLine>
+               <TextLine ID="eSc_line_3701a0da"
+                         TAGREFS="LT357"
+                         BASELINE="63 389 1211 392"
+                         HPOS="62"
+                         VPOS="350"
+                         WIDTH="1149"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="63 389 62 399 243 408 244 409 303 409 303 408 304 408 305 408 314 402 315 402 360 403 574 408 575 408 607 404 635 402 664 406 698 409 699 409 729 406 752 403 764 406 775 409 777 409 778 409 844 407 937 403 1209 409 1211 392 1210 360 1096 353 1095 353 1094 353 1092 353 1084 359 1077 363 1023 363 1005 358 981 353 980 353 956 358 936 362 911 357 886 353 884 353 851 357 809 362 794 357 785 353 784 353 661 353 660 353 659 353 654 355 637 362 557 354 527 352 526 352 525 352 518 354 500 360 428 353 422 353 421 353 419 353 418 353 396 360 364 353 360 352 359 352 358 352 357 352 354 353 339 362 261 362 226 352 225 352 224 352 223 352 174 360 154 353 152 353 151 353 109 359 63 350 63 389"/>
+                  </Shape>
+                  <String CONTENT="Vorbereitung auf das höhere Lehrfach gerade eine günſtige"
+                          HPOS="62"
+                          VPOS="350"
+                          WIDTH="1149"
+                          HEIGHT="59"
+                          WC="0.9992533409804628"/>
+               </TextLine>
+               <TextLine ID="eSc_line_d164fc0e"
+                         TAGREFS="LT357"
+                         BASELINE="66 441 1211 441"
+                         HPOS="66"
+                         VPOS="408"
+                         WIDTH="1145"
+                         HEIGHT="56">
+                  <Shape>
+                     <Polygon POINTS="66 441 66 454 210 454 213 456 224 463 225 463 226 463 228 464 377 464 377 463 378 463 397 456 402 454 412 456 448 463 449 464 510 464 510 463 511 463 532 457 541 454 557 457 602 463 604 464 726 464 726 463 763 457 782 454 792 457 805 462 807 462 808 462 809 462 823 457 832 454 841 458 849 461 851 461 852 461 853 461 859 458 867 454 1000 454 1011 458 1025 463 1026 463 1027 463 1029 463 1044 458 1056 454 1081 458 1097 461 1099 461 1117 458 1149 454 1155 458 1161 463 1163 463 1164 463 1165 463 1211 458 1211 441 1211 411 66 408 66 441"/>
+                  </Shape>
+                  <String CONTENT="Vorbedingung für den Mädch enlehrer ſei, und doch wird ſie"
+                          HPOS="66"
+                          VPOS="408"
+                          WIDTH="1145"
+                          HEIGHT="56"
+                          WC="0.9875460811730089"/>
+               </TextLine>
+               <TextLine ID="eSc_line_6e7da7da"
+                         TAGREFS="LT357"
+                         BASELINE="63 491 1210 493"
+                         HPOS="62"
+                         VPOS="456"
+                         WIDTH="1148"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="63 491 62 505 343 512 344 512 345 512 373 505 376 505 377 505 387 512 388 512 389 512 391 513 525 513 525 512 582 505 591 505 605 505 723 513 724 515 772 515 772 513 773 513 774 513 785 506 853 513 854 513 948 506 965 513 966 513 967 513 1057 506 1097 513 1099 513 1100 513 1121 506 1188 512 1189 512 1190 512 1209 505 1210 493 1208 461 1102 466 1076 459 1062 457 995 457 993 457 971 459 924 466 856 459 825 457 824 457 812 459 783 466 765 459 759 457 758 457 757 457 745 459 721 464 704 459 698 457 696 457 695 457 694 457 693 457 690 459 678 467 640 467 619 459 609 456 607 456 606 456 605 456 595 458 577 464 553 458 540 456 538 456 523 458 498 464 486 458 480 456 478 456 401 456 399 456 398 456 391 458 373 464 312 464 300 458 295 456 294 456 249 456 248 456 246 456 241 458 229 464 184 458 165 456 164 456 146 458 101 464 63 458 63 491"/>
+                  </Shape>
+                  <String CONTENT="von den Lehrern ſelbſt für die einzig richtige gehalten, und"
+                          HPOS="62"
+                          VPOS="456"
+                          WIDTH="1148"
+                          HEIGHT="59"
+                          WC="0.9964237123727798"/>
+               </TextLine>
+               <TextLine ID="eSc_line_750a0366"
+                         TAGREFS="LT357"
+                         BASELINE="63 546 1211 543"
+                         HPOS="62"
+                         VPOS="507"
+                         WIDTH="1149"
+                         HEIGHT="63">
+                  <Shape>
+                     <Polygon POINTS="63 546 65 570 97 558 213 558 233 569 234 569 235 570 283 570 283 569 284 569 285 569 297 561 313 569 314 569 315 569 317 570 491 570 491 569 492 569 493 569 495 567 507 560 581 567 605 569 606 569 607 569 616 566 651 557 666 566 668 567 669 567 670 567 671 567 758 566 759 566 760 566 770 560 809 565 825 567 827 567 828 567 829 567 832 565 844 557 986 557 1046 564 1081 567 1082 567 1111 564 1136 560 1149 564 1160 566 1161 566 1163 566 1164 566 1174 562 1191 557 1208 562 1211 543 1210 517 1174 507 1173 507 1171 507 1170 507 1148 517 1133 517 1100 517 1100 516 1079 507 1077 507 993 507 992 507 991 507 971 516 970 516 913 507 912 507 911 507 891 516 889 517 849 517 848 516 827 507 825 507 824 507 760 508 759 508 743 516 741 516 740 516 718 507 716 507 681 507 680 507 679 507 664 516 663 516 656 516 609 508 607 508 606 508 594 515 590 517 553 515 485 511 483 511 463 515 452 517 446 515 431 508 429 508 428 508 378 515 357 517 329 515 266 508 265 508 62 513 63 546"/>
+                  </Shape>
+                  <String CONTENT="jeder Vorſchlag, Lehrerinnen für Oberklaſſen eine tüchtige, für"
+                          HPOS="62"
+                          VPOS="507"
+                          WIDTH="1149"
+                          HEIGHT="63"
+                          WC="0.9951480844664196"/>
+               </TextLine>
+               <TextLine ID="eSc_line_1f1aab06"
+                         TAGREFS="LT357"
+                         BASELINE="63 597 1213 595"
+                         HPOS="62"
+                         VPOS="564"
+                         WIDTH="1151"
+                         HEIGHT="53">
+                  <Shape>
+                     <Polygon POINTS="63 597 63 611 491 617 492 617 606 611 637 610 664 611 741 616 743 616 1211 610 1213 595 1209 564 62 564 63 597"/>
+                  </Shape>
+                  <String CONTENT="ihre Zwecke geeignete, aber von der bisher für Männer üblichen"
+                          HPOS="62"
+                          VPOS="564"
+                          WIDTH="1151"
+                          HEIGHT="53"
+                          WC="0.9934530392769845"/>
+               </TextLine>
+               <TextLine ID="eSc_line_2a35be96"
+                         TAGREFS="LT357"
+                         BASELINE="63 646 1210 649"
+                         HPOS="62"
+                         VPOS="612"
+                         WIDTH="1148"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="63 646 62 658 142 669 144 669 211 663 367 669 368 670 584 670 584 669 585 669 599 663 640 670 641 671 749 671 749 670 750 670 770 663 772 661 774 663 792 670 793 670 794 670 844 663 856 661 918 661 922 663 928 668 929 668 931 668 932 668 933 668 950 663 957 661 1095 661 1101 664 1114 670 1115 670 1116 670 1117 670 1130 664 1135 661 1208 664 1210 649 1210 621 924 614 923 614 922 614 897 620 872 614 871 614 869 614 798 621 788 621 785 621 764 614 763 614 696 614 695 614 694 614 693 614 683 621 680 622 641 622 636 621 601 612 600 612 599 612 571 621 427 621 379 612 378 612 322 620 297 612 295 612 294 612 293 612 269 620 245 614 244 614 243 614 204 620 157 612 156 612 121 620 90 612 88 612 87 612 86 612 63 620 63 646"/>
+                  </Shape>
+                  <String CONTENT="abweichende Vorbildung zu geben, ſtößt gerade bei den Lehrern,"
+                          HPOS="62"
+                          VPOS="612"
+                          WIDTH="1148"
+                          HEIGHT="59"
+                          WC="0.9888591131856365"/>
+               </TextLine>
+               <TextLine ID="eSc_line_9a56aaff"
+                         TAGREFS="LT357"
+                         BASELINE="65 700 1210 698"
+                         HPOS="63"
+                         VPOS="663"
+                         WIDTH="1147"
+                         HEIGHT="53">
+                  <Shape>
+                     <Polygon POINTS="65 700 65 713 1208 716 1210 698 1208 669 1171 663 1170 663 1169 663 1158 668 1151 673 1105 673 1080 668 1046 663 1045 663 1044 663 1035 668 1026 673 981 673 971 668 961 663 960 663 911 663 909 663 908 663 894 666 881 671 808 666 763 664 762 664 739 666 696 671 680 666 674 664 673 664 671 664 670 664 661 666 645 670 616 665 602 664 601 664 600 664 595 665 577 673 558 665 553 664 552 664 551 664 550 664 543 665 525 670 468 665 449 664 448 664 447 664 442 665 416 673 347 665 337 664 335 664 334 664 333 664 317 674 282 674 249 664 246 664 245 664 244 664 235 664 134 673 63 664 65 700"/>
+                  </Shape>
+                  <String CONTENT="die es ernſt mit der Mädchenſchule meinen, auf den Einwand:"
+                          HPOS="63"
+                          VPOS="663"
+                          WIDTH="1147"
+                          HEIGHT="53"
+                          WC="0.9915219272597361"/>
+               </TextLine>
+               <TextLine ID="eSc_line_be07dd7d"
+                         TAGREFS="LT357"
+                         BASELINE="65 750 1211 750"
+                         HPOS="65"
+                         VPOS="716"
+                         WIDTH="1146"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="65 750 65 765 150 765 166 773 167 773 169 773 170 773 189 765 297 765 310 773 312 773 313 773 314 774 500 774 500 773 501 773 521 765 566 773 567 774 656 774 656 773 658 773 659 773 670 765 686 773 688 773 689 774 913 774 913 773 914 773 916 773 929 765 973 765 987 773 988 773 990 773 991 773 992 773 1023 765 1070 772 1071 772 1095 767 1120 773 1121 773 1123 773 1124 773 1140 765 1211 765 1211 750 1211 720 65 716 65 750"/>
+                  </Shape>
+                  <String CONTENT="das iſt keine Wiſſenſchaft. Wenn ich perſönlich dieſen Einwand"
+                          HPOS="65"
+                          VPOS="716"
+                          WIDTH="1146"
+                          HEIGHT="58"
+                          WC="0.9983825597070879"/>
+               </TextLine>
+               <TextLine ID="eSc_line_408ede24"
+                         TAGREFS="LT357"
+                         BASELINE="66 800 1214 803"
+                         HPOS="66"
+                         VPOS="767"
+                         WIDTH="1148"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="66 800 66 822 102 815 205 822 238 823 239 824 386 824 386 823 387 823 397 820 411 817 444 820 473 823 475 823 476 823 485 820 497 815 564 819 639 824 640 825 725 825 725 824 726 824 728 824 735 818 736 818 745 818 797 824 798 824 799 824 815 818 820 817 832 818 874 824 876 824 877 824 892 818 896 817 913 818 988 823 990 824 990 823 1076 817 1082 817 1092 823 1094 823 1095 823 1096 823 1138 817 1139 817 1175 824 1176 824 1213 815 1214 803 1214 778 1170 768 1169 768 933 775 901 775 898 774 886 768 884 768 829 768 828 768 793 774 782 775 769 773 739 768 738 768 685 773 655 775 650 773 641 767 640 767 639 767 637 767 609 772 596 774 439 770 270 767 269 767 251 769 215 774 203 768 198 767 196 767 195 767 194 767 189 768 172 775 114 775 102 768 101 767 100 767 99 767 97 767 66 767 66 800"/>
+                  </Shape>
+                  <String CONTENT="für unberechtigt halte, wenn ich glaube, daß man, auch ohne"
+                          HPOS="66"
+                          VPOS="767"
+                          WIDTH="1148"
+                          HEIGHT="58"
+                          WC="0.998726772049726"/>
+               </TextLine>
+               <TextLine ID="eSc_line_20a902e6"
+                         TAGREFS="LT357"
+                         BASELINE="66 854 1211 852"
+                         HPOS="65"
+                         VPOS="814"
+                         WIDTH="1146"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="66 854 66 872 1210 867 1211 852 1209 823 1141 814 1140 814 1087 815 1086 815 1074 822 1069 825 968 825 963 822 952 814 951 814 950 814 948 814 947 814 923 822 912 824 853 822 729 814 728 814 676 820 635 827 572 827 558 820 546 815 545 815 543 815 457 820 417 822 374 819 299 815 298 815 273 819 244 824 235 819 228 815 226 815 132 815 131 815 130 815 129 815 125 819 115 825 65 819 66 854"/>
+                  </Shape>
+                  <String CONTENT="in den Fehler der Halbbildung und der Anpaſſung an die ſo⸗"
+                          HPOS="65"
+                          VPOS="814"
+                          WIDTH="1146"
+                          HEIGHT="58"
+                          WC="0.9782982016431874"/>
+               </TextLine>
+               <TextLine ID="eSc_line_9ece79d4"
+                         TAGREFS="LT357"
+                         BASELINE="66 908 1211 906"
+                         HPOS="65"
+                         VPOS="869"
+                         WIDTH="1146"
+                         HEIGHT="54">
+                  <Shape>
+                     <Polygon POINTS="66 908 66 922 1209 923 1211 906 1208 869 1102 877 1059 869 1055 869 1054 869 1052 869 1051 869 1035 877 962 871 961 871 960 871 931 877 916 871 913 869 912 869 911 869 903 871 872 877 847 871 841 869 839 869 838 869 792 871 636 878 585 872 569 871 567 871 566 871 561 872 545 878 531 872 526 871 525 871 461 871 459 871 458 871 453 873 441 878 379 873 343 871 342 871 317 873 280 878 65 876 66 908"/>
+                  </Shape>
+                  <String CONTENT="genannten weiblichen Fähigkeiten zu verfallen, doch unbedenklich"
+                          HPOS="65"
+                          VPOS="869"
+                          WIDTH="1146"
+                          HEIGHT="54"
+                          WC="0.9946558028459549"/>
+               </TextLine>
+               <TextLine ID="eSc_line_c94c8b73"
+                         TAGREFS="LT357"
+                         BASELINE="63 958 1211 958"
+                         HPOS="63"
+                         VPOS="922"
+                         WIDTH="1148"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="63 958 63 971 208 978 209 978 365 973 398 972 401 973 416 980 417 980 418 981 574 981 574 980 575 980 587 973 589 973 594 973 630 980 631 980 632 980 649 975 656 972 665 975 686 980 688 980 689 980 726 975 738 973 753 975 805 980 807 980 808 980 824 975 835 972 872 975 928 980 929 980 965 976 995 972 1047 972 1055 976 1061 980 1062 980 1064 980 1065 980 1089 976 1114 972 1126 976 1138 980 1139 980 1140 980 1211 976 1211 958 1211 932 1115 922 1114 922 990 929 961 931 957 929 941 922 940 922 938 922 721 927 656 928 646 927 624 922 622 922 621 922 607 926 591 932 414 932 402 924 397 922 396 922 344 922 343 922 342 922 340 922 338 923 327 932 246 932 233 924 231 924 230 924 229 924 228 924 218 932 149 932 124 922 122 922 63 922 63 958"/>
+                  </Shape>
+                  <String CONTENT="einen anderen Bildungsweg als den jetzt üblichen einſchlagen,"
+                          HPOS="63"
+                          VPOS="922"
+                          WIDTH="1148"
+                          HEIGHT="59"
+                          WC="0.9841961567519141"/>
+               </TextLine>
+               <TextLine ID="eSc_line_592b1c73"
+                         TAGREFS="LT357"
+                         BASELINE="66 1011 1213 1008"
+                         HPOS="65"
+                         VPOS="975"
+                         WIDTH="1148"
+                         HEIGHT="57">
+                  <Shape>
+                     <Polygon POINTS="66 1011 66 1032 140 1023 170 1031 171 1031 172 1032 204 1032 260 1032 260 1031 261 1031 263 1031 274 1023 313 1031 318 1031 319 1031 320 1031 322 1031 339 1023 388 1030 391 1030 392 1030 394 1030 434 1023 444 1030 446 1031 447 1031 448 1031 449 1032 621 1032 621 1031 622 1031 629 1028 636 1025 658 1027 675 1030 676 1030 678 1030 683 1027 693 1022 726 1027 739 1028 740 1028 741 1028 743 1028 745 1027 753 1022 782 1026 797 1028 798 1028 834 1026 888 1022 901 1026 916 1030 917 1030 918 1031 1164 1031 1164 1030 1211 1022 1213 1008 1210 981 1064 975 1062 975 1061 975 1041 980 1030 982 1016 980 990 975 988 975 948 980 924 982 911 980 889 975 888 975 887 975 886 975 879 980 874 983 842 983 823 978 798 975 797 975 770 978 750 982 744 978 738 975 736 975 735 975 659 975 658 975 656 975 646 978 634 983 604 978 595 977 594 977 592 977 591 977 590 978 581 985 477 985 458 978 452 976 451 976 449 976 434 977 397 983 378 977 373 976 372 976 370 976 362 977 335 983 308 977 299 976 298 976 297 976 292 977 273 985 146 985 132 977 130 976 129 976 127 976 65 976 66 1011"/>
+                  </Shape>
+                  <String CONTENT="daß man z. B. den Umweg durch die alten Sprachen ſich zum"
+                          HPOS="65"
+                          VPOS="975"
+                          WIDTH="1148"
+                          HEIGHT="57"
+                          WC="0.9897867263409129"/>
+               </TextLine>
+               <TextLine ID="eSc_line_3a4e0610"
+                         TAGREFS="LT357"
+                         BASELINE="66 1064 1213 1060"
+                         HPOS="65"
+                         VPOS="1023"
+                         WIDTH="1148"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="66 1064 66 1082 147 1075 283 1075 293 1081 294 1081 295 1081 297 1081 448 1074 462 1080 463 1080 464 1080 466 1080 493 1074 545 1080 546 1080 547 1080 548 1080 574 1074 586 1080 587 1080 589 1080 590 1080 614 1074 636 1080 637 1080 639 1080 640 1081 829 1081 829 1080 843 1079 897 1072 912 1079 913 1079 914 1079 916 1079 1211 1076 1213 1060 1210 1032 1165 1023 1164 1023 1163 1023 1146 1031 1115 1023 1114 1023 1112 1023 1111 1023 1095 1032 1039 1032 977 1023 976 1023 975 1023 960 1031 864 1023 863 1023 862 1023 861 1023 848 1031 695 1025 694 1025 693 1025 691 1025 678 1034 619 1034 595 1025 594 1025 592 1025 546 1032 526 1025 525 1025 523 1025 485 1032 448 1025 447 1025 446 1025 426 1034 129 1026 127 1026 65 1035 66 1064"/>
+                  </Shape>
+                  <String CONTENT="großen Teil ſparen könnte, ſo verſtehe ich doch, daß, wie die"
+                          HPOS="65"
+                          VPOS="1023"
+                          WIDTH="1148"
+                          HEIGHT="59"
+                          WC="0.9951545205272612"/>
+               </TextLine>
+               <TextLine ID="eSc_line_44befd6d"
+                         TAGREFS="LT357"
+                         BASELINE="66 1112 1210 1109"
+                         HPOS="65"
+                         VPOS="1079"
+                         WIDTH="1145"
+                         HEIGHT="51">
+                  <Shape>
+                     <Polygon POINTS="66 1112 66 1122 125 1122 127 1124 141 1130 142 1130 144 1130 145 1130 162 1124 166 1122 233 1122 235 1124 244 1130 245 1130 246 1130 248 1130 451 1125 574 1121 636 1125 699 1129 700 1129 747 1125 802 1121 813 1125 820 1129 822 1129 823 1129 958 1128 960 1128 963 1126 975 1120 1010 1126 1020 1128 1021 1128 1030 1126 1065 1120 1076 1126 1077 1128 1079 1128 1080 1128 1081 1128 1209 1126 1210 1109 1206 1079 65 1081 66 1112"/>
+                  </Shape>
+                  <String CONTENT="Sachen liegen, die Frauen in England zunächſt das Prinzip"
+                          HPOS="65"
+                          VPOS="1079"
+                          WIDTH="1145"
+                          HEIGHT="51"
+                          WC="0.9997124786962542"/>
+               </TextLine>
+               <TextLine ID="eSc_line_58f44575"
+                         TAGREFS="LT357"
+                         BASELINE="65 1160 1213 1163"
+                         HPOS="63"
+                         VPOS="1126"
+                         WIDTH="1150"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="65 1160 63 1179 322 1183 323 1183 378 1179 424 1175 444 1179 471 1183 472 1183 473 1183 475 1183 481 1178 487 1175 584 1175 590 1178 597 1181 599 1181 600 1181 601 1181 610 1178 616 1175 622 1178 639 1184 640 1184 641 1184 689 1178 691 1178 693 1178 716 1184 718 1184 719 1185 852 1185 852 1184 884 1176 889 1176 901 1176 998 1184 1000 1184 1211 1175 1213 1163 1211 1134 938 1128 937 1128 936 1128 924 1134 919 1136 700 1136 694 1134 679 1128 678 1128 676 1128 637 1134 631 1134 626 1134 594 1126 592 1126 555 1134 548 1134 537 1134 464 1126 463 1126 462 1126 442 1134 439 1134 436 1134 402 1126 401 1126 399 1126 398 1126 387 1134 384 1135 218 1135 211 1134 172 1126 171 1126 170 1126 169 1126 159 1134 157 1134 152 1134 119 1126 117 1126 116 1126 115 1126 104 1134 101 1134 65 1134 65 1160"/>
+                  </Shape>
+                  <String CONTENT="aufſtellten, genau denſelben Studiengang zu verfolgen und die⸗"
+                          HPOS="63"
+                          VPOS="1126"
+                          WIDTH="1150"
+                          HEIGHT="59"
+                          WC="0.9797537374881006"/>
+               </TextLine>
+               <TextLine ID="eSc_line_ee9d034a"
+                         TAGREFS="LT357"
+                         BASELINE="67 1218 1213 1215"
+                         HPOS="66"
+                         VPOS="1178"
+                         WIDTH="1147"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="67 1218 67 1234 376 1229 386 1234 386 1235 387 1235 388 1235 389 1237 483 1237 483 1235 491 1234 527 1229 556 1234 562 1235 564 1235 572 1234 614 1229 634 1234 636 1235 637 1235 639 1235 640 1235 644 1234 670 1228 686 1234 688 1234 689 1234 832 1228 1210 1234 1213 1215 1210 1184 1166 1178 1165 1178 1114 1184 1087 1186 1082 1184 1076 1179 1075 1179 1074 1179 1008 1179 1007 1179 1006 1179 990 1184 980 1186 973 1184 962 1178 961 1178 960 1178 933 1184 926 1185 922 1184 911 1178 909 1178 867 1178 866 1178 864 1178 863 1178 854 1184 849 1188 749 1188 743 1184 733 1179 731 1179 730 1179 671 1184 665 1184 664 1184 644 1178 642 1178 641 1178 640 1178 619 1184 605 1188 587 1184 574 1181 572 1181 571 1181 570 1181 566 1184 560 1189 459 1189 452 1184 444 1179 443 1179 442 1179 441 1179 439 1179 427 1184 417 1188 404 1184 387 1179 386 1179 384 1179 383 1179 370 1184 362 1189 317 1189 309 1184 299 1179 298 1179 297 1179 295 1179 279 1184 268 1189 230 1189 224 1184 216 1179 215 1179 214 1179 213 1179 179 1184 166 1186 155 1184 129 1179 127 1179 66 1184 67 1218"/>
+                  </Shape>
+                  <String CONTENT="ſelben Examina abzulegen wie die Männer. Wie ſie ſich innerlich"
+                          HPOS="66"
+                          VPOS="1178"
+                          WIDTH="1147"
+                          HEIGHT="59"
+                          WC="0.9896195313287159"/>
+               </TextLine>
+               <TextLine ID="eSc_line_ff145eff"
+                         TAGREFS="LT357"
+                         BASELINE="67 1270 1215 1268"
+                         HPOS="66"
+                         VPOS="1232"
+                         WIDTH="1149"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="67 1270 68 1289 92 1283 177 1289 189 1289 190 1290 332 1290 332 1289 333 1289 334 1289 335 1288 343 1283 383 1288 399 1289 401 1289 418 1288 453 1283 483 1287 501 1289 502 1289 516 1287 536 1283 548 1287 557 1289 558 1289 560 1289 620 1287 701 1282 728 1285 747 1288 748 1288 1214 1282 1215 1268 1213 1239 1164 1232 1163 1232 965 1238 898 1232 897 1232 864 1238 784 1232 783 1232 782 1232 763 1238 724 1232 723 1232 681 1238 665 1232 664 1232 663 1232 661 1232 637 1239 314 1233 313 1233 241 1239 199 1233 198 1233 167 1239 126 1233 125 1233 66 1240 67 1270"/>
+                  </Shape>
+                  <String CONTENT="zu der Frage ſtellten, darauf werde ich ſpäter Gelegenheit haben"
+                          HPOS="66"
+                          VPOS="1232"
+                          WIDTH="1149"
+                          HEIGHT="58"
+                          WC="0.9887113310396671"/>
+               </TextLine>
+               <TextLine ID="eSc_line_8002b118"
+                         TAGREFS="LT357"
+                         BASELINE="66 1322 1214 1319"
+                         HPOS="65"
+                         VPOS="1284"
+                         WIDTH="1149"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="66 1322 66 1342 117 1334 167 1341 169 1341 199 1334 408 1339 476 1341 477 1341 503 1338 553 1334 655 1337 734 1339 735 1339 736 1339 738 1339 741 1337 747 1333 768 1337 789 1339 790 1341 1131 1341 1131 1339 1166 1333 1176 1339 1178 1339 1179 1339 1180 1339 1213 1332 1214 1319 1210 1284 1115 1293 1097 1285 1095 1284 1094 1284 1042 1284 1041 1284 1040 1284 1035 1285 1011 1293 988 1285 986 1285 985 1285 869 1284 868 1284 867 1284 853 1288 830 1293 790 1288 757 1284 755 1284 754 1284 738 1289 724 1293 716 1289 706 1284 705 1284 640 1284 639 1284 637 1284 624 1290 612 1294 587 1290 570 1288 569 1288 567 1288 562 1290 553 1295 454 1295 446 1292 423 1285 422 1285 421 1285 419 1285 407 1293 403 1294 381 1293 307 1287 305 1287 248 1293 223 1285 221 1285 220 1285 185 1293 174 1285 172 1285 171 1285 127 1285 126 1285 65 1295 66 1322"/>
+                  </Shape>
+                  <String CONTENT="zurückzukommen; es waren vorläufig Opportunitätsgründe, die"
+                          HPOS="65"
+                          VPOS="1284"
+                          WIDTH="1149"
+                          HEIGHT="58"
+                          WC="0.989206098904044"/>
+               </TextLine>
+               <TextLine ID="eSc_line_bc1026b2"
+                         TAGREFS="LT357"
+                         BASELINE="67 1371 1214 1371"
+                         HPOS="67"
+                         VPOS="1336"
+                         WIDTH="1147"
+                         HEIGHT="56">
+                  <Shape>
+                     <Polygon POINTS="67 1371 68 1391 95 1384 161 1391 169 1391 170 1391 179 1391 245 1384 260 1389 263 1391 264 1391 265 1391 266 1391 270 1389 290 1384 303 1389 305 1391 307 1391 308 1391 309 1391 313 1389 330 1384 345 1389 350 1391 352 1391 353 1391 354 1391 360 1389 378 1384 427 1388 452 1391 453 1391 486 1388 537 1384 624 1384 631 1387 637 1391 639 1391 640 1391 686 1387 721 1384 730 1387 743 1391 744 1391 745 1391 747 1391 759 1387 768 1384 784 1387 814 1391 815 1391 841 1386 851 1384 859 1386 883 1391 884 1392 1022 1392 1022 1391 1023 1391 1025 1391 1032 1384 1035 1384 1051 1384 1185 1391 1186 1391 1188 1391 1214 1383 1214 1371 1213 1336 1146 1342 1110 1336 1109 1336 1107 1336 1042 1342 950 1336 946 1336 862 1336 861 1336 859 1336 858 1336 849 1342 833 1336 832 1336 830 1336 829 1336 799 1342 755 1336 753 1336 752 1336 749 1336 718 1342 659 1336 654 1336 652 1336 650 1336 621 1342 595 1336 594 1336 592 1336 591 1336 587 1336 543 1342 486 1336 480 1336 478 1336 475 1336 448 1342 67 1337 67 1371"/>
+                  </Shape>
+                  <String CONTENT="ſie beſtimmten: ſie wollten die ihnen ſo oft beſtrittene Fähigkeit"
+                          HPOS="67"
+                          VPOS="1336"
+                          WIDTH="1147"
+                          HEIGHT="56"
+                          WC="0.9987486862775051"/>
+               </TextLine>
+               <TextLine ID="eSc_line_8e130807"
+                         TAGREFS="LT357"
+                         BASELINE="66 1418 1213 1422"
+                         HPOS="65"
+                         VPOS="1383"
+                         WIDTH="1148"
+                         HEIGHT="57">
+                  <Shape>
+                     <Polygon POINTS="66 1418 65 1430 120 1438 121 1440 307 1440 307 1438 308 1438 327 1432 328 1432 338 1432 449 1440 451 1440 452 1440 466 1433 470 1433 531 1440 532 1440 533 1440 535 1440 543 1433 546 1433 1210 1438 1213 1422 1211 1392 1017 1386 1016 1386 1015 1386 1013 1386 1005 1392 1002 1393 993 1392 967 1386 966 1386 965 1386 950 1392 941 1396 872 1396 854 1392 820 1384 819 1384 747 1392 723 1393 701 1392 636 1384 635 1384 634 1384 620 1392 616 1393 601 1392 558 1386 557 1386 480 1392 449 1393 437 1392 391 1383 389 1383 388 1383 362 1391 345 1384 344 1384 343 1384 342 1384 327 1392 325 1393 250 1393 244 1392 198 1383 196 1383 157 1392 154 1392 151 1392 130 1383 129 1383 127 1383 66 1391 66 1418"/>
+                  </Shape>
+                  <String CONTENT="nachweiſen, zu leiſten, was die Männer leiſten, und ſich ſo Ver⸗"
+                          HPOS="65"
+                          VPOS="1383"
+                          WIDTH="1148"
+                          HEIGHT="57"
+                          WC="0.9979207767173648"/>
+               </TextLine>
+               <TextLine ID="eSc_line_1108a642"
+                         TAGREFS="LT357"
+                         BASELINE="66 1473 1214 1470"
+                         HPOS="65"
+                         VPOS="1433"
+                         WIDTH="1149"
+                         HEIGHT="60">
+                  <Shape>
+                     <Polygon POINTS="66 1473 66 1485 261 1485 265 1485 293 1492 294 1492 295 1492 297 1492 320 1485 322 1485 323 1485 337 1492 338 1492 339 1493 448 1493 448 1492 656 1485 703 1483 710 1485 734 1490 735 1490 810 1485 849 1482 929 1485 1102 1490 1104 1490 1213 1483 1214 1470 1211 1443 1115 1433 1114 1433 1112 1433 1111 1433 1097 1442 1051 1436 1050 1436 1049 1436 1032 1442 1011 1433 1010 1433 1008 1433 1007 1433 981 1442 927 1433 926 1433 924 1433 907 1443 848 1443 834 1435 833 1435 832 1435 830 1435 788 1443 760 1437 759 1437 758 1437 666 1443 642 1435 641 1435 510 1435 508 1435 507 1435 487 1443 387 1436 386 1436 384 1436 362 1445 265 1436 264 1436 263 1436 239 1445 206 1436 205 1436 204 1436 145 1445 77 1436 76 1436 75 1436 73 1436 65 1442 66 1473"/>
+                  </Shape>
+                  <String CONTENT="trauen in ihre geiſtigen Fähigkeiten erwerben. Die Univerſitäts⸗"
+                          HPOS="65"
+                          VPOS="1433"
+                          WIDTH="1149"
+                          HEIGHT="60"
+                          WC="0.996201422996819"/>
+               </TextLine>
+               <TextLine ID="eSc_line_fc54fbc3"
+                         TAGREFS="LT357"
+                         BASELINE="67 1522 1215 1525"
+                         HPOS="66"
+                         VPOS="1486"
+                         WIDTH="1149"
+                         HEIGHT="54">
+                  <Shape>
+                     <Polygon POINTS="66 1536 1213 1540 1215 1525 1215 1497 1186 1488 1185 1488 1184 1488 1149 1496 1146 1496 1130 1496 901 1488 899 1488 834 1493 792 1496 773 1492 753 1488 752 1488 730 1492 709 1496 701 1492 696 1488 695 1488 694 1488 693 1488 691 1488 679 1491 661 1496 644 1491 630 1487 629 1487 571 1487 570 1487 569 1487 567 1487 562 1491 553 1496 392 1496 370 1488 365 1487 364 1487 363 1487 354 1488 319 1495 276 1487 270 1487 269 1487 268 1487 265 1487 240 1495 224 1487 223 1487 221 1487 220 1487 219 1487 205 1496 145 1496 122 1487 121 1487 120 1487 119 1487 97 1495 68 1486 67 1522 66 1536"/>
+                  </Shape>
+                  <String CONTENT="kurſe und Examina waren bekannt und gangbare Münze; ein"
+                          HPOS="66"
+                          VPOS="1486"
+                          WIDTH="1149"
+                          HEIGHT="54"
+                          WC="0.9912660761313005"/>
+               </TextLine>
+               <TextLine ID="eSc_line_216c5253"
+                         TAGREFS="LT357"
+                         BASELINE="67 1576 1215 1574"
+                         HPOS="66"
+                         VPOS="1537"
+                         WIDTH="1149"
+                         HEIGHT="58">
+                  <Shape>
+                     <Polygon POINTS="67 1576 67 1587 96 1595 97 1595 99 1595 100 1595 110 1589 172 1595 174 1595 175 1595 176 1595 185 1587 188 1587 198 1589 271 1595 273 1595 274 1595 297 1589 302 1587 307 1589 337 1595 338 1595 416 1589 431 1587 446 1589 515 1595 516 1595 630 1589 679 1586 694 1589 721 1594 723 1594 798 1589 842 1586 868 1589 909 1594 911 1594 950 1589 973 1586 991 1589 1018 1594 1020 1594 1214 1589 1215 1574 1213 1544 1095 1537 1094 1537 1070 1542 1055 1546 1039 1542 1016 1537 1015 1537 1013 1537 993 1542 978 1546 968 1542 955 1537 953 1537 873 1537 872 1537 871 1537 859 1542 851 1546 822 1542 794 1539 793 1539 772 1542 748 1546 740 1542 734 1537 733 1537 731 1537 730 1537 729 1537 719 1542 709 1546 684 1542 664 1539 663 1539 661 1539 652 1542 636 1547 522 1541 461 1539 459 1539 446 1541 419 1549 353 1549 342 1541 338 1539 337 1539 335 1539 334 1539 333 1539 324 1541 303 1547 278 1541 268 1539 266 1539 265 1539 264 1539 255 1541 230 1549 151 1549 130 1541 127 1540 126 1540 125 1540 124 1540 122 1540 122 1541 112 1546 66 1541 67 1576"/>
+                  </Shape>
+                  <String CONTENT="neuer, nach eigener Einſicht und eigenem Urteil eingerichteter"
+                          HPOS="66"
+                          VPOS="1537"
+                          WIDTH="1149"
+                          HEIGHT="58"
+                          WC="0.9905212761894349"/>
+               </TextLine>
+               <TextLine ID="eSc_line_32d22bf2"
+                         TAGREFS="LT357"
+                         BASELINE="66 1629 1213 1629"
+                         HPOS="66"
+                         VPOS="1592"
+                         WIDTH="1147"
+                         HEIGHT="57">
+                  <Shape>
+                     <Polygon POINTS="66 1629 66 1640 139 1648 140 1648 141 1648 164 1641 166 1641 172 1641 241 1648 243 1649 350 1649 350 1648 352 1648 353 1648 359 1643 362 1641 376 1643 416 1648 417 1648 453 1643 470 1641 485 1644 511 1648 512 1648 552 1644 580 1641 597 1644 619 1648 620 1648 818 1645 1179 1641 1211 1648 1213 1629 1213 1603 1170 1592 1169 1592 1097 1601 1094 1601 1092 1601 1066 1592 1065 1592 981 1592 980 1592 978 1592 963 1600 961 1601 950 1600 886 1592 884 1592 883 1592 859 1600 853 1603 803 1603 798 1599 788 1592 787 1592 785 1592 609 1592 607 1592 606 1592 605 1592 597 1598 590 1603 464 1603 457 1598 451 1592 449 1592 448 1592 447 1592 422 1596 394 1603 330 1603 314 1596 304 1592 303 1592 260 1592 259 1592 258 1592 256 1592 253 1595 243 1601 66 1595 66 1629"/>
+                  </Shape>
+                  <String CONTENT="Kurſus würde, auch wenn thatſächlich mehr geleiſtet wurde,"
+                          HPOS="66"
+                          VPOS="1592"
+                          WIDTH="1147"
+                          HEIGHT="57"
+                          WC="0.98101570688445"/>
+               </TextLine>
+               <TextLine ID="eSc_line_d7d0d1ec"
+                         TAGREFS="LT357"
+                         BASELINE="67 1679 1214 1679"
+                         HPOS="67"
+                         VPOS="1644"
+                         WIDTH="1147"
+                         HEIGHT="56">
+                  <Shape>
+                     <Polygon POINTS="67 1679 67 1695 478 1700 480 1700 481 1700 495 1694 497 1694 511 1694 601 1700 602 1700 604 1700 624 1694 629 1693 634 1694 652 1699 654 1699 655 1699 725 1694 744 1693 747 1694 760 1700 762 1700 763 1700 819 1694 919 1700 921 1700 922 1700 947 1693 950 1693 957 1693 1035 1700 1036 1700 1037 1700 1060 1693 1061 1693 1062 1693 1095 1699 1096 1699 1214 1692 1214 1679 1214 1653 1179 1644 1178 1644 1117 1644 1116 1644 1115 1644 1091 1653 1090 1654 1044 1654 1042 1653 1027 1645 1026 1645 1025 1645 957 1651 943 1653 941 1651 926 1644 924 1644 829 1644 828 1644 827 1644 813 1650 809 1653 803 1650 783 1644 782 1644 724 1644 723 1644 721 1644 710 1650 705 1653 663 1650 582 1644 581 1644 580 1644 565 1649 555 1653 528 1649 502 1645 501 1645 459 1649 407 1654 284 1654 270 1648 261 1644 260 1644 174 1644 172 1644 171 1644 165 1646 150 1653 125 1646 116 1644 115 1644 114 1644 67 1645 67 1679"/>
+                  </Shape>
+                  <String CONTENT="keine Anerkennung gefunden haben. Dieſe Anſicht wurde mit be⸗"
+                          HPOS="67"
+                          VPOS="1644"
+                          WIDTH="1147"
+                          HEIGHT="56"
+                          WC="0.9962575377010908"/>
+               </TextLine>
+               <TextLine ID="eSc_line_f0f47027"
+                         TAGREFS="LT357"
+                         BASELINE="68 1730 1215 1733"
+                         HPOS="68"
+                         VPOS="1695"
+                         WIDTH="1147"
+                         HEIGHT="54">
+                  <Shape>
+                     <Polygon POINTS="68 1730 68 1749 1214 1745 1215 1733 1214 1705 1136 1697 1135 1697 1049 1703 1008 1697 1007 1697 1006 1697 991 1704 990 1704 975 1697 973 1697 972 1697 938 1703 931 1704 921 1703 872 1697 871 1697 828 1703 814 1705 729 1705 723 1702 709 1697 708 1697 706 1697 681 1702 670 1704 661 1702 642 1695 641 1695 640 1695 639 1695 624 1702 619 1703 612 1702 587 1695 586 1695 585 1695 584 1695 562 1702 553 1703 543 1702 513 1695 421 1695 419 1695 418 1695 408 1700 403 1703 349 1700 266 1695 265 1695 245 1699 229 1703 68 1699 68 1730"/>
+                  </Shape>
+                  <String CONTENT="ſonderem Eifer von Miß Emily Davies vertreten, die ihr in einem"
+                          HPOS="68"
+                          VPOS="1695"
+                          WIDTH="1147"
+                          HEIGHT="54"
+                          WC="0.9917690716092549"/>
+               </TextLine>
+               <TextLine ID="eSc_line_19164acc"
+                         TAGREFS="LT357"
+                         BASELINE="70 1779 1215 1782"
+                         HPOS="68"
+                         VPOS="1745"
+                         WIDTH="1147"
+                         HEIGHT="56">
+                  <Shape>
+                     <Polygon POINTS="70 1779 68 1794 76 1799 77 1799 78 1799 80 1799 81 1799 112 1792 114 1792 141 1792 186 1792 196 1799 198 1799 199 1799 200 1801 387 1801 387 1799 388 1799 389 1799 399 1792 401 1792 404 1792 437 1799 438 1799 599 1793 640 1792 649 1793 683 1801 684 1801 762 1793 770 1793 778 1793 829 1801 830 1801 956 1794 982 1793 991 1794 1020 1799 1021 1799 1022 1799 1040 1794 1046 1793 1077 1794 1190 1801 1191 1801 1193 1801 1214 1794 1215 1782 1214 1758 1158 1749 1156 1749 1155 1749 1136 1757 1131 1758 1079 1758 1074 1755 1042 1748 1041 1748 1040 1748 1039 1748 1020 1755 1011 1759 975 1759 957 1754 926 1748 924 1748 923 1748 908 1754 898 1759 854 1759 843 1753 834 1749 833 1749 832 1749 830 1749 823 1753 813 1759 750 1759 705 1752 676 1748 675 1748 646 1752 601 1757 589 1750 584 1748 582 1748 581 1748 562 1750 516 1757 505 1749 503 1749 502 1749 418 1747 417 1747 416 1747 414 1747 412 1749 399 1758 290 1758 255 1748 254 1748 253 1748 206 1754 175 1747 174 1747 70 1745 70 1779"/>
+                  </Shape>
+                  <String CONTENT="1866 erſchienenen Buch: the higher education of women"
+                          HPOS="68"
+                          VPOS="1745"
+                          WIDTH="1147"
+                          HEIGHT="56"
+                          WC="0.9992763411323979"/>
+               </TextLine>
+               <TextLine ID="eSc_line_b2bb6632"
+                         TAGREFS="LT357"
+                         BASELINE="67 1834 1214 1832"
+                         HPOS="66"
+                         VPOS="1796"
+                         WIDTH="1148"
+                         HEIGHT="60">
+                  <Shape>
+                     <Polygon POINTS="67 1834 67 1848 342 1846 353 1848 377 1854 378 1854 379 1856 444 1856 444 1854 486 1848 500 1846 515 1848 553 1854 555 1854 587 1847 599 1846 622 1847 685 1853 686 1853 745 1847 775 1844 844 1847 1000 1853 1001 1853 1030 1847 1040 1844 1044 1847 1055 1852 1056 1852 1057 1852 1128 1847 1190 1853 1191 1853 1193 1853 1214 1846 1214 1832 1211 1806 1116 1797 1115 1797 1036 1804 1030 1804 1020 1804 872 1797 871 1797 805 1803 795 1804 793 1803 778 1797 777 1797 775 1797 774 1797 773 1797 763 1803 762 1804 749 1803 693 1798 691 1798 690 1798 673 1803 669 1804 665 1803 646 1796 645 1796 644 1796 642 1796 615 1803 604 1806 587 1803 552 1797 551 1797 550 1797 533 1803 525 1807 476 1807 470 1803 462 1798 461 1798 459 1798 458 1798 457 1798 448 1803 442 1806 409 1802 379 1799 378 1799 360 1802 347 1804 339 1802 324 1797 323 1797 322 1797 66 1801 67 1834"/>
+                  </Shape>
+                  <String CONTENT="beredten Ausdruck gab. Die Schäden der bisherigen Mädchen⸗"
+                          HPOS="66"
+                          VPOS="1796"
+                          WIDTH="1148"
+                          HEIGHT="60"
+                          WC="0.9993956273999708"/>
+               </TextLine>
+               <TextLine ID="eSc_line_b63d5548"
+                         TAGREFS="LT357"
+                         BASELINE="70 1883 1214 1886"
+                         HPOS="68"
+                         VPOS="1847"
+                         WIDTH="1146"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="70 1883 68 1893 83 1903 85 1903 86 1903 87 1905 226 1905 226 1903 228 1903 244 1895 520 1903 521 1903 601 1895 652 1905 654 1905 655 1905 689 1896 856 1905 857 1906 897 1906 897 1905 898 1905 899 1905 912 1896 936 1905 937 1905 938 1905 982 1896 1060 1905 1061 1905 1112 1896 1180 1905 1181 1905 1183 1905 1213 1895 1214 1886 1213 1859 1138 1849 1136 1849 1135 1849 1115 1858 1112 1858 1102 1858 1017 1849 1016 1849 1015 1849 996 1857 991 1858 983 1857 960 1849 958 1849 957 1849 929 1856 918 1859 798 1859 772 1854 744 1849 743 1849 724 1853 701 1858 642 1853 630 1852 629 1852 620 1852 576 1857 562 1852 552 1848 551 1848 550 1848 506 1851 427 1857 414 1851 414 1849 413 1849 412 1849 342 1848 340 1848 339 1848 334 1849 303 1857 265 1848 261 1848 260 1848 259 1848 258 1848 241 1858 195 1858 70 1847 70 1883"/>
+                  </Shape>
+                  <String CONTENT="bildung, die Notwendigkeit einer Anderung, die dazu einzu⸗"
+                          HPOS="68"
+                          VPOS="1847"
+                          WIDTH="1146"
+                          HEIGHT="59"
+                          WC="0.9984893675508171"/>
+               </TextLine>
+               <TextLine ID="eSc_line_47538265"
+                         TAGREFS="LT357"
+                         BASELINE="70 1938 1214 1938"
+                         HPOS="70"
+                         VPOS="1901"
+                         WIDTH="1144"
+                         HEIGHT="59">
+                  <Shape>
+                     <Polygon POINTS="70 1938 70 1958 213 1952 325 1952 337 1958 338 1958 339 1958 340 1958 347 1958 419 1952 503 1952 513 1958 515 1958 516 1958 517 1958 525 1958 571 1952 615 1958 622 1958 624 1958 629 1958 655 1952 684 1957 690 1958 691 1958 708 1957 778 1952 793 1957 795 1958 797 1958 798 1960 897 1960 897 1958 898 1958 902 1957 914 1952 926 1957 928 1958 929 1958 931 1958 942 1957 976 1952 1006 1957 1017 1958 1018 1958 1044 1957 1109 1952 1123 1957 1128 1958 1129 1958 1130 1958 1214 1956 1214 1938 1214 1901 1076 1901 1075 1901 1074 1901 1072 1901 1061 1908 1030 1901 1029 1901 1027 1901 1026 1901 1006 1908 929 1901 927 1901 926 1901 922 1901 819 1910 774 1910 762 1901 760 1901 759 1901 758 1901 757 1901 731 1908 696 1902 695 1902 550 1908 527 1901 526 1901 525 1901 523 1901 518 1901 463 1908 449 1901 448 1901 447 1901 446 1901 444 1901 443 1901 428 1908 407 1901 406 1901 404 1901 403 1901 402 1901 401 1901 387 1910 338 1910 295 1901 290 1901 289 1901 284 1901 246 1908 221 1901 219 1901 218 1901 216 1901 215 1901 214 1901 200 1910 140 1910 125 1901 124 1901 122 1901 121 1901 70 1901 70 1938"/>
+                  </Shape>
+                  <String CONTENT="ſchlagenden Wege finden hier eine ebenſo gründliche als ſtiliſtiſch"
+                          HPOS="70"
+                          VPOS="1901"
+                          WIDTH="1144"
+                          HEIGHT="59"
+                          WC="0.9937905046477247"/>
+               </TextLine>
+               <TextLine ID="eSc_line_0c1539b6"
+                         TAGREFS="LT357"
+                         BASELINE="70 1991 1215 1989"
+                         HPOS="68"
+                         VPOS="1953"
+                         WIDTH="1147"
+                         HEIGHT="68">
+                  <Shape>
+                     <Polygon POINTS="70 1991 72 2021 87 2012 188 2012 204 2020 205 2020 206 2020 208 2020 210 2020 241 2012 263 2019 265 2020 266 2020 268 2020 269 2020 271 2019 283 2014 330 2019 345 2020 347 2020 348 2020 349 2020 350 2019 362 2012 454 2012 511 2017 550 2020 551 2020 592 2016 646 2011 655 2016 661 2019 663 2019 664 2019 665 2019 673 2015 684 2011 699 2015 713 2019 714 2019 715 2020 775 2020 775 2019 777 2019 785 2015 792 2011 808 2014 833 2019 834 2020 952 2020 952 2019 953 2019 967 2012 972 2011 1010 2012 1151 2019 1153 2019 1214 2010 1215 1989 1213 1960 1124 1953 1123 1953 1121 1953 1107 1960 1104 1962 1037 1962 1030 1960 996 1953 995 1953 956 1960 950 1961 946 1960 924 1953 923 1953 922 1953 921 1953 897 1961 893 1962 686 1962 686 1961 665 1953 664 1953 663 1953 661 1953 641 1962 639 1962 622 1962 508 1955 507 1955 506 1955 490 1962 488 1963 391 1963 388 1962 263 1955 261 1955 223 1962 184 1955 182 1955 68 1963 70 1991"/>
+                  </Shape>
+                  <String CONTENT="gewandte Erörterung. Einiges daraus mag nur für engliſche"
+                          HPOS="68"
+                          VPOS="1953"
+                          WIDTH="1147"
+                          HEIGHT="68"
+                          WC="0.999370839512139"/>
+               </TextLine>
+            </TextBlock>
+         </PrintSpace>
+      </Page>
+   </Layout>
+</alto>

From 88a6c5f26f310eea659036492b3a39f1d3fd2c20 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Tue, 3 Dec 2024 17:34:07 +0100
Subject: [PATCH 16/37] =?UTF-8?q?=F0=9F=90=9B=20alto4pandas:=20*Really*=20?=
 =?UTF-8?q?commit=20data=20to=20SQLite=20DB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/alto4pandas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 8c8f934..77e23e2 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -176,7 +176,7 @@ def process(alto_files: List[str], output_file: str):
 
                     # Save
                     insert_into_db(con, "alto_info", d)
-                    con.commit
+                    con.commit()
 
                     if caught_warnings:
                         # PyCharm thinks caught_warnings is not Iterable:

From be1c8609a3fcbf6461c7c4c17d88fa7dbe1ffb04 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 28 May 2025 20:44:12 +0200
Subject: [PATCH 17/37] =?UTF-8?q?=F0=9F=9A=A7=20Check=20dtypes=20(WIP)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 check_dtypes.py

diff --git a/check_dtypes.py b/check_dtypes.py
new file mode 100644
index 0000000..64fe514
--- /dev/null
+++ b/check_dtypes.py
@@ -0,0 +1,57 @@
+import pandas as pd
+import re
+
+
+# Fix
+mods_info = pd.read_parquet("mods_info_df.parquet")
+for c in mods_info.columns:
+    if c.endswith("-count"):
+        mods_info[c] = mods_info[c].astype('Int64')
+
+
+# Tmp to parquet
+mods_info.to_parquet("tmp.parquet")
+mods_info = pd.read_parquet("tmp.parquet")
+
+
+# Check
+EXPECTED_TYPES = {
+        r"mets_file": ("object", ["str"]),
+        r"titleInfo_title": ("object", ["str"]),
+        r"titleInfo_subTitle": ("object", ["str", "NoneType"]),
+        r"titleInfo_partName": ("object", ["str", "NoneType"]),
+        r"identifier-.*": ("object", ["str", "NoneType"]),
+        r"location_.*t ": ("object", ["str", "NoneType"]),
+        r"name\d+_.*": ("object", ["str", "NoneType"]),
+        r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
+        r".*-count": ("Int64", None),
+
+        # XXX possibly sets:
+        r"genre-.*": ("object", ["str", "NoneType"]),
+        r"subject-.*": ("object", ["str", "NoneType"]),
+        r"language_.*Term": ("object", ["str", "NoneType"]),
+}
+def expected_types(c):
+    for r, types in EXPECTED_TYPES.items():
+        if re.fullmatch(r, c):
+            edt = types[0]
+            einner_types = types[1]
+            if einner_types:
+                einner_types = set(einner_types)
+            return edt, einner_types
+    return None, None
+
+for c in mods_info.columns:
+    dt = mods_info.dtypes[c]
+    edt, einner_types = expected_types(c)
+
+    if edt is None:
+        print(f"No expected dtype known for column {c}")
+    elif dt != edt:
+        print(f"Unexpected dtype {dt} for column {c} (expected {edt})")
+
+    if edt == "object":
+        inner_types = set(type(v).__name__ for v in mods_info[c])
+        if any(it not in einner_types for it in inner_types):
+            print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})")
+

From 15f603671c1ef4e640d433ae921043a3052c19fa Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 2 Jun 2025 15:36:35 +0200
Subject: [PATCH 18/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20Fix=20lo?=
 =?UTF-8?q?cation=5F.*?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index 64fe514..991fd34 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -21,7 +21,7 @@ EXPECTED_TYPES = {
         r"titleInfo_subTitle": ("object", ["str", "NoneType"]),
         r"titleInfo_partName": ("object", ["str", "NoneType"]),
         r"identifier-.*": ("object", ["str", "NoneType"]),
-        r"location_.*t ": ("object", ["str", "NoneType"]),
+        r"location_.*": ("object", ["str", "NoneType"]),
         r"name\d+_.*": ("object", ["str", "NoneType"]),
         r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
         r".*-count": ("Int64", None),

From d8d3f12cb0fb6cf3fc17ff4e8f9ba3ac4c5406f2 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 2 Jun 2025 15:36:57 +0200
Subject: [PATCH 19/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20classifi?=
 =?UTF-8?q?cation-.*?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/check_dtypes.py b/check_dtypes.py
index 991fd34..3a24a96 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -30,6 +30,7 @@ EXPECTED_TYPES = {
         r"genre-.*": ("object", ["str", "NoneType"]),
         r"subject-.*": ("object", ["str", "NoneType"]),
         r"language_.*Term": ("object", ["str", "NoneType"]),
+        r"classification-.*": ("object", ["str", "NoneType"]),
 }
 def expected_types(c):
     for r, types in EXPECTED_TYPES.items():

From 383c6b2d3d2bca3233a025e530e50233eaa06d5f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 2 Jun 2025 15:38:37 +0200
Subject: [PATCH 20/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20typeOfRe?=
 =?UTF-8?q?source?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/check_dtypes.py b/check_dtypes.py
index 3a24a96..bae19cd 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -25,6 +25,7 @@ EXPECTED_TYPES = {
         r"name\d+_.*": ("object", ["str", "NoneType"]),
         r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
         r".*-count": ("Int64", None),
+        r"typeOfResource": ("object", ["str", "NoneType"]),
 
         # XXX possibly sets:
         r"genre-.*": ("object", ["str", "NoneType"]),

From f5f2dc05a3ab5c737a27ffd618443ad0b17923b4 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 2 Jun 2025 15:40:43 +0200
Subject: [PATCH 21/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20accessCo?=
 =?UTF-8?q?ndition-.*?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/check_dtypes.py b/check_dtypes.py
index bae19cd..082d445 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -26,6 +26,7 @@ EXPECTED_TYPES = {
         r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
         r".*-count": ("Int64", None),
         r"typeOfResource": ("object", ["str", "NoneType"]),
+        r"accessCondition-.*": ("object", ["str", "NoneType"]),
 
         # XXX possibly sets:
         r"genre-.*": ("object", ["str", "NoneType"]),

From ff39da49e875d33a5a87b03233c07e534502dae3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 2 Jun 2025 15:43:50 +0200
Subject: [PATCH 22/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20originIn?=
 =?UTF-8?q?fo-.*?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/check_dtypes.py b/check_dtypes.py
index 082d445..26a38d9 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -27,6 +27,7 @@ EXPECTED_TYPES = {
         r".*-count": ("Int64", None),
         r"typeOfResource": ("object", ["str", "NoneType"]),
         r"accessCondition-.*": ("object", ["str", "NoneType"]),
+        r"originInfo-.*": ("object", ["str", "NoneType"]),
 
         # XXX possibly sets:
         r"genre-.*": ("object", ["str", "NoneType"]),

From bec59242a0d38627cc2be7a9d55a2972d8330b67 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Mon, 2 Jun 2025 15:44:11 +0200
Subject: [PATCH 23/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20group=20?=
 =?UTF-8?q?by=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index 26a38d9..3024d9b 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -24,11 +24,12 @@ EXPECTED_TYPES = {
         r"location_.*": ("object", ["str", "NoneType"]),
         r"name\d+_.*": ("object", ["str", "NoneType"]),
         r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
-        r".*-count": ("Int64", None),
         r"typeOfResource": ("object", ["str", "NoneType"]),
         r"accessCondition-.*": ("object", ["str", "NoneType"]),
         r"originInfo-.*": ("object", ["str", "NoneType"]),
 
+        r".*-count": ("Int64", None),
+
         # XXX possibly sets:
         r"genre-.*": ("object", ["str", "NoneType"]),
         r"subject-.*": ("object", ["str", "NoneType"]),

From 8bc443f9fb1e4db7846781c19fe6aba922d8213d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 4 Jun 2025 19:05:34 +0200
Subject: [PATCH 24/37] =?UTF-8?q?=F0=9F=8E=A8=20Install=20mypy=20and=20typ?=
 =?UTF-8?q?e=20stubs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README-DEV.md         | 2 +-
 requirements-dev.txt  | 7 +++++++
 requirements-test.txt | 2 --
 3 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 requirements-dev.txt
 delete mode 100644 requirements-test.txt

diff --git a/README-DEV.md b/README-DEV.md
index 134e784..33da234 100644
--- a/README-DEV.md
+++ b/README-DEV.md
@@ -1,5 +1,5 @@
 ```
-pip install -r requirements-test.txt
+pip install -r requirements-dev.txt
 ```
 
 To run tests:
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..5020dd0
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,7 @@
+pytest
+pytest-profiling
+
+mypy
+types-lxml
+types-tqdm
+pandas-stubs
diff --git a/requirements-test.txt b/requirements-test.txt
deleted file mode 100644
index 6f0f369..0000000
--- a/requirements-test.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-pytest
-pytest-profiling

From 14172e3b8183a5be66cdec6988949ec5ea253c44 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 4 Jun 2025 20:32:07 +0200
Subject: [PATCH 25/37] =?UTF-8?q?=F0=9F=9A=A7=20Save=20Python=20types=20fo?=
 =?UTF-8?q?r=20later=20conversion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 8a65901..082ed9a 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -315,7 +315,8 @@ def column_names_csv(columns):
     """
     return ",".join('"' + c + '"' for c in columns)
 
-current_columns = defaultdict(list)
+current_columns: defaultdict = defaultdict(list)
+current_columns_types: dict[dict] = defaultdict(dict)
 
 def insert_into_db(con, table, d: Dict):
     """Insert the values from the dict into the table, creating columns if necessary"""
@@ -334,6 +335,11 @@ def insert_into_db(con, table, d: Dict):
             current_columns[table].append(k)
             con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"')
 
+    # Save types
+    for k in d.keys():
+        if k not in current_columns_types[table]:
+            current_columns_types[table][k] = type(d[k]).__name__
+
     # Insert
     # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
     # have use qmark style here.

From ebe988cfff18b7849551aa7e1acc6ac26fd2d21f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 4 Jun 2025 21:10:10 +0200
Subject: [PATCH 26/37] =?UTF-8?q?=F0=9F=9A=A7=20Restore=20types=20before?=
 =?UTF-8?q?=20saving=20as=20Parquet?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py                | 10 ----------
 src/mods4pandas/alto4pandas.py |  5 ++---
 src/mods4pandas/lib.py         | 21 +++++++++++++++++++++
 src/mods4pandas/mods4pandas.py | 11 +++--------
 4 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index 3024d9b..cbdfd70 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -2,17 +2,7 @@ import pandas as pd
 import re
 
 
-# Fix
 mods_info = pd.read_parquet("mods_info_df.parquet")
-for c in mods_info.columns:
-    if c.endswith("-count"):
-        mods_info[c] = mods_info[c].astype('Int64')
-
-
-# Tmp to parquet
-mods_info.to_parquet("tmp.parquet")
-mods_info = pd.read_parquet("tmp.parquet")
-
 
 # Check
 EXPECTED_TYPES = {
diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 77e23e2..0739f35 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -19,7 +19,7 @@ import pandas as pd
 import numpy as np
 from tqdm import tqdm
 
-from .lib import TagGroup, sorted_groupby, flatten, ns, insert_into_db
+from .lib import TagGroup, convert_db_to_parquet, sorted_groupby, flatten, ns, insert_into_db
 
 
 logger = logging.getLogger('alto4pandas')
@@ -188,9 +188,8 @@ def process(alto_files: List[str], output_file: str):
                 import traceback; traceback.print_exc()
 
     # Convert the alto_info SQL to a pandas DataFrame
-    alto_info_df = pd.read_sql_query("SELECT * FROM alto_info", con, index_col="alto_file")
     logger.info('Writing DataFrame to {}'.format(output_file))
-    alto_info_df.to_parquet(output_file)
+    convert_db_to_parquet(con, "alto_info", "alto_file", output_file)
 
 
 def main():
diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 082ed9a..32f717a 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -355,3 +355,24 @@ def insert_into_db(con, table, d: Dict):
 def insert_into_db_multiple(con, table, ld: List[Dict]):
     for d in ld:
         insert_into_db(con, table, d)
+
+def convert_db_to_parquet(con, table, index_col, output_file):
+    df = pd.read_sql_query(f"SELECT * FROM {table}", con, index_col)
+
+    # Convert Python column type into Pandas type
+    for c in df.columns:
+        column_type = current_columns_types[table][c]
+
+        if column_type == "str":
+            continue
+        elif column_type == "int":
+            df[c] = df[c].astype("Int64")
+        elif column_type == "float64":
+            df[c] = df[c].astype("Float64")
+        elif column_type == "set":
+            # TODO WIP
+            continue
+        else:
+            raise NotImplementedError(f"Column type {column_type} not implemented yet.")
+
+    df.to_parquet(output_file)
\ No newline at end of file
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 30d7c22..2da7c80 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -18,7 +18,7 @@ import click
 import pandas as pd
 from tqdm import tqdm
 
-from .lib import sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple
+from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types
 
 
 
@@ -457,16 +457,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
             except Exception as e:
                 logger.exception('Exception in {}'.format(mets_file))
 
-    # Convert the mods_info SQL to a pandas DataFrame
-    mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier")
     logger.info('Writing DataFrame to {}'.format(output_file))
-    mods_info_df.to_parquet(output_file)
-
+    convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
     if output_page_info:
-          # Convert page_info SQL to a pandas DataFrama
-          page_info_df = pd.read_sql_query("SELECT * FROM page_info", con_page_info, index_col=["ppn", "ID"])
           logger.info('Writing DataFrame to {}'.format(output_page_info))
-          page_info_df.to_parquet(output_page_info)
+          convert_db_to_parquet(con_page_info, "page_info", ["ppn", "ID"], output_page_info)
 
 
 def main():

From 44550ff926400410907d70d35b6a27b3323d1b61 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 14:30:06 +0200
Subject: [PATCH 27/37] =?UTF-8?q?=F0=9F=A4=93=20requirements-dev:=20add=20?=
 =?UTF-8?q?ipython?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements-dev.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 5020dd0..e63c022 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,8 @@
 pytest
 pytest-profiling
 
+ipython
+
 mypy
 types-lxml
 types-tqdm

From 580442a4c9430964cd2ee07c16301cc737834667 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 14:36:29 +0200
Subject: [PATCH 28/37] =?UTF-8?q?=F0=9F=A4=93=20Add=20type=20annotations?=
 =?UTF-8?q?=20(and=20related=20changes)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py         | 55 ++++++++++++++++++----------------
 src/mods4pandas/mods4pandas.py |  4 +--
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 32f717a..cff8ea9 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from itertools import groupby
 import re
 import warnings
@@ -24,40 +26,40 @@ ns = {
 class TagGroup:
     """Helper class to simplify the parsing and checking of MODS metadata"""
 
-    def __init__(self, tag, group: List[ET.Element]):
+    def __init__(self, tag, group: List[ET._Element]):
         self.tag = tag
         self.group = group
 
-    def to_xml(self):
+    def to_xml(self) -> str:
         return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
 
-    def __str__(self):
+    def __str__(self) -> str:
         return f"TagGroup with content:\n{self.to_xml()}"
 
-    def is_singleton(self):
+    def is_singleton(self) -> TagGroup:
         if len(self.group) != 1:
             raise ValueError('More than one instance: {}'.format(self))
         return self
 
-    def has_no_attributes(self):
+    def has_no_attributes(self) -> TagGroup:
         return self.has_attributes({})
 
-    def has_attributes(self, attrib):
+    def has_attributes(self, attrib) -> TagGroup:
         if not isinstance(attrib, Sequence):
             attrib = [attrib]
         if not all(e.attrib in attrib for e in self.group):
             raise ValueError('One or more element has unexpected attributes: {}'.format(self))
         return self
 
-    def ignore_attributes(self):
+    def ignore_attributes(self) -> TagGroup:
         # This serves as documentation for now.
         return self
 
-    def sort(self, key=None, reverse=False):
+    def sort(self, key=None, reverse=False) -> TagGroup:
         self.group = sorted(self.group, key=key, reverse=reverse)
         return self
 
-    def text(self, separator='\n'):
+    def text(self, separator='\n') -> str:
         t = ''
         for e in self.group:
             if t != '':
@@ -66,13 +68,13 @@ class TagGroup:
                 t += e.text
         return t
 
-    def text_set(self):
+    def text_set(self) -> set:
         return {e.text for e in self.group}
 
-    def descend(self, raise_errors):
+    def descend(self, raise_errors) -> dict:
         return _to_dict(self.is_singleton().group[0], raise_errors)
 
-    def filter(self, cond, warn=None):
+    def filter(self, cond, warn=None) -> TagGroup:
         new_group = []
         for e in self.group:
             if cond(e):
@@ -82,7 +84,7 @@ class TagGroup:
                     warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
         return TagGroup(self.tag, new_group)
 
-    def force_singleton(self, warn=True):
+    def force_singleton(self, warn=True) -> TagGroup:
         if len(self.group) == 1:
             return self
         else:
@@ -93,7 +95,7 @@ class TagGroup:
     RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
     RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
 
-    def fix_date(self):
+    def fix_date(self) -> TagGroup:
 
         for e in self.group:
             if e.attrib.get('encoding') == 'w3cdtf':
@@ -103,6 +105,9 @@ class TagGroup:
 
         new_group = []
         for e in self.group:
+            if e.text is None:
+                warnings.warn('Empty date')
+                continue
             if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
                 new_group.append(e)
             elif re.match(self.RE_ISO8601_DATE, e.text):
@@ -131,7 +136,7 @@ class TagGroup:
 
         return self
 
-    def fix_event_type(self):
+    def fix_event_type(self) -> TagGroup:
         # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
         # Fix this for special cases.
 
@@ -161,7 +166,7 @@ class TagGroup:
                     pass
         return self
 
-    def fix_script_term(self):
+    def fix_script_term(self) -> TagGroup:
         for e in self.group:
             # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
             if e.attrib['authority'] == 'ISO15924':
@@ -169,7 +174,7 @@ class TagGroup:
                 warnings.warn('Changed scriptTerm authority to lower case')
         return self
 
-    def merge_sub_tags_to_set(self):
+    def merge_sub_tags_to_set(self) -> dict:
         from .mods4pandas import mods_to_dict
         value = {}
 
@@ -189,7 +194,7 @@ class TagGroup:
             value[sub_tag] = s
         return value
 
-    def attributes(self):
+    def attributes(self) -> dict[str, str]:
         """
         Return a merged dict of all attributes of the tag group.
 
@@ -204,7 +209,7 @@ class TagGroup:
                 attrib[a_localname] = v
         return attrib
 
-    def subelement_counts(self):
+    def subelement_counts(self) -> dict[str, int]:
         counts = {}
         for e in self.group:
             for x in e.iter():
@@ -213,7 +218,7 @@ class TagGroup:
                 counts[key] = counts.get(key, 0) + 1
         return counts
 
-    def xpath_statistics(self, xpath_expr, namespaces):
+    def xpath_statistics(self, xpath_expr, namespaces) -> dict[str, float]:
         """
         Extract values and calculate statistics
 
@@ -235,7 +240,7 @@ class TagGroup:
             statistics[f'{xpath_expr}-max'] = np.max(values)
         return statistics
 
-    def xpath_count(self, xpath_expr, namespaces):
+    def xpath_count(self, xpath_expr, namespaces) -> dict[str, int]:
         """
         Count all elements matching xpath_expr
         """
@@ -279,7 +284,7 @@ def _to_dict(root, raise_errors):
         raise ValueError(f"Unknown namespace {root_name.namespace}")
 
 
-def flatten(d: MutableMapping, parent='', separator='_'):
+def flatten(d: MutableMapping, parent='', separator='_') -> dict:
     """
     Flatten the given nested dict.
 
@@ -301,13 +306,13 @@ def flatten(d: MutableMapping, parent='', separator='_'):
     return dict(items)
 
 
-def valid_column_key(k):
-    if re.match("^[a-zA-Z0-9 _@/:\[\]-]+$", k):
+def valid_column_key(k) -> bool:
+    if re.match(r'^[a-zA-Z0-9 _@/:\[\]-]+$', k):
         return True
     else:
         return False
 
-def column_names_csv(columns):
+def column_names_csv(columns) -> str:
     """
     Format Column names (identifiers) as a comma-separated list.
 
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 2da7c80..ea6a49f 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -376,7 +376,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
               default='mods_info_df.parquet', show_default=True)
 @click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
-def process(mets_files: List[str], output_file: str, output_page_info: str):
+def process(mets_files: list[str], output_file: str, output_page_info: str):
     """
     A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
 
@@ -389,7 +389,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
     """
 
     # Extend file list if directories are given
-    mets_files_real = []
+    mets_files_real: list[str] = []
     for m in mets_files:
         if os.path.isdir(m):
             logger.info('Scanning directory {}'.format(m))

From 62b93c760ba6d98ae7f00e3eba40cec9ebaef4b6 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 14:56:26 +0200
Subject: [PATCH 29/37] =?UTF-8?q?=F0=9F=A4=93=20Add=20type=20annotations?=
 =?UTF-8?q?=20(and=20related=20changes)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index cff8ea9..44f1400 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -114,9 +114,8 @@ class TagGroup:
                 warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
                 e.attrib['encoding'] = 'iso8601'
                 new_group.append(e)
-            elif re.match(self.RE_GERMAN_DATE, e.text):
+            elif m := re.match(self.RE_GERMAN_DATE, e.text):
                 warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
-                m = re.match(self.RE_GERMAN_DATE, e.text)
                 e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
                 e.attrib['encoding'] = 'iso8601'
                 new_group.append(e)
@@ -210,7 +209,7 @@ class TagGroup:
         return attrib
 
     def subelement_counts(self) -> dict[str, int]:
-        counts = {}
+        counts: dict[str, int] = {}
         for e in self.group:
             for x in e.iter():
                 tag = ET.QName(x.tag).localname

From e4db150cbaf736aa43b2cbd2c08269e4c72bc9b3 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 15:49:09 +0200
Subject: [PATCH 30/37] =?UTF-8?q?=E2=9A=99=20=20vscode:=20Enable=20pytest?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .vscode/settings.json | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index de288e1..74a2cbb 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,8 @@
 {
-    "python.formatting.provider": "black"
+    "python.formatting.provider": "black",
+    "python.testing.pytestArgs": [
+        "."
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
 }
\ No newline at end of file

From 5384e18ab5d24e7db6d9031f5a8e241c8d506cc7 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 16:18:33 +0200
Subject: [PATCH 31/37] =?UTF-8?q?=F0=9F=9A=A7=20check=5Fdtypes:=20Check=20?=
 =?UTF-8?q?alto=5Finfo=20types?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index cbdfd70..cf3da08 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -3,9 +3,13 @@ import re
 
 
 mods_info = pd.read_parquet("mods_info_df.parquet")
+alto_info = pd.read_parquet("alto_info_df.parquet")
 
 # Check
 EXPECTED_TYPES = {
+
+        # mods_info
+
         r"mets_file": ("object", ["str"]),
         r"titleInfo_title": ("object", ["str"]),
         r"titleInfo_subTitle": ("object", ["str", "NoneType"]),
@@ -25,6 +29,19 @@ EXPECTED_TYPES = {
         r"subject-.*": ("object", ["str", "NoneType"]),
         r"language_.*Term": ("object", ["str", "NoneType"]),
         r"classification-.*": ("object", ["str", "NoneType"]),
+
+        # alto_info
+
+        r"Description_.*": ("object", ["str", "NoneType"]),
+        r"Layout_Page_ID": ("object", ["str", "NoneType"]),
+        r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]),
+        r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]),
+        r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]),
+        r"Layout_Page_//alto:String/@WC-.*": ("Float64", None),
+        r"alto_xmlns": ("object", ["str", "NoneType"]),
+
+        # XXX r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
+        r"Layout_Page_(WIDTH|HEIGHT)": ("object", ["str", "NoneType"]),
 }
 def expected_types(c):
     for r, types in EXPECTED_TYPES.items():
@@ -36,17 +53,21 @@ def expected_types(c):
             return edt, einner_types
     return None, None
 
-for c in mods_info.columns:
-    dt = mods_info.dtypes[c]
-    edt, einner_types = expected_types(c)
+def check_types(df):
+    for c in df.columns:
+        dt = df.dtypes[c]
+        edt, einner_types = expected_types(c)
 
-    if edt is None:
-        print(f"No expected dtype known for column {c}")
-    elif dt != edt:
-        print(f"Unexpected dtype {dt} for column {c} (expected {edt})")
+        if edt is None:
+            print(f"No expected dtype known for column {c}")
+        elif dt != edt:
+            print(f"Unexpected dtype {dt} for column {c} (expected {edt})")
 
-    if edt == "object":
-        inner_types = set(type(v).__name__ for v in mods_info[c])
-        if any(it not in einner_types for it in inner_types):
-            print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})")
+        if edt == "object":
+            inner_types = set(type(v).__name__ for v in df[c])
+            if any(it not in einner_types for it in inner_types):
+                print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})")
+
+check_types(mods_info)
+check_types(alto_info)
 

From a20c979351722f1a813a2c084b2d06aadced2e3f Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 17:20:28 +0200
Subject: [PATCH 32/37] =?UTF-8?q?=F0=9F=A7=B9=20Filter=20annoying=20UserWa?=
 =?UTF-8?q?rning=20on=20every=20pandas=20import=20(on=20WSL)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py                | 9 ++++++++-
 src/mods4pandas/alto4pandas.py | 7 ++++++-
 src/mods4pandas/lib.py         | 7 ++++++-
 src/mods4pandas/mods4pandas.py | 6 +++++-
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index cf3da08..502e4bb 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -1,5 +1,12 @@
-import pandas as pd
 import re
+import warnings
+import os
+
+with warnings.catch_warnings():
+    # Filter warnings on WSL
+    if "Microsoft" in os.uname().release:
+        warnings.simplefilter("ignore")
+    import pandas as pd
 
 
 mods_info = pd.read_parquet("mods_info_df.parquet")
diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 0739f35..668d7f3 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -15,12 +15,17 @@ from typing import List
 from collections.abc import MutableMapping, Sequence
 
 import click
-import pandas as pd
 import numpy as np
 from tqdm import tqdm
 
 from .lib import TagGroup, convert_db_to_parquet, sorted_groupby, flatten, ns, insert_into_db
 
+with warnings.catch_warnings():
+    # Filter warnings on WSL
+    if "Microsoft" in os.uname().release:
+        warnings.simplefilter("ignore")
+    import pandas as pd
+
 
 logger = logging.getLogger('alto4pandas')
 
diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 44f1400..ab01fce 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -3,13 +3,18 @@ from __future__ import annotations
 from itertools import groupby
 import re
 import warnings
+import os
 from typing import List, Sequence, MutableMapping, Dict
 from collections import defaultdict
 
-import pandas as pd
 import numpy as np
 from lxml import etree as ET
 
+with warnings.catch_warnings():
+    # Filter warnings on WSL
+    if "Microsoft" in os.uname().release:
+        warnings.simplefilter("ignore")
+    import pandas as pd
 
 __all__ = ["ns"]
 
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index ea6a49f..2d80c33 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -15,11 +15,15 @@ from collections import defaultdict
 from collections.abc import MutableMapping, Sequence
 
 import click
-import pandas as pd
 from tqdm import tqdm
 
 from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types
 
+with warnings.catch_warnings():
+    # Filter warnings on WSL
+    if "Microsoft" in os.uname().release:
+        warnings.simplefilter("ignore")
+    import pandas as pd
 
 
 logger = logging.getLogger('mods4pandas')

From 64ed7298da3097257ef1f0d9b9d6ef328ea00741 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 19:13:38 +0200
Subject: [PATCH 33/37] =?UTF-8?q?=E2=9C=A8=20Make=20Layout=5FPage=5FWIDTH/?=
 =?UTF-8?q?HEIGHT=20integer=20values?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py                | 3 +--
 src/mods4pandas/alto4pandas.py | 6 ++++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index 502e4bb..946c5fe 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -47,8 +47,7 @@ EXPECTED_TYPES = {
         r"Layout_Page_//alto:String/@WC-.*": ("Float64", None),
         r"alto_xmlns": ("object", ["str", "NoneType"]),
 
-        # XXX r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
-        r"Layout_Page_(WIDTH|HEIGHT)": ("object", ["str", "NoneType"]),
+        r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
 }
 def expected_types(c):
     for r, types in EXPECTED_TYPES.items():
diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 668d7f3..1d7b748 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -89,6 +89,12 @@ def alto_to_dict(alto, raise_errors=True):
         elif localname == 'Page':
             value[localname] = {}
             value[localname].update(TagGroup(tag, group).is_singleton().attributes())
+            for attr in ("WIDTH", "HEIGHT"):
+                if attr in value[localname]:
+                    try:
+                        value[localname][attr] = int(value[localname][attr])
+                    except ValueError:
+                        del value[localname][attr]
             value[localname].update(TagGroup(tag, group).subelement_counts())
             value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
 

From d685454c5260e51deaf96c4671ca600201485d50 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Wed, 11 Jun 2025 20:41:13 +0200
Subject: [PATCH 34/37] =?UTF-8?q?=E2=9C=A8=20page=5Finfo:=20Use=20boolean?=
 =?UTF-8?q?=20for=20indicator=20variable,=20str=20for=20hrefs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py                | 10 +++++++++-
 src/mods4pandas/lib.py         |  4 +++-
 src/mods4pandas/mods4pandas.py |  4 +++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index 946c5fe..5925b48 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -10,6 +10,7 @@ with warnings.catch_warnings():
 
 
 mods_info = pd.read_parquet("mods_info_df.parquet")
+page_info = pd.read_parquet("page_info_df.parquet")
 alto_info = pd.read_parquet("alto_info_df.parquet")
 
 # Check
@@ -37,6 +38,11 @@ EXPECTED_TYPES = {
         r"language_.*Term": ("object", ["str", "NoneType"]),
         r"classification-.*": ("object", ["str", "NoneType"]),
 
+        # page_info
+
+        r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]),
+        r"structMap-LOGICAL_TYPE_.*": ("boolean", None),
+
         # alto_info
 
         r"Description_.*": ("object", ["str", "NoneType"]),
@@ -49,6 +55,7 @@ EXPECTED_TYPES = {
 
         r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
 }
+
 def expected_types(c):
     for r, types in EXPECTED_TYPES.items():
         if re.fullmatch(r, c):
@@ -65,7 +72,7 @@ def check_types(df):
         edt, einner_types = expected_types(c)
 
         if edt is None:
-            print(f"No expected dtype known for column {c}")
+            print(f"No expected dtype known for column {c} (got {dt})")
         elif dt != edt:
             print(f"Unexpected dtype {dt} for column {c} (expected {edt})")
 
@@ -75,5 +82,6 @@ def check_types(df):
                 print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})")
 
 check_types(mods_info)
+check_types(page_info)
 check_types(alto_info)
 
diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index ab01fce..a0646fb 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -378,10 +378,12 @@ def convert_db_to_parquet(con, table, index_col, output_file):
             df[c] = df[c].astype("Int64")
         elif column_type == "float64":
             df[c] = df[c].astype("Float64")
+        elif column_type == "bool":
+            df[c] = df[c].map({"True": True, "False": False}).astype("boolean")
         elif column_type == "set":
             # TODO WIP
             continue
         else:
-            raise NotImplementedError(f"Column type {column_type} not implemented yet.")
+            raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
 
     df.to_parquet(output_file)
\ No newline at end of file
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 2d80c33..7d45b47 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -327,6 +327,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
             assert file_ is not None
             fileGrp_USE = file_.getparent().attrib.get("USE")
             file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
+            if file_FLocat_href is not None:
+                file_FLocat_href = str(file_FLocat_href)
             page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
 
         def get_struct_log(*, to_phys):
@@ -368,7 +370,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
         for struct_div in struct_divs:
             type_ = struct_div.attrib.get("TYPE").lower()
             assert type_
-            page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1
+            page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = True
 
         result.append(page_dict)
 

From ebdded90d6162aa8de6e240059670899887f535d Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Jun 2025 07:02:23 +0200
Subject: [PATCH 35/37] =?UTF-8?q?=F0=9F=A4=93=20Add=20type=20annotations?=
 =?UTF-8?q?=20(and=20related=20changes)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/mods4pandas/lib.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index a0646fb..4d85a9e 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -4,7 +4,7 @@ from itertools import groupby
 import re
 import warnings
 import os
-from typing import List, Sequence, MutableMapping, Dict
+from typing import Any, List, Sequence, MutableMapping, Dict
 from collections import defaultdict
 
 import numpy as np
@@ -229,12 +229,14 @@ class TagGroup:
         Extract values using the given XPath expression, convert them to float and return descriptive
         statistics on the values.
         """
-        values = []
-        for e in self.group:
-            r = e.xpath(xpath_expr, namespaces=namespaces)
-            values += r
-        values = np.array([float(v) for v in values])
+        def xpath_values():
+            values = []
+            for e in self.group:
+                r = e.xpath(xpath_expr, namespaces=namespaces)
+                values += r
+            return np.array([float(v) for v in values])
 
+        values = xpath_values()
         statistics = {}
         if values.size > 0:
             statistics[f'{xpath_expr}-mean'] = np.mean(values)
@@ -294,7 +296,7 @@ def flatten(d: MutableMapping, parent='', separator='_') -> dict:
 
     It is assumed that d maps strings to either another dictionary (similarly structured) or some other value.
     """
-    items = []
+    items: list[Any] = []
 
     for k, v in d.items():
         if parent:
@@ -324,8 +326,8 @@ def column_names_csv(columns) -> str:
     """
     return ",".join('"' + c + '"' for c in columns)
 
-current_columns: defaultdict = defaultdict(list)
-current_columns_types: dict[dict] = defaultdict(dict)
+current_columns: dict[str, list] = defaultdict(list)
+current_columns_types: dict[str, dict] = defaultdict(dict)
 
 def insert_into_db(con, table, d: Dict):
     """Insert the values from the dict into the table, creating columns if necessary"""

From 215bfbb11fe816595fd3f8d637ba7458ae20e01e Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Jun 2025 07:45:22 +0200
Subject: [PATCH 36/37] =?UTF-8?q?=E2=9C=A8=20Represent=20sets=20as=20array?=
 =?UTF-8?q?s=20in=20the=20Parquet=20file?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py        | 10 +++++-----
 src/mods4pandas/lib.py |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/check_dtypes.py b/check_dtypes.py
index 5925b48..b5736df 100644
--- a/check_dtypes.py
+++ b/check_dtypes.py
@@ -24,6 +24,7 @@ EXPECTED_TYPES = {
         r"titleInfo_partName": ("object", ["str", "NoneType"]),
         r"identifier-.*": ("object", ["str", "NoneType"]),
         r"location_.*": ("object", ["str", "NoneType"]),
+        r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]),
         r"name\d+_.*": ("object", ["str", "NoneType"]),
         r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
         r"typeOfResource": ("object", ["str", "NoneType"]),
@@ -32,11 +33,10 @@ EXPECTED_TYPES = {
 
         r".*-count": ("Int64", None),
 
-        # XXX possibly sets:
-        r"genre-.*": ("object", ["str", "NoneType"]),
-        r"subject-.*": ("object", ["str", "NoneType"]),
-        r"language_.*Term": ("object", ["str", "NoneType"]),
-        r"classification-.*": ("object", ["str", "NoneType"]),
+        r"genre-.*": ("object", ["ndarray", "NoneType"]),
+        r"subject-.*": ("object", ["ndarray", "NoneType"]),
+        r"language_.*Term": ("object", ["ndarray", "NoneType"]),
+        r"classification-.*": ("object", ["ndarray", "NoneType"]),
 
         # page_info
 
diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 4d85a9e..68050b1 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import ast
 from itertools import groupby
 import re
 import warnings
@@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file):
         elif column_type == "bool":
             df[c] = df[c].map({"True": True, "False": False}).astype("boolean")
         elif column_type == "set":
-            # TODO WIP
-            continue
+            df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
         else:
             raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
 

From ac8740c33fba027199699837c14b14a2f5639491 Mon Sep 17 00:00:00 2001
From: Mike Gerber <mike.gerber@sbb.spk-berlin.de>
Date: Thu, 12 Jun 2025 09:42:29 +0200
Subject: [PATCH 37/37] =?UTF-8?q?=E2=9C=94=20=20Test=20if=20dtypes=20are?=
 =?UTF-8?q?=20as=20expected=20in=20produced=20Parquet=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 check_dtypes.py                           | 87 -----------------------
 src/mods4pandas/alto4pandas.py            |  5 +-
 src/mods4pandas/mods4pandas.py            |  6 +-
 src/mods4pandas/tests/test_alto.py        | 53 +++++++++++++-
 src/mods4pandas/tests/test_mods4pandas.py | 71 +++++++++++++++++-
 5 files changed, 130 insertions(+), 92 deletions(-)
 delete mode 100644 check_dtypes.py

diff --git a/check_dtypes.py b/check_dtypes.py
deleted file mode 100644
index b5736df..0000000
--- a/check_dtypes.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import re
-import warnings
-import os
-
-with warnings.catch_warnings():
-    # Filter warnings on WSL
-    if "Microsoft" in os.uname().release:
-        warnings.simplefilter("ignore")
-    import pandas as pd
-
-
-mods_info = pd.read_parquet("mods_info_df.parquet")
-page_info = pd.read_parquet("page_info_df.parquet")
-alto_info = pd.read_parquet("alto_info_df.parquet")
-
-# Check
-EXPECTED_TYPES = {
-
-        # mods_info
-
-        r"mets_file": ("object", ["str"]),
-        r"titleInfo_title": ("object", ["str"]),
-        r"titleInfo_subTitle": ("object", ["str", "NoneType"]),
-        r"titleInfo_partName": ("object", ["str", "NoneType"]),
-        r"identifier-.*": ("object", ["str", "NoneType"]),
-        r"location_.*": ("object", ["str", "NoneType"]),
-        r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]),
-        r"name\d+_.*": ("object", ["str", "NoneType"]),
-        r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
-        r"typeOfResource": ("object", ["str", "NoneType"]),
-        r"accessCondition-.*": ("object", ["str", "NoneType"]),
-        r"originInfo-.*": ("object", ["str", "NoneType"]),
-
-        r".*-count": ("Int64", None),
-
-        r"genre-.*": ("object", ["ndarray", "NoneType"]),
-        r"subject-.*": ("object", ["ndarray", "NoneType"]),
-        r"language_.*Term": ("object", ["ndarray", "NoneType"]),
-        r"classification-.*": ("object", ["ndarray", "NoneType"]),
-
-        # page_info
-
-        r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]),
-        r"structMap-LOGICAL_TYPE_.*": ("boolean", None),
-
-        # alto_info
-
-        r"Description_.*": ("object", ["str", "NoneType"]),
-        r"Layout_Page_ID": ("object", ["str", "NoneType"]),
-        r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]),
-        r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]),
-        r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]),
-        r"Layout_Page_//alto:String/@WC-.*": ("Float64", None),
-        r"alto_xmlns": ("object", ["str", "NoneType"]),
-
-        r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
-}
-
-def expected_types(c):
-    for r, types in EXPECTED_TYPES.items():
-        if re.fullmatch(r, c):
-            edt = types[0]
-            einner_types = types[1]
-            if einner_types:
-                einner_types = set(einner_types)
-            return edt, einner_types
-    return None, None
-
-def check_types(df):
-    for c in df.columns:
-        dt = df.dtypes[c]
-        edt, einner_types = expected_types(c)
-
-        if edt is None:
-            print(f"No expected dtype known for column {c} (got {dt})")
-        elif dt != edt:
-            print(f"Unexpected dtype {dt} for column {c} (expected {edt})")
-
-        if edt == "object":
-            inner_types = set(type(v).__name__ for v in df[c])
-            if any(it not in einner_types for it in inner_types):
-                print(f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})")
-
-check_types(mods_info)
-check_types(page_info)
-check_types(alto_info)
-
diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py
index 1d7b748..359a26e 100755
--- a/src/mods4pandas/alto4pandas.py
+++ b/src/mods4pandas/alto4pandas.py
@@ -138,7 +138,7 @@ def walk(m):
 @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
 @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
               default='alto_info_df.parquet', show_default=True)
-def process(alto_files: List[str], output_file: str):
+def process_command(alto_files: List[str], output_file: str):
     """
     A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
 
@@ -151,6 +151,9 @@ def process(alto_files: List[str], output_file: str):
     - and a CSV file with all conversion warnings.
     """
 
+    process(alto_files, output_file)
+
+def process(alto_files: List[str], output_file: str):
     # Extend file list if directories are given
     alto_files_real = []
     for m in alto_files:
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 7d45b47..669c1e0 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -382,7 +382,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
               default='mods_info_df.parquet', show_default=True)
 @click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
-def process(mets_files: list[str], output_file: str, output_page_info: str):
+def process_command(mets_files: list[str], output_file: str, output_page_info: str):
     """
     A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
 
@@ -393,7 +393,9 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
 
     Per-page information (e.g. structure information) can be output to a separate Parquet file.
     """
+    process(mets_files, output_file, output_page_info)
 
+def process(mets_files: list[str], output_file: str, output_page_info: str):
     # Extend file list if directories are given
     mets_files_real: list[str] = []
     for m in mets_files:
@@ -476,7 +478,7 @@ def main():
     for prefix, uri in ns.items():
         ET.register_namespace(prefix, uri)
 
-    process()
+    process_command()
 
 
 if __name__ == '__main__':
diff --git a/src/mods4pandas/tests/test_alto.py b/src/mods4pandas/tests/test_alto.py
index 827bc7a..adf931f 100644
--- a/src/mods4pandas/tests/test_alto.py
+++ b/src/mods4pandas/tests/test_alto.py
@@ -1,9 +1,13 @@
+from pathlib import Path
+import re
 from lxml import etree as ET
+import pandas as pd
 
 
-from mods4pandas.alto4pandas import alto_to_dict
+from mods4pandas.alto4pandas import alto_to_dict, process
 from mods4pandas.lib import flatten
 
+TESTS_DATA_DIR = Path(__file__).parent / "data"
 
 def dict_fromstring(x):
    return flatten(alto_to_dict(ET.fromstring(x)))
@@ -79,3 +83,50 @@ def test_String_TAGREF_counts():
     """)
     assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
     assert d['Layout_Page_String-count'] == 4
+
+
+def test_dtypes(tmp_path):
+    alto_dir = (TESTS_DATA_DIR / "alto").absolute().as_posix()
+    alto_info_df_parquet = (tmp_path / "test_dtypes_alto_info.parquet").as_posix()
+    process([alto_dir], alto_info_df_parquet)
+    alto_info_df = pd.read_parquet(alto_info_df_parquet)
+
+    EXPECTED_TYPES = {
+        r"Description_.*": ("object", ["str", "NoneType"]),
+        r"Layout_Page_ID": ("object", ["str", "NoneType"]),
+        r"Layout_Page_PHYSICAL_(IMG|IMAGE)_NR": ("object", ["str", "NoneType"]),
+        r"Layout_Page_PROCESSING": ("object", ["str", "NoneType"]),
+        r"Layout_Page_QUALITY": ("object", ["str", "NoneType"]),
+        r"Layout_Page_//alto:String/@WC-.*": ("Float64", None),
+        r".*-count": ("Int64", None),
+        r"alto_xmlns": ("object", ["str", "NoneType"]),
+
+        r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
+    }
+    def expected_types(c):
+        """Return (expected dtype, set of allowed inner type names or None) for column c; (None, None) if no pattern matches."""
+        for r, types in EXPECTED_TYPES.items():
+            if re.fullmatch(r, c):
+                edt = types[0]
+                einner_types = types[1]
+                if einner_types:
+                    einner_types = set(einner_types)
+                return edt, einner_types
+        return None, None
+
+    def check_types(df):
+        """Assert that each column of df has its expected dtype and, for object columns, only expected inner types."""
+        for c in df.columns:
+            dt = df.dtypes[c]
+            edt, einner_types = expected_types(c)
+            print(c, dt, edt)
+
+            assert edt is not None, f"No expected dtype known for column {c} (got {dt})"
+            assert dt == edt, f"Unexpected dtype {dt} for column {c} (expected {edt})"
+
+            if edt == "object":
+                inner_types = set(type(v).__name__ for v in df[c])
+                assert all(it in einner_types for it in inner_types), \
+                    f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"
+
+    check_types(alto_info_df)
\ No newline at end of file
diff --git a/src/mods4pandas/tests/test_mods4pandas.py b/src/mods4pandas/tests/test_mods4pandas.py
index f9a98d7..0707a74 100644
--- a/src/mods4pandas/tests/test_mods4pandas.py
+++ b/src/mods4pandas/tests/test_mods4pandas.py
@@ -1,10 +1,14 @@
+from pathlib import Path
+import re
 from lxml import etree as ET
+import pandas as pd
 import pytest
 
 
-from mods4pandas.mods4pandas import mods_to_dict
+from mods4pandas.mods4pandas import mods_to_dict, process
 from mods4pandas.lib import flatten
 
+TESTS_DATA_DIR = Path(__file__).parent / "data"
 
 def dict_fromstring(x):
     """Helper function to parse a MODS XML string to a flattened dict"""
@@ -151,3 +155,68 @@ def test_relatedItem():
     """)
 
     assert d['relatedItem-original_recordInfo_recordIdentifier-dnb-ppn'] == '1236513355'
+
+def test_dtypes(tmp_path):
+    mets_files = [p.absolute().as_posix() for p in (TESTS_DATA_DIR / "mets-mods").glob("*.xml")]
+    mods_info_df_parquet = (tmp_path / "test_dtypes_mods_info.parquet").as_posix()
+    page_info_df_parquet = (tmp_path / "test_dtypes_page_info.parquet").as_posix()
+    process(mets_files, mods_info_df_parquet, page_info_df_parquet)
+    mods_info_df = pd.read_parquet(mods_info_df_parquet)
+    page_info_df = pd.read_parquet(page_info_df_parquet)
+
+    EXPECTED_TYPES = {
+        # mods_info
+
+        r"mets_file": ("object", ["str"]),
+        r"titleInfo_title": ("object", ["str"]),
+        r"titleInfo_subTitle": ("object", ["str", "NoneType"]),
+        r"titleInfo_partName": ("object", ["str", "NoneType"]),
+        r"identifier-.*": ("object", ["str", "NoneType"]),
+        r"location_.*": ("object", ["str", "NoneType"]),
+        r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]),
+        r"name\d+_.*": ("object", ["str", "NoneType"]),
+        r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
+        r"typeOfResource": ("object", ["str", "NoneType"]),
+        r"accessCondition-.*": ("object", ["str", "NoneType"]),
+        r"originInfo-.*": ("object", ["str", "NoneType"]),
+
+        r".*-count": ("Int64", None),
+
+        r"genre-.*": ("object", ["ndarray", "NoneType"]),
+        r"subject-.*": ("object", ["ndarray", "NoneType"]),
+        r"language_.*Term": ("object", ["ndarray", "NoneType"]),
+        r"classification-.*": ("object", ["ndarray", "NoneType"]),
+
+        # page_info
+
+        r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]),
+        r"structMap-LOGICAL_TYPE_.*": ("boolean", None),
+    }
+    def expected_types(c):
+        """Return (expected dtype, set of allowed inner type names or None) for column c; (None, None) if no pattern matches."""
+        for r, types in EXPECTED_TYPES.items():
+            if re.fullmatch(r, c):
+                edt = types[0]
+                einner_types = types[1]
+                if einner_types:
+                    einner_types = set(einner_types)
+                return edt, einner_types
+        return None, None
+
+    def check_types(df):
+        """Assert that each column of df has its expected dtype and, for object columns, only expected inner types."""
+        for c in df.columns:
+            dt = df.dtypes[c]
+            edt, einner_types = expected_types(c)
+            print(c, dt, edt)
+
+            assert edt is not None, f"No expected dtype known for column {c} (got {dt})"
+            assert dt == edt, f"Unexpected dtype {dt} for column {c} (expected {edt})"
+
+            if edt == "object":
+                inner_types = set(type(v).__name__ for v in df[c])
+                assert all(it in einner_types for it in inner_types), \
+                    f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"
+
+    check_types(mods_info_df)
+    check_types(page_info_df)
\ No newline at end of file