diff --git a/src/mods4pandas/lib.py b/src/mods4pandas/lib.py
index 32f717a..cff8ea9 100644
--- a/src/mods4pandas/lib.py
+++ b/src/mods4pandas/lib.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
from itertools import groupby
import re
import warnings
@@ -24,40 +26,40 @@ ns = {
class TagGroup:
"""Helper class to simplify the parsing and checking of MODS metadata"""
- def __init__(self, tag, group: List[ET.Element]):
+ def __init__(self, tag, group: List[ET._Element]):
self.tag = tag
self.group = group
- def to_xml(self):
+ def to_xml(self) -> str:
return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
- def __str__(self):
+ def __str__(self) -> str:
return f"TagGroup with content:\n{self.to_xml()}"
- def is_singleton(self):
+ def is_singleton(self) -> TagGroup:
if len(self.group) != 1:
raise ValueError('More than one instance: {}'.format(self))
return self
- def has_no_attributes(self):
+ def has_no_attributes(self) -> TagGroup:
return self.has_attributes({})
- def has_attributes(self, attrib):
+ def has_attributes(self, attrib) -> TagGroup:
if not isinstance(attrib, Sequence):
attrib = [attrib]
if not all(e.attrib in attrib for e in self.group):
raise ValueError('One or more element has unexpected attributes: {}'.format(self))
return self
- def ignore_attributes(self):
+ def ignore_attributes(self) -> TagGroup:
# This serves as documentation for now.
return self
- def sort(self, key=None, reverse=False):
+ def sort(self, key=None, reverse=False) -> TagGroup:
self.group = sorted(self.group, key=key, reverse=reverse)
return self
- def text(self, separator='\n'):
+ def text(self, separator='\n') -> str:
t = ''
for e in self.group:
if t != '':
@@ -66,13 +68,13 @@ class TagGroup:
t += e.text
return t
- def text_set(self):
+ def text_set(self) -> set:
return {e.text for e in self.group}
- def descend(self, raise_errors):
+ def descend(self, raise_errors) -> dict:
return _to_dict(self.is_singleton().group[0], raise_errors)
- def filter(self, cond, warn=None):
+ def filter(self, cond, warn=None) -> TagGroup:
new_group = []
for e in self.group:
if cond(e):
@@ -82,7 +84,7 @@ class TagGroup:
warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
return TagGroup(self.tag, new_group)
- def force_singleton(self, warn=True):
+ def force_singleton(self, warn=True) -> TagGroup:
if len(self.group) == 1:
return self
else:
@@ -93,7 +95,7 @@ class TagGroup:
RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX'
RE_GERMAN_DATE = r'^(?P
\d{2})\.(?P\d{2})\.(?P\d{4})$'
- def fix_date(self):
+ def fix_date(self) -> TagGroup:
for e in self.group:
if e.attrib.get('encoding') == 'w3cdtf':
@@ -103,6 +105,9 @@ class TagGroup:
new_group = []
for e in self.group:
+ if e.text is None:
+ warnings.warn('Empty date')
+ continue
if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
new_group.append(e)
elif re.match(self.RE_ISO8601_DATE, e.text):
@@ -131,7 +136,7 @@ class TagGroup:
return self
- def fix_event_type(self):
+ def fix_event_type(self) -> TagGroup:
# According to MODS-AP 2.3.1, every originInfo should have its eventType set.
# Fix this for special cases.
@@ -161,7 +166,7 @@ class TagGroup:
pass
return self
- def fix_script_term(self):
+ def fix_script_term(self) -> TagGroup:
for e in self.group:
# MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
if e.attrib['authority'] == 'ISO15924':
@@ -169,7 +174,7 @@ class TagGroup:
warnings.warn('Changed scriptTerm authority to lower case')
return self
- def merge_sub_tags_to_set(self):
+ def merge_sub_tags_to_set(self) -> dict:
from .mods4pandas import mods_to_dict
value = {}
@@ -189,7 +194,7 @@ class TagGroup:
value[sub_tag] = s
return value
- def attributes(self):
+ def attributes(self) -> dict[str, str]:
"""
Return a merged dict of all attributes of the tag group.
@@ -204,7 +209,7 @@ class TagGroup:
attrib[a_localname] = v
return attrib
- def subelement_counts(self):
+ def subelement_counts(self) -> dict[str, int]:
counts = {}
for e in self.group:
for x in e.iter():
@@ -213,7 +218,7 @@ class TagGroup:
counts[key] = counts.get(key, 0) + 1
return counts
- def xpath_statistics(self, xpath_expr, namespaces):
+ def xpath_statistics(self, xpath_expr, namespaces) -> dict[str, float]:
"""
Extract values and calculate statistics
@@ -235,7 +240,7 @@ class TagGroup:
statistics[f'{xpath_expr}-max'] = np.max(values)
return statistics
- def xpath_count(self, xpath_expr, namespaces):
+ def xpath_count(self, xpath_expr, namespaces) -> dict[str, int]:
"""
Count all elements matching xpath_expr
"""
@@ -279,7 +284,7 @@ def _to_dict(root, raise_errors):
raise ValueError(f"Unknown namespace {root_name.namespace}")
-def flatten(d: MutableMapping, parent='', separator='_'):
+def flatten(d: MutableMapping, parent='', separator='_') -> dict:
"""
Flatten the given nested dict.
@@ -301,13 +306,13 @@ def flatten(d: MutableMapping, parent='', separator='_'):
return dict(items)
-def valid_column_key(k):
- if re.match("^[a-zA-Z0-9 _@/:\[\]-]+$", k):
+def valid_column_key(k) -> bool:
+ if re.match(r'^[a-zA-Z0-9 _@/:\[\]-]+$', k):
return True
else:
return False
-def column_names_csv(columns):
+def column_names_csv(columns) -> str:
"""
Format Column names (identifiers) as a comma-separated list.
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
index 2da7c80..ea6a49f 100755
--- a/src/mods4pandas/mods4pandas.py
+++ b/src/mods4pandas/mods4pandas.py
@@ -376,7 +376,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
default='mods_info_df.parquet', show_default=True)
@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
-def process(mets_files: List[str], output_file: str, output_page_info: str):
+def process(mets_files: list[str], output_file: str, output_page_info: str):
"""
A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
@@ -389,7 +389,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
"""
# Extend file list if directories are given
- mets_files_real = []
+ mets_files_real: list[str] = []
for m in mets_files:
if os.path.isdir(m):
logger.info('Scanning directory {}'.format(m))