Mirror of https://github.com/qurator-spk/modstool.git, synced 2025-08-16 13:09:53 +02:00
🎨 Reformat (Black)

commit 212df99436
parent 5c9858a061

7 changed files with 639 additions and 355 deletions
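The changes below are mechanical: Black normalizes string literals to double quotes and re-wraps statements that exceed its default 88-column limit. A minimal sketch of reproducing one such rewrite through Black's Python API (illustrative only; this is not necessarily how the commit was generated):

import black

# Black rewrites single-quoted strings to double quotes; the dict still fits
# within 88 columns, so it stays on one line.
src = "ns = {'mets': 'http://www.loc.gov/METS/'}\n"
formatted = black.format_str(src, mode=black.Mode())
print(formatted)  # ns = {"mets": "http://www.loc.gov/METS/"}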
@@ -21,14 +21,13 @@ __all__ = ["ns"]
 
 ns = {
-    'mets': 'http://www.loc.gov/METS/',
-    'mods': 'http://www.loc.gov/mods/v3',
+    "mets": "http://www.loc.gov/METS/",
+    "mods": "http://www.loc.gov/mods/v3",
     "alto": "http://www.loc.gov/standards/alto/ns-v2",
     "xlink": "http://www.w3.org/1999/xlink",
 }
-
-
-
+
+
 class TagGroup:
     """Helper class to simplify the parsing and checking of MODS metadata"""
 
 
@@ -37,14 +36,14 @@ class TagGroup:
         self.group = group
 
     def to_xml(self) -> str:
-        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
+        return "\n".join(str(ET.tostring(e), "utf-8").strip() for e in self.group)
 
     def __str__(self) -> str:
         return f"TagGroup with content:\n{self.to_xml()}"
 
     def is_singleton(self) -> TagGroup:
         if len(self.group) != 1:
-            raise ValueError('More than one instance: {}'.format(self))
+            raise ValueError("More than one instance: {}".format(self))
         return self
 
     def has_no_attributes(self) -> TagGroup:
@@ -54,7 +53,9 @@ class TagGroup:
         if not isinstance(attrib, Sequence):
             attrib = [attrib]
         if not all(e.attrib in attrib for e in self.group):
-            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
+            raise ValueError(
+                "One or more element has unexpected attributes: {}".format(self)
+            )
         return self
 
     def ignore_attributes(self) -> TagGroup:
@@ -65,10 +66,10 @@ class TagGroup:
         self.group = sorted(self.group, key=key, reverse=reverse)
         return self
 
-    def text(self, separator='\n') -> str:
-        t = ''
+    def text(self, separator="\n") -> str:
+        t = ""
         for e in self.group:
-            if t != '':
+            if t != "":
                 t += separator
             if e.text:
                 t += e.text
@@ -87,7 +88,7 @@ class TagGroup:
                 new_group.append(e)
             else:
                 if warn:
-                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
+                    warnings.warn("Filtered {} element ({})".format(self.tag, warn))
         return TagGroup(self.tag, new_group)
 
     def force_singleton(self, warn=True) -> TagGroup:
@@ -95,35 +96,38 @@ class TagGroup:
             return self
         else:
             if warn:
-                warnings.warn('Forced single instance of {}'.format(self.tag))
+                warnings.warn("Forced single instance of {}".format(self.tag))
             return TagGroup(self.tag, self.group[:1])
 
-    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
-    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
+    RE_ISO8601_DATE = r"^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$"  # Note: Includes non-specific century dates like '18XX'
+    RE_GERMAN_DATE = r"^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$"
 
     def fix_date(self) -> TagGroup:
-
         for e in self.group:
-            if e.attrib.get('encoding') == 'w3cdtf':
+            if e.attrib.get("encoding") == "w3cdtf":
                 # This should be 'iso8601' according to MODS-AP 2.3.1
-                warnings.warn('Changed w3cdtf encoding to iso8601')
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Changed w3cdtf encoding to iso8601")
+                e.attrib["encoding"] = "iso8601"
 
         new_group = []
         for e in self.group:
            if e.text is None:
-                warnings.warn('Empty date')
+                warnings.warn("Empty date")
                continue
-            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
+            if e.attrib.get("encoding") == "iso8601" and re.match(
+                self.RE_ISO8601_DATE, e.text
+            ):
                 new_group.append(e)
             elif re.match(self.RE_ISO8601_DATE, e.text):
-                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Added iso8601 encoding to date {}".format(e.text))
+                e.attrib["encoding"] = "iso8601"
                 new_group.append(e)
             elif m := re.match(self.RE_GERMAN_DATE, e.text):
-                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
-                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Converted date {} to iso8601 encoding".format(e.text))
+                e.text = "{}-{}-{}".format(
+                    m.group("yyyy"), m.group("mm"), m.group("dd")
+                )
+                e.attrib["encoding"] = "iso8601"
                 new_group.append(e)
             else:
                 warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
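The re-wrapped branches above are the core of fix_date: dates already matching RE_ISO8601_DATE are kept (gaining an explicit encoding attribute if it was missing), and German dd.mm.yyyy dates are rewritten to ISO 8601. A standalone sketch of that conversion step, reusing the regex from the hunk above (the helper name is hypothetical):

import re

RE_GERMAN_DATE = r"^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$"

def german_to_iso8601(text):
    # '31.12.1899' -> '1899-12-31', as in TagGroup.fix_date
    m = re.match(RE_GERMAN_DATE, text)
    if m is None:
        return None
    return "{}-{}-{}".format(m.group("yyyy"), m.group("mm"), m.group("dd"))

assert german_to_iso8601("31.12.1899") == "1899-12-31"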
@@ -146,26 +150,30 @@ class TagGroup:
         # Fix this for special cases.
 
         for e in self.group:
-            if e.attrib.get('eventType') is None:
+            if e.attrib.get("eventType") is None:
                 try:
-                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
-                            e.find('mods:edition', ns).text == '[Electronic ed.]':
-                        e.attrib['eventType'] = 'digitization'
-                        warnings.warn('Fixed eventType for electronic ed.')
+                    if (
+                        e.find("mods:publisher", ns).text.startswith(
+                            "Staatsbibliothek zu Berlin"
+                        )
+                        and e.find("mods:edition", ns).text == "[Electronic ed.]"
+                    ):
+                        e.attrib["eventType"] = "digitization"
+                        warnings.warn("Fixed eventType for electronic ed.")
                         continue
                 except AttributeError:
                     pass
                 try:
-                    if e.find('mods:dateIssued', ns) is not None:
-                        e.attrib['eventType'] = 'publication'
-                        warnings.warn('Fixed eventType for an issued origin')
+                    if e.find("mods:dateIssued", ns) is not None:
+                        e.attrib["eventType"] = "publication"
+                        warnings.warn("Fixed eventType for an issued origin")
                         continue
                 except AttributeError:
                     pass
                 try:
-                    if e.find('mods:dateCreated', ns) is not None:
-                        e.attrib['eventType'] = 'production'
-                        warnings.warn('Fixed eventType for a created origin')
+                    if e.find("mods:dateCreated", ns) is not None:
+                        e.attrib["eventType"] = "production"
+                        warnings.warn("Fixed eventType for a created origin")
                         continue
                 except AttributeError:
                     pass
@@ -174,13 +182,14 @@ class TagGroup:
     def fix_script_term(self) -> TagGroup:
         for e in self.group:
             # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
-            if e.attrib['authority'] == 'ISO15924':
-                e.attrib['authority'] = 'iso15924'
-                warnings.warn('Changed scriptTerm authority to lower case')
+            if e.attrib["authority"] == "ISO15924":
+                e.attrib["authority"] = "iso15924"
+                warnings.warn("Changed scriptTerm authority to lower case")
         return self
 
     def merge_sub_tags_to_set(self) -> dict:
         from .mods4pandas import mods_to_dict
+
         value = {}
 
         sub_dicts = [mods_to_dict(e) for e in self.group]
@@ -230,6 +239,7 @@ class TagGroup:
         Extract values using the given XPath expression, convert them to float and return descriptive
         statistics on the values.
         """
+
         def xpath_values():
             values = []
             for e in self.group:
@@ -240,11 +250,11 @@ class TagGroup:
         values = xpath_values()
         statistics = {}
         if values.size > 0:
-            statistics[f'{xpath_expr}-mean'] = np.mean(values)
-            statistics[f'{xpath_expr}-median'] = np.median(values)
-            statistics[f'{xpath_expr}-std'] = np.std(values)
-            statistics[f'{xpath_expr}-min'] = np.min(values)
-            statistics[f'{xpath_expr}-max'] = np.max(values)
+            statistics[f"{xpath_expr}-mean"] = np.mean(values)
+            statistics[f"{xpath_expr}-median"] = np.median(values)
+            statistics[f"{xpath_expr}-std"] = np.std(values)
+            statistics[f"{xpath_expr}-min"] = np.min(values)
+            statistics[f"{xpath_expr}-max"] = np.max(values)
         return statistics
 
     def xpath_count(self, xpath_expr, namespaces) -> dict[str, int]:
@@ -256,11 +266,10 @@ class TagGroup:
             r = e.xpath(xpath_expr, namespaces=namespaces)
             values += r
 
-        counts = {f'{xpath_expr}-count': len(values)}
+        counts = {f"{xpath_expr}-count": len(values)}
         return counts
 
 
-
 def sorted_groupby(iterable, key=None):
     """
     Sort iterable by key and then group by the same key.
@@ -291,7 +300,7 @@ def _to_dict(root, raise_errors):
         raise ValueError(f"Unknown namespace {root_name.namespace}")
 
 
-def flatten(d: MutableMapping, parent='', separator='_') -> dict:
+def flatten(d: MutableMapping, parent="", separator="_") -> dict:
     """
     Flatten the given nested dict.
 
@@ -314,11 +323,12 @@ def flatten(d: MutableMapping, parent='', separator='_') -> dict:
 
 
 def valid_column_key(k) -> bool:
-    if re.match(r'^[a-zA-Z0-9 _@/:\[\]-]+$', k):
+    if re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k):
         return True
     else:
         return False
 
 
 def column_names_csv(columns) -> str:
     """
     Format Column names (identifiers) as a comma-separated list.
@@ -327,9 +337,11 @@ def column_names_csv(columns) -> str:
     """
     return ",".join('"' + c + '"' for c in columns)
 
+
 current_columns: dict[str, list] = defaultdict(list)
 current_columns_types: dict[str, dict] = defaultdict(dict)
 
 
 def insert_into_db(con, table, d: Dict):
     """Insert the values from the dict into the table, creating columns if necessary"""
+
@@ -338,7 +350,9 @@ def insert_into_db(con, table, d: Dict):
         for k in d.keys():
             assert valid_column_key(k), f'"{k}" is not a valid column name'
             current_columns[table].append(k)
-        con.execute(f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})")
+        con.execute(
+            f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})"
+        )
 
     # Add columns if necessary
     for k in d.keys():
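For context, insert_into_db creates the table lazily from the first record's keys and, per the "Add columns if necessary" comment above, extends it when later records introduce new keys. A minimal self-contained sketch of that schema-on-write pattern with sqlite3 (names and the simplified quoting are illustrative, not the module's exact code):

import sqlite3
from collections import defaultdict

con = sqlite3.connect(":memory:")
current_columns: dict = defaultdict(list)

def ensure_columns(table, record):
    # Create the table on first insert, then ALTER it for any new keys.
    if not current_columns[table]:
        current_columns[table].extend(record)
        cols = ",".join('"' + c + '"' for c in record)
        con.execute(f"CREATE TABLE {table} ({cols})")
    for k in record:
        if k not in current_columns[table]:
            current_columns[table].append(k)
            con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"')

ensure_columns("mods_info", {"recordInfo_recordIdentifier": "PPN123"})
ensure_columns("mods_info", {"titleInfo_title": "Example"})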
@@ -361,13 +375,15 @@ def insert_into_db(con, table, d: Dict):
         f"( {column_names_csv(columns)} )"
         "VALUES"
         f"( {','.join('?' for c in columns)} )",
-        [str(d[c]) for c in columns]
+        [str(d[c]) for c in columns],
     )
 
 
 def insert_into_db_multiple(con, table, ld: List[Dict]):
     for d in ld:
         insert_into_db(con, table, d)
 
 
 def convert_db_to_parquet(con, table, index_col, output_file):
     df = pd.read_sql_query(f"SELECT * FROM {table}", con, index_col)
@@ -386,6 +402,8 @@ def convert_db_to_parquet(con, table, index_col, output_file):
         elif column_type == "set":
             df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
         else:
-            raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
+            raise NotImplementedError(
+                f"Column {c}: type {column_type} not implemented yet."
+            )
 
     df.to_parquet(output_file)
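One note on the "set" branch above: insert_into_db stores every value via str() (see the [str(d[c]) for c in columns] argument in the earlier hunk), so set-valued cells come back from SQLite as strings like "{'German', 'Latin'}", and ast.literal_eval parses them before the Parquet conversion. A minimal illustration (the values are invented):

import ast

stored = str({"German", "Latin"})           # how insert_into_db stringifies a set
restored = list(ast.literal_eval(stored))   # back to a list for the Parquet column
print(restored)  # e.g. ['German', 'Latin'] (set order is not guaranteed)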
|
Loading…
Add table
Add a link
Reference in a new issue