Mirror of https://github.com/qurator-spk/modstool.git (synced 2025-06-26 20:19:56 +02:00)
🎨 Reformat (Black)

parent 5c9858a061 · commit 212df99436
7 changed files with 639 additions and 355 deletions
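The change is purely mechanical: Black normalizes string literals to double quotes, replaces backslash line continuations with parenthesized expressions, adds trailing commas, and wraps anything over the line-length limit. A minimal sketch of the pattern that repeats throughout the diff below (the `title` case is taken from mods4pandas; the before/after shapes are representative, not exhaustive):

    # Before: single-quoted strings, long lines left as-is or hand-wrapped with "\"
    value['title'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()

    # After Black: double quotes, long right-hand sides wrapped in parentheses
    value["title"] = (
        TagGroup(tag, group).is_singleton().has_no_attributes().text()
    )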
@@ -18,7 +18,14 @@ import click
 import numpy as np
 from tqdm import tqdm
 
-from .lib import TagGroup, convert_db_to_parquet, sorted_groupby, flatten, ns, insert_into_db
+from .lib import (
+    TagGroup,
+    convert_db_to_parquet,
+    sorted_groupby,
+    flatten,
+    ns,
+    insert_into_db,
+)
 
 with warnings.catch_warnings():
     # Filter warnings on WSL
@@ -27,8 +34,7 @@ with warnings.catch_warnings():
     import pandas as pd
 
 
-logger = logging.getLogger('alto4pandas')
+logger = logging.getLogger("alto4pandas")
 
 
-
 def alto_to_dict(alto, raise_errors=True):
@@ -37,56 +43,91 @@ def alto_to_dict(alto, raise_errors=True):
     value = {}
 
     # Iterate through each group of tags
-    for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
+    for tag, group in sorted_groupby(alto, key=attrgetter("tag")):
         group = list(group)
 
         localname = ET.QName(tag).localname
         alto_namespace = ET.QName(tag).namespace
-        namespaces={"alto": alto_namespace}
+        namespaces = {"alto": alto_namespace}
 
-        if localname == 'Description':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif localname == 'MeasurementUnit':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'OCRProcessing':
+        if localname == "Description":
+            value[localname] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif localname == "MeasurementUnit":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "OCRProcessing":
             value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
-        elif localname == 'Processing':
+        elif localname == "Processing":
             # TODO This enumerated descent is used more than once, DRY!
             for n, e in enumerate(group):
-                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
-        elif localname == 'ocrProcessingStep':
+                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
+        elif localname == "ocrProcessingStep":
             for n, e in enumerate(group):
-                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
-        elif localname == 'preProcessingStep':
+                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
+        elif localname == "preProcessingStep":
             for n, e in enumerate(group):
-                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
-        elif localname == 'processingDateTime':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'processingSoftware':
+                value[f"{localname}{n}"] = alto_to_dict(e, raise_errors)
+        elif localname == "processingDateTime":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "processingSoftware":
             value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
-        elif localname == 'processingAgency':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'processingStepDescription':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'processingStepSettings':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'softwareCreator':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'softwareName':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'softwareVersion':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == "processingAgency":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "processingStepDescription":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "processingStepSettings":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "softwareCreator":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "softwareName":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "softwareVersion":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
 
-        elif localname == 'sourceImageInformation':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif localname == 'fileName':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
-        elif localname == 'fileIdentifier':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == "sourceImageInformation":
+            value[localname] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif localname == "fileName":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
+        elif localname == "fileIdentifier":
+            value[localname] = (
+                TagGroup(tag, group).is_singleton().has_no_attributes().text()
+            )
 
-        elif localname == 'Layout':
-            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
-        elif localname == 'Page':
+        elif localname == "Layout":
+            value[localname] = (
+                TagGroup(tag, group)
+                .is_singleton()
+                .has_no_attributes()
+                .descend(raise_errors)
+            )
+        elif localname == "Page":
             value[localname] = {}
             value[localname].update(TagGroup(tag, group).is_singleton().attributes())
             for attr in ("WIDTH", "HEIGHT"):
@@ -96,14 +137,18 @@ def alto_to_dict(alto, raise_errors=True):
                 except ValueError:
                     del value[localname][attr]
             value[localname].update(TagGroup(tag, group).subelement_counts())
-            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
+            value[localname].update(
+                TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)
+            )
 
             # Count all alto:String elements with TAGREFS attribute
-            value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))
+            value[localname].update(
+                TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces)
+            )
 
-        elif localname == 'Styles':
+        elif localname == "Styles":
             pass
-        elif localname == 'Tags':
+        elif localname == "Tags":
             value[localname] = {}
             value[localname].update(TagGroup(tag, group).subelement_counts())
         else:
@@ -116,13 +161,12 @@ def alto_to_dict(alto, raise_errors=True):
     return value
 
 
-
 def walk(m):
     # XXX do this in mods4pandas, too
     if os.path.isdir(m):
-        tqdm.write(f'Scanning directory {m}')
+        tqdm.write(f"Scanning directory {m}")
         for f in tqdm(os.scandir(m), leave=False):
-            if f.is_file() and not f.name.startswith('.'):
+            if f.is_file() and not f.name.startswith("."):
                 yield f.path
             elif f.is_dir():
                 try:
@@ -133,11 +177,17 @@ def walk(m):
             yield m.path
 
 
-
 @click.command()
-@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
-              default='alto_info_df.parquet', show_default=True)
+@click.argument("alto_files", type=click.Path(exists=True), required=True, nargs=-1)
+@click.option(
+    "--output",
+    "-o",
+    "output_file",
+    type=click.Path(),
+    help="Output Parquet file",
+    default="alto_info_df.parquet",
+    show_default=True,
+)
 def process_command(alto_files: List[str], output_file: str):
     """
     A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
@@ -153,6 +203,7 @@ def process_command(alto_files: List[str], output_file: str):
 
     process(alto_files, output_file)
 
+
 def process(alto_files: List[str], output_file: str):
     # Extend file list if directories are given
     alto_files_real = []
@@ -167,26 +218,26 @@ def process(alto_files: List[str], output_file: str):
     with contextlib.suppress(FileNotFoundError):
         os.remove(output_file_sqlite3)
 
-    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
+    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
     con = sqlite3.connect(output_file_sqlite3)
 
     # Process ALTO files
-    with open(output_file + '.warnings.csv', 'w') as csvfile:
+    with open(output_file + ".warnings.csv", "w") as csvfile:
         csvwriter = csv.writer(csvfile)
-        logger.info('Processing ALTO files')
+        logger.info("Processing ALTO files")
         for alto_file in tqdm(alto_files_real, leave=False):
             try:
                 root = ET.parse(alto_file).getroot()
                 alto = root  # XXX .find('alto:alto', ns) does not work here
 
                 with warnings.catch_warnings(record=True) as caught_warnings:
-                    warnings.simplefilter('always')  # do NOT filter double occurrences
+                    warnings.simplefilter("always")  # do NOT filter double occurrences
 
                     # ALTO
                     d = flatten(alto_to_dict(alto, raise_errors=True))
                     # "meta"
-                    d['alto_file'] = alto_file
-                    d['alto_xmlns'] = ET.QName(alto).namespace
+                    d["alto_file"] = alto_file
+                    d["alto_xmlns"] = ET.QName(alto).namespace
 
                     # Save
                     insert_into_db(con, "alto_info", d)
@@ -198,11 +249,13 @@ def process(alto_files: List[str], output_file: str):
                 for caught_warning in caught_warnings:
                     csvwriter.writerow([alto_file, caught_warning.message])
             except Exception as e:
-                logger.error('Exception in {}: {}'.format(alto_file, e))
-                import traceback; traceback.print_exc()
+                logger.error("Exception in {}: {}".format(alto_file, e))
+                import traceback
+
+                traceback.print_exc()
 
     # Convert the alto_info SQL to a pandas DataFrame
-    logger.info('Writing DataFrame to {}'.format(output_file))
+    logger.info("Writing DataFrame to {}".format(output_file))
     convert_db_to_parquet(con, "alto_info", "alto_file", output_file)
 
 
@@ -215,5 +268,5 @@ def main():
     process()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
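As the diff above shows, alto4pandas feeds each parsed ALTO file through `flatten(alto_to_dict(alto, raise_errors=True))` before inserting it into SQLite, so nested element dicts end up as flat, separator-joined column names. A rough sketch of the behavior implied by `flatten(d, parent="", separator="_")` from the lib module (the example keys are illustrative, not taken from a real ALTO file):

    nested = {"Layout": {"Page": {"WIDTH": 1000, "HEIGHT": 1400}}}
    # flatten(nested) should produce flat keys joined with "_", roughly:
    # {"Layout_Page_WIDTH": 1000, "Layout_Page_HEIGHT": 1400}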
@@ -21,14 +21,13 @@ __all__ = ["ns"]
 
 
 ns = {
-    'mets': 'http://www.loc.gov/METS/',
-    'mods': 'http://www.loc.gov/mods/v3',
+    "mets": "http://www.loc.gov/METS/",
+    "mods": "http://www.loc.gov/mods/v3",
     "alto": "http://www.loc.gov/standards/alto/ns-v2",
     "xlink": "http://www.w3.org/1999/xlink",
 }
 
 
-
 class TagGroup:
     """Helper class to simplify the parsing and checking of MODS metadata"""
 
@@ -37,14 +36,14 @@ class TagGroup:
         self.group = group
 
     def to_xml(self) -> str:
-        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
+        return "\n".join(str(ET.tostring(e), "utf-8").strip() for e in self.group)
 
     def __str__(self) -> str:
         return f"TagGroup with content:\n{self.to_xml()}"
 
     def is_singleton(self) -> TagGroup:
         if len(self.group) != 1:
-            raise ValueError('More than one instance: {}'.format(self))
+            raise ValueError("More than one instance: {}".format(self))
         return self
 
     def has_no_attributes(self) -> TagGroup:
@@ -54,7 +53,9 @@ class TagGroup:
         if not isinstance(attrib, Sequence):
             attrib = [attrib]
         if not all(e.attrib in attrib for e in self.group):
-            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
+            raise ValueError(
+                "One or more element has unexpected attributes: {}".format(self)
+            )
         return self
 
     def ignore_attributes(self) -> TagGroup:
@@ -65,10 +66,10 @@ class TagGroup:
         self.group = sorted(self.group, key=key, reverse=reverse)
         return self
 
-    def text(self, separator='\n') -> str:
-        t = ''
+    def text(self, separator="\n") -> str:
+        t = ""
         for e in self.group:
-            if t != '':
+            if t != "":
                 t += separator
             if e.text:
                 t += e.text
@@ -87,7 +88,7 @@ class TagGroup:
                 new_group.append(e)
             else:
                 if warn:
-                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
+                    warnings.warn("Filtered {} element ({})".format(self.tag, warn))
         return TagGroup(self.tag, new_group)
 
     def force_singleton(self, warn=True) -> TagGroup:
@@ -95,35 +96,38 @@ class TagGroup:
             return self
         else:
             if warn:
-                warnings.warn('Forced single instance of {}'.format(self.tag))
+                warnings.warn("Forced single instance of {}".format(self.tag))
             return TagGroup(self.tag, self.group[:1])
 
-    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
-    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
+    RE_ISO8601_DATE = r"^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$"  # Note: Includes non-specific century dates like '18XX'
+    RE_GERMAN_DATE = r"^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$"
 
     def fix_date(self) -> TagGroup:
 
         for e in self.group:
-            if e.attrib.get('encoding') == 'w3cdtf':
+            if e.attrib.get("encoding") == "w3cdtf":
                 # This should be 'iso8601' according to MODS-AP 2.3.1
-                warnings.warn('Changed w3cdtf encoding to iso8601')
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Changed w3cdtf encoding to iso8601")
+                e.attrib["encoding"] = "iso8601"
 
         new_group = []
         for e in self.group:
             if e.text is None:
-                warnings.warn('Empty date')
+                warnings.warn("Empty date")
                 continue
-            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
+            if e.attrib.get("encoding") == "iso8601" and re.match(
+                self.RE_ISO8601_DATE, e.text
+            ):
                 new_group.append(e)
             elif re.match(self.RE_ISO8601_DATE, e.text):
-                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Added iso8601 encoding to date {}".format(e.text))
+                e.attrib["encoding"] = "iso8601"
                 new_group.append(e)
             elif m := re.match(self.RE_GERMAN_DATE, e.text):
-                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
-                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
-                e.attrib['encoding'] = 'iso8601'
+                warnings.warn("Converted date {} to iso8601 encoding".format(e.text))
+                e.text = "{}-{}-{}".format(
+                    m.group("yyyy"), m.group("mm"), m.group("dd")
+                )
+                e.attrib["encoding"] = "iso8601"
                 new_group.append(e)
             else:
                 warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
@@ -146,26 +150,30 @@ class TagGroup:
         # Fix this for special cases.
 
         for e in self.group:
-            if e.attrib.get('eventType') is None:
+            if e.attrib.get("eventType") is None:
                 try:
-                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
-                       e.find('mods:edition', ns).text == '[Electronic ed.]':
-                        e.attrib['eventType'] = 'digitization'
-                        warnings.warn('Fixed eventType for electronic ed.')
+                    if (
+                        e.find("mods:publisher", ns).text.startswith(
+                            "Staatsbibliothek zu Berlin"
+                        )
+                        and e.find("mods:edition", ns).text == "[Electronic ed.]"
+                    ):
+                        e.attrib["eventType"] = "digitization"
+                        warnings.warn("Fixed eventType for electronic ed.")
                         continue
                 except AttributeError:
                     pass
                 try:
-                    if e.find('mods:dateIssued', ns) is not None:
-                        e.attrib['eventType'] = 'publication'
-                        warnings.warn('Fixed eventType for an issued origin')
+                    if e.find("mods:dateIssued", ns) is not None:
+                        e.attrib["eventType"] = "publication"
+                        warnings.warn("Fixed eventType for an issued origin")
                         continue
                 except AttributeError:
                     pass
                 try:
-                    if e.find('mods:dateCreated', ns) is not None:
-                        e.attrib['eventType'] = 'production'
-                        warnings.warn('Fixed eventType for a created origin')
+                    if e.find("mods:dateCreated", ns) is not None:
+                        e.attrib["eventType"] = "production"
+                        warnings.warn("Fixed eventType for a created origin")
                         continue
                 except AttributeError:
                     pass
@@ -174,13 +182,14 @@ class TagGroup:
     def fix_script_term(self) -> TagGroup:
         for e in self.group:
             # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
-            if e.attrib['authority'] == 'ISO15924':
-                e.attrib['authority'] = 'iso15924'
-                warnings.warn('Changed scriptTerm authority to lower case')
+            if e.attrib["authority"] == "ISO15924":
+                e.attrib["authority"] = "iso15924"
+                warnings.warn("Changed scriptTerm authority to lower case")
         return self
 
     def merge_sub_tags_to_set(self) -> dict:
         from .mods4pandas import mods_to_dict
 
         value = {}
 
         sub_dicts = [mods_to_dict(e) for e in self.group]
@@ -230,6 +239,7 @@ class TagGroup:
         Extract values using the given XPath expression, convert them to float and return descriptive
         statistics on the values.
         """
 
+
         def xpath_values():
             values = []
             for e in self.group:
@@ -240,11 +250,11 @@ class TagGroup:
         values = xpath_values()
         statistics = {}
         if values.size > 0:
-            statistics[f'{xpath_expr}-mean'] = np.mean(values)
-            statistics[f'{xpath_expr}-median'] = np.median(values)
-            statistics[f'{xpath_expr}-std'] = np.std(values)
-            statistics[f'{xpath_expr}-min'] = np.min(values)
-            statistics[f'{xpath_expr}-max'] = np.max(values)
+            statistics[f"{xpath_expr}-mean"] = np.mean(values)
+            statistics[f"{xpath_expr}-median"] = np.median(values)
+            statistics[f"{xpath_expr}-std"] = np.std(values)
+            statistics[f"{xpath_expr}-min"] = np.min(values)
+            statistics[f"{xpath_expr}-max"] = np.max(values)
         return statistics
 
     def xpath_count(self, xpath_expr, namespaces) -> dict[str, int]:
@@ -256,11 +266,10 @@ class TagGroup:
             r = e.xpath(xpath_expr, namespaces=namespaces)
             values += r
 
-        counts = {f'{xpath_expr}-count': len(values)}
+        counts = {f"{xpath_expr}-count": len(values)}
        return counts
 
 
-
 def sorted_groupby(iterable, key=None):
     """
     Sort iterable by key and then group by the same key.
@@ -291,7 +300,7 @@ def _to_dict(root, raise_errors):
         raise ValueError(f"Unknown namespace {root_name.namespace}")
 
 
-def flatten(d: MutableMapping, parent='', separator='_') -> dict:
+def flatten(d: MutableMapping, parent="", separator="_") -> dict:
     """
     Flatten the given nested dict.
 
@@ -314,11 +323,12 @@ def flatten(d: MutableMapping, parent='', separator='_') -> dict:
 
 
 def valid_column_key(k) -> bool:
-    if re.match(r'^[a-zA-Z0-9 _@/:\[\]-]+$', k):
+    if re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k):
         return True
     else:
         return False
 
 
 def column_names_csv(columns) -> str:
     """
     Format Column names (identifiers) as a comma-separated list.
@@ -327,9 +337,11 @@ def column_names_csv(columns) -> str:
     """
     return ",".join('"' + c + '"' for c in columns)
 
 
 current_columns: dict[str, list] = defaultdict(list)
 current_columns_types: dict[str, dict] = defaultdict(dict)
 
 
 def insert_into_db(con, table, d: Dict):
     """Insert the values from the dict into the table, creating columns if necessary"""
 
@@ -338,7 +350,9 @@ def insert_into_db(con, table, d: Dict):
         for k in d.keys():
             assert valid_column_key(k), f'"{k}" is not a valid column name'
             current_columns[table].append(k)
-        con.execute(f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})")
+        con.execute(
+            f"CREATE TABLE {table} ({column_names_csv(current_columns[table])})"
+        )
 
     # Add columns if necessary
     for k in d.keys():
@@ -361,13 +375,15 @@ def insert_into_db(con, table, d: Dict):
         f"( {column_names_csv(columns)} )"
         "VALUES"
         f"( {','.join('?' for c in columns)} )",
-        [str(d[c]) for c in columns]
+        [str(d[c]) for c in columns],
     )
 
 
 def insert_into_db_multiple(con, table, ld: List[Dict]):
     for d in ld:
         insert_into_db(con, table, d)
 
 
 def convert_db_to_parquet(con, table, index_col, output_file):
     df = pd.read_sql_query(f"SELECT * FROM {table}", con, index_col)
 
@@ -386,6 +402,8 @@ def convert_db_to_parquet(con, table, index_col, output_file):
         elif column_type == "set":
             df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
         else:
-            raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
+            raise NotImplementedError(
+                f"Column {c}: type {column_type} not implemented yet."
+            )
 
     df.to_parquet(output_file)
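`TagGroup.fix_date()` in the lib diff above normalizes three cases: dates already matching `RE_ISO8601_DATE` keep (or gain) `encoding="iso8601"`, German `dd.mm.yyyy` dates matching `RE_GERMAN_DATE` are rewritten in place, and anything else only triggers a warning. A hedged sketch of the German-date branch (the element construction is illustrative; `TagGroup` is the class from this diff):

    import lxml.etree as ET

    e = ET.fromstring("<dateIssued>31.12.1899</dateIssued>")
    TagGroup(e.tag, [e]).fix_date()
    # Expected, per the regex and the "{}-{}-{}".format(...) call above:
    # e.text == "1899-12-31" and e.attrib["encoding"] == "iso8601"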
@ -17,7 +17,16 @@ from collections.abc import MutableMapping, Sequence
|
||||||
import click
|
import click
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .lib import convert_db_to_parquet, sorted_groupby, TagGroup, ns, flatten, insert_into_db, insert_into_db_multiple, current_columns_types
|
from .lib import (
|
||||||
|
convert_db_to_parquet,
|
||||||
|
sorted_groupby,
|
||||||
|
TagGroup,
|
||||||
|
ns,
|
||||||
|
flatten,
|
||||||
|
insert_into_db,
|
||||||
|
insert_into_db_multiple,
|
||||||
|
current_columns_types,
|
||||||
|
)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
# Filter warnings on WSL
|
# Filter warnings on WSL
|
||||||
|
@ -26,7 +35,8 @@ with warnings.catch_warnings():
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('mods4pandas')
|
logger = logging.getLogger("mods4pandas")
|
||||||
|
|
||||||
|
|
||||||
def mods_to_dict(mods, raise_errors=True):
|
def mods_to_dict(mods, raise_errors=True):
|
||||||
"""Convert MODS metadata to a nested dictionary"""
|
"""Convert MODS metadata to a nested dictionary"""
|
||||||
|
@ -37,179 +47,290 @@ def mods_to_dict(mods, raise_errors=True):
|
||||||
value = {}
|
value = {}
|
||||||
|
|
||||||
# Iterate through each group of tags
|
# Iterate through each group of tags
|
||||||
for tag, group in sorted_groupby(mods, key=attrgetter('tag')):
|
for tag, group in sorted_groupby(mods, key=attrgetter("tag")):
|
||||||
group = list(group)
|
group = list(group)
|
||||||
if tag == '{http://www.loc.gov/mods/v3}location':
|
if tag == "{http://www.loc.gov/mods/v3}location":
|
||||||
|
|
||||||
def only_current_location(location):
|
def only_current_location(location):
|
||||||
return location.get('type') != 'former'
|
return location.get("type") != "former"
|
||||||
value['location'] = TagGroup(tag, group) \
|
|
||||||
.filter(only_current_location) \
|
value["location"] = (
|
||||||
.has_attributes([{}, {'type': 'current'}]) \
|
TagGroup(tag, group)
|
||||||
.is_singleton().descend(raise_errors)
|
.filter(only_current_location)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}physicalLocation':
|
.has_attributes([{}, {"type": "current"}])
|
||||||
|
.is_singleton()
|
||||||
|
.descend(raise_errors)
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}physicalLocation":
|
||||||
|
|
||||||
def no_display_label(physical_location):
|
def no_display_label(physical_location):
|
||||||
return physical_location.get('displayLabel') is None
|
return physical_location.get("displayLabel") is None
|
||||||
value['physicalLocation'] = TagGroup(tag, group).filter(no_display_label).text()
|
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}shelfLocator':
|
value["physicalLocation"] = (
|
||||||
|
TagGroup(tag, group).filter(no_display_label).text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}shelfLocator":
|
||||||
# This element should not be repeated according to MODS-AP 2.3.1, however a few of the files contain
|
# This element should not be repeated according to MODS-AP 2.3.1, however a few of the files contain
|
||||||
# a second element with empty text and a "displayLabel" attribute set.
|
# a second element with empty text and a "displayLabel" attribute set.
|
||||||
def no_display_label(shelf_locator):
|
def no_display_label(shelf_locator):
|
||||||
return shelf_locator.get('displayLabel') is None
|
return shelf_locator.get("displayLabel") is None
|
||||||
value['shelfLocator'] = TagGroup(tag, group) \
|
|
||||||
.filter(no_display_label) \
|
value["shelfLocator"] = (
|
||||||
.force_singleton() \
|
TagGroup(tag, group)
|
||||||
.has_no_attributes() \
|
.filter(no_display_label)
|
||||||
|
.force_singleton()
|
||||||
|
.has_no_attributes()
|
||||||
.text()
|
.text()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}originInfo':
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}originInfo":
|
||||||
|
|
||||||
def has_event_type(origin_info):
|
def has_event_type(origin_info):
|
||||||
# According to MODS-AP 2.3.1, every originInfo should have its eventType set. However, some
|
# According to MODS-AP 2.3.1, every originInfo should have its eventType set. However, some
|
||||||
# are empty and not fixable.
|
# are empty and not fixable.
|
||||||
return origin_info.attrib.get('eventType') is not None
|
return origin_info.attrib.get("eventType") is not None
|
||||||
tag_group = TagGroup(tag, group).fix_event_type().filter(has_event_type, warn="has no eventType")
|
|
||||||
for event_type, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['eventType']):
|
tag_group = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.fix_event_type()
|
||||||
|
.filter(has_event_type, warn="has no eventType")
|
||||||
|
)
|
||||||
|
for event_type, grouped_group in sorted_groupby(
|
||||||
|
tag_group.group, key=lambda g: g.attrib["eventType"]
|
||||||
|
):
|
||||||
for n, e in enumerate(grouped_group):
|
for n, e in enumerate(grouped_group):
|
||||||
value['originInfo-{}{}'.format(event_type, n)] = mods_to_dict(e, raise_errors)
|
value["originInfo-{}{}".format(event_type, n)] = mods_to_dict(
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}place':
|
e, raise_errors
|
||||||
value['place'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().descend(raise_errors)
|
)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}placeTerm':
|
elif tag == "{http://www.loc.gov/mods/v3}place":
|
||||||
value['placeTerm'] = TagGroup(tag, group).is_singleton().has_attributes({'type': 'text'}).text()
|
value["place"] = (
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}dateIssued':
|
TagGroup(tag, group)
|
||||||
value['dateIssued'] = TagGroup(tag, group) \
|
.force_singleton(warn=False)
|
||||||
.fix_date() \
|
.has_no_attributes()
|
||||||
.sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
|
.descend(raise_errors)
|
||||||
.ignore_attributes() \
|
)
|
||||||
.force_singleton() \
|
elif tag == "{http://www.loc.gov/mods/v3}placeTerm":
|
||||||
|
value["placeTerm"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.is_singleton()
|
||||||
|
.has_attributes({"type": "text"})
|
||||||
.text()
|
.text()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}dateCreated':
|
)
|
||||||
value['dateCreated'] = TagGroup(tag, group) \
|
elif tag == "{http://www.loc.gov/mods/v3}dateIssued":
|
||||||
.fix_date() \
|
value["dateIssued"] = (
|
||||||
.sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
|
TagGroup(tag, group)
|
||||||
.ignore_attributes() \
|
.fix_date()
|
||||||
.force_singleton() \
|
.sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
|
||||||
|
.ignore_attributes()
|
||||||
|
.force_singleton()
|
||||||
.text()
|
.text()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}dateCaptured':
|
)
|
||||||
value['dateCaptured'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
|
elif tag == "{http://www.loc.gov/mods/v3}dateCreated":
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}dateOther':
|
value["dateCreated"] = (
|
||||||
value['dateOther'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
|
TagGroup(tag, group)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}publisher':
|
.fix_date()
|
||||||
value['publisher'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().text()
|
.sort(key=lambda d: d.attrib.get("keyDate") == "yes", reverse=True)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}edition':
|
.ignore_attributes()
|
||||||
value['edition'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
|
.force_singleton()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}classification':
|
.text()
|
||||||
authorities = {e.attrib['authority'] for e in group}
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}dateCaptured":
|
||||||
|
value["dateCaptured"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.fix_date()
|
||||||
|
.ignore_attributes()
|
||||||
|
.is_singleton()
|
||||||
|
.text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}dateOther":
|
||||||
|
value["dateOther"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.fix_date()
|
||||||
|
.ignore_attributes()
|
||||||
|
.is_singleton()
|
||||||
|
.text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}publisher":
|
||||||
|
value["publisher"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.force_singleton(warn=False)
|
||||||
|
.has_no_attributes()
|
||||||
|
.text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}edition":
|
||||||
|
value["edition"] = (
|
||||||
|
TagGroup(tag, group).force_singleton().has_no_attributes().text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}classification":
|
||||||
|
authorities = {e.attrib["authority"] for e in group}
|
||||||
for authority in authorities:
|
for authority in authorities:
|
||||||
sub_group = [e for e in group if e.attrib.get('authority') == authority]
|
sub_group = [e for e in group if e.attrib.get("authority") == authority]
|
||||||
value['classification-{}'.format(authority)] = TagGroup(tag, sub_group).text_set()
|
value["classification-{}".format(authority)] = TagGroup(
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}recordInfo':
|
tag, sub_group
|
||||||
value['recordInfo'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
|
).text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}recordIdentifier':
|
elif tag == "{http://www.loc.gov/mods/v3}recordInfo":
|
||||||
|
value["recordInfo"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.is_singleton()
|
||||||
|
.has_no_attributes()
|
||||||
|
.descend(raise_errors)
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}recordIdentifier":
|
||||||
# By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
|
# By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
|
||||||
# however, in mods:relatedItems, there may be source="dnb-ppns",
|
# however, in mods:relatedItems, there may be source="dnb-ppns",
|
||||||
# which we need to distinguish by using a separate field name.
|
# which we need to distinguish by using a separate field name.
|
||||||
try:
|
try:
|
||||||
value['recordIdentifier'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'gbv-ppn'}).text()
|
value["recordIdentifier"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.is_singleton()
|
||||||
|
.has_attributes({"source": "gbv-ppn"})
|
||||||
|
.text()
|
||||||
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
value['recordIdentifier-dnb-ppn'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'dnb-ppn'}).text()
|
value["recordIdentifier-dnb-ppn"] = (
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}identifier':
|
TagGroup(tag, group)
|
||||||
|
.is_singleton()
|
||||||
|
.has_attributes({"source": "dnb-ppn"})
|
||||||
|
.text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}identifier":
|
||||||
for e in group:
|
for e in group:
|
||||||
if len(e.attrib) != 1:
|
if len(e.attrib) != 1:
|
||||||
raise ValueError('Unknown attributes for identifier {}'.format(e.attrib))
|
raise ValueError(
|
||||||
value['identifier-{}'.format(e.attrib['type'])] = e.text
|
"Unknown attributes for identifier {}".format(e.attrib)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}titleInfo':
|
)
|
||||||
|
value["identifier-{}".format(e.attrib["type"])] = e.text
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}titleInfo":
|
||||||
|
|
||||||
def only_standard_title(title_info):
|
def only_standard_title(title_info):
|
||||||
return title_info.attrib.get('type') is None
|
return title_info.attrib.get("type") is None
|
||||||
value['titleInfo'] = TagGroup(tag, group) \
|
|
||||||
.filter(only_standard_title) \
|
value["titleInfo"] = (
|
||||||
.is_singleton().has_no_attributes().descend(raise_errors)
|
TagGroup(tag, group)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}title':
|
.filter(only_standard_title)
|
||||||
value['title'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
.is_singleton()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}partName':
|
.has_no_attributes()
|
||||||
value['partName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
.descend(raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}subTitle':
|
)
|
||||||
value['subTitle'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
|
elif tag == "{http://www.loc.gov/mods/v3}title":
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}note':
|
value["title"] = (
|
||||||
|
TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}partName":
|
||||||
|
value["partName"] = (
|
||||||
|
TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}subTitle":
|
||||||
|
value["subTitle"] = (
|
||||||
|
TagGroup(tag, group).force_singleton().has_no_attributes().text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}note":
|
||||||
# This could be useful if distinguished by type attribute.
|
# This could be useful if distinguished by type attribute.
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}part':
|
elif tag == "{http://www.loc.gov/mods/v3}part":
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}abstract':
|
elif tag == "{http://www.loc.gov/mods/v3}abstract":
|
||||||
value['abstract'] = TagGroup(tag, group).has_no_attributes().text()
|
value["abstract"] = TagGroup(tag, group).has_no_attributes().text()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}subject':
|
elif tag == "{http://www.loc.gov/mods/v3}subject":
|
||||||
authorities = {e.attrib.get('authority') for e in group}
|
authorities = {e.attrib.get("authority") for e in group}
|
||||||
for authority in authorities:
|
for authority in authorities:
|
||||||
k = 'subject-{}'.format(authority) if authority is not None else 'subject'
|
k = (
|
||||||
sub_group = [e for e in group if e.attrib.get('authority') == authority]
|
"subject-{}".format(authority)
|
||||||
value[k] = TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
|
if authority is not None
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}topic':
|
else "subject"
|
||||||
|
)
|
||||||
|
sub_group = [e for e in group if e.attrib.get("authority") == authority]
|
||||||
|
value[k] = (
|
||||||
|
TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}topic":
|
||||||
TagGroup(tag, group).text_set()
|
TagGroup(tag, group).text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}cartographics':
|
elif tag == "{http://www.loc.gov/mods/v3}cartographics":
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}geographic':
|
elif tag == "{http://www.loc.gov/mods/v3}geographic":
|
||||||
TagGroup(tag, group).text_set()
|
TagGroup(tag, group).text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}temporal':
|
elif tag == "{http://www.loc.gov/mods/v3}temporal":
|
||||||
TagGroup(tag, group).text_set()
|
TagGroup(tag, group).text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}genre':
|
elif tag == "{http://www.loc.gov/mods/v3}genre":
|
||||||
authorities = {e.attrib.get('authority') for e in group}
|
authorities = {e.attrib.get("authority") for e in group}
|
||||||
for authority in authorities:
|
for authority in authorities:
|
||||||
k = 'genre-{}'.format(authority) if authority is not None else 'genre'
|
k = "genre-{}".format(authority) if authority is not None else "genre"
|
||||||
value[k] = {e.text for e in group if e.attrib.get('authority') == authority}
|
value[k] = {
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}language':
|
e.text for e in group if e.attrib.get("authority") == authority
|
||||||
value["language"] = TagGroup(tag, group) \
|
}
|
||||||
.merge_sub_tags_to_set()
|
elif tag == "{http://www.loc.gov/mods/v3}language":
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}languageTerm':
|
value["language"] = TagGroup(tag, group).merge_sub_tags_to_set()
|
||||||
value['languageTerm'] = TagGroup(tag, group) \
|
elif tag == "{http://www.loc.gov/mods/v3}languageTerm":
|
||||||
.has_attributes({'authority': 'iso639-2b', 'type': 'code'}) \
|
value["languageTerm"] = (
|
||||||
|
TagGroup(tag, group)
|
||||||
|
.has_attributes({"authority": "iso639-2b", "type": "code"})
|
||||||
.text_set()
|
.text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}scriptTerm':
|
)
|
||||||
value['scriptTerm'] = TagGroup(tag, group) \
|
elif tag == "{http://www.loc.gov/mods/v3}scriptTerm":
|
||||||
.fix_script_term() \
|
value["scriptTerm"] = (
|
||||||
.has_attributes({'authority': 'iso15924', 'type': 'code'}) \
|
TagGroup(tag, group)
|
||||||
|
.fix_script_term()
|
||||||
|
.has_attributes({"authority": "iso15924", "type": "code"})
|
||||||
.text_set()
|
.text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}relatedItem':
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}relatedItem":
|
||||||
tag_group = TagGroup(tag, group)
|
tag_group = TagGroup(tag, group)
|
||||||
for type_, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['type']):
|
for type_, grouped_group in sorted_groupby(
|
||||||
sub_tag = 'relatedItem-{}'.format(type_)
|
tag_group.group, key=lambda g: g.attrib["type"]
|
||||||
|
):
|
||||||
|
sub_tag = "relatedItem-{}".format(type_)
|
||||||
grouped_group = list(grouped_group)
|
grouped_group = list(grouped_group)
|
||||||
if type_ in ["original", "host"]:
|
if type_ in ["original", "host"]:
|
||||||
value[sub_tag] = TagGroup(sub_tag, grouped_group).is_singleton().descend(raise_errors)
|
value[sub_tag] = (
|
||||||
|
TagGroup(sub_tag, grouped_group)
|
||||||
|
.is_singleton()
|
||||||
|
.descend(raise_errors)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# TODO type="series"
|
# TODO type="series"
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}name':
|
elif tag == "{http://www.loc.gov/mods/v3}name":
|
||||||
for n, e in enumerate(group):
|
for n, e in enumerate(group):
|
||||||
value['name{}'.format(n)] = mods_to_dict(e, raise_errors)
|
value["name{}".format(n)] = mods_to_dict(e, raise_errors)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}role':
|
elif tag == "{http://www.loc.gov/mods/v3}role":
|
||||||
value["role"] = TagGroup(tag, group) \
|
value["role"] = (
|
||||||
.has_no_attributes() \
|
TagGroup(tag, group).has_no_attributes().merge_sub_tags_to_set()
|
||||||
.merge_sub_tags_to_set()
|
)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}roleTerm':
|
elif tag == "{http://www.loc.gov/mods/v3}roleTerm":
|
||||||
value['roleTerm'] = TagGroup(tag, group) \
|
value["roleTerm"] = (
|
||||||
.has_attributes({'authority': 'marcrelator', 'type': 'code'}) \
|
TagGroup(tag, group)
|
||||||
|
.has_attributes({"authority": "marcrelator", "type": "code"})
|
||||||
.text_set()
|
.text_set()
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}namePart':
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}namePart":
|
||||||
for e in group:
|
for e in group:
|
||||||
if not e.attrib.get('type'):
|
if not e.attrib.get("type"):
|
||||||
value['namePart'] = e.text
|
value["namePart"] = e.text
|
||||||
else:
|
else:
|
||||||
value['namePart-{}'.format(e.attrib['type'])] = e.text
|
value["namePart-{}".format(e.attrib["type"])] = e.text
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}nameIdentifier':
|
elif tag == "{http://www.loc.gov/mods/v3}nameIdentifier":
|
||||||
# TODO Use this (e.g. <mods:nameIdentifier type="ppn">106168096</mods:nameIdentifier>) or the
|
# TODO Use this (e.g. <mods:nameIdentifier type="ppn">106168096</mods:nameIdentifier>) or the
|
||||||
# mods:name@valueURI to disambiguate
|
# mods:name@valueURI to disambiguate
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}displayForm':
|
elif tag == "{http://www.loc.gov/mods/v3}displayForm":
|
||||||
value['displayForm'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value["displayForm"] = (
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}physicalDescription':
|
TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}physicalDescription":
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}extension':
|
elif tag == "{http://www.loc.gov/mods/v3}extension":
|
||||||
pass
|
pass
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}accessCondition':
|
elif tag == "{http://www.loc.gov/mods/v3}accessCondition":
|
||||||
for e in group:
|
for e in group:
|
||||||
if not e.attrib.get('type'):
|
if not e.attrib.get("type"):
|
||||||
raise ValueError('Unknown attributes for accessCondition {}'.format(e.attrib))
|
raise ValueError(
|
||||||
value['accessCondition-{}'.format(e.attrib['type'])] = e.text
|
"Unknown attributes for accessCondition {}".format(e.attrib)
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}typeOfResource':
|
)
|
||||||
value['typeOfResource'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
value["accessCondition-{}".format(e.attrib["type"])] = e.text
|
||||||
elif tag == '{http://www.loc.gov/mods/v3}mods':
|
elif tag == "{http://www.loc.gov/mods/v3}typeOfResource":
|
||||||
|
value["typeOfResource"] = (
|
||||||
|
TagGroup(tag, group).is_singleton().has_no_attributes().text()
|
||||||
|
)
|
||||||
|
elif tag == "{http://www.loc.gov/mods/v3}mods":
|
||||||
# XXX Ignore nested mods:mods for now (used in mods:subject)
|
# XXX Ignore nested mods:mods for now (used in mods:subject)
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -230,30 +351,29 @@ def mets_to_dict(mets, raise_errors=True):
|
||||||
value = {}
|
value = {}
|
||||||
|
|
||||||
# Iterate through each group of tags
|
# Iterate through each group of tags
|
||||||
for tag, group in sorted_groupby(mets, key=attrgetter('tag')):
|
for tag, group in sorted_groupby(mets, key=attrgetter("tag")):
|
||||||
group = list(group)
|
group = list(group)
|
||||||
|
|
||||||
# XXX Namespaces seem to use a trailing / sometimes, sometimes not.
|
# XXX Namespaces seem to use a trailing / sometimes, sometimes not.
|
||||||
# (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
|
# (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
|
||||||
if tag == '{http://www.loc.gov/METS/}amdSec':
|
if tag == "{http://www.loc.gov/METS/}amdSec":
|
||||||
pass # TODO
|
pass # TODO
|
||||||
elif tag == '{http://www.loc.gov/METS/}dmdSec':
|
elif tag == "{http://www.loc.gov/METS/}dmdSec":
|
||||||
pass # TODO
|
pass # TODO
|
||||||
elif tag == '{http://www.loc.gov/METS/}metsHdr':
|
elif tag == "{http://www.loc.gov/METS/}metsHdr":
|
||||||
pass # TODO
|
pass # TODO
|
||||||
elif tag == '{http://www.loc.gov/METS/}structLink':
|
elif tag == "{http://www.loc.gov/METS/}structLink":
|
||||||
pass # TODO
|
pass # TODO
|
||||||
elif tag == '{http://www.loc.gov/METS/}structMap':
|
elif tag == "{http://www.loc.gov/METS/}structMap":
|
||||||
pass # TODO
|
pass # TODO
|
||||||
elif tag == '{http://www.loc.gov/METS/}fileSec':
|
elif tag == "{http://www.loc.gov/METS/}fileSec":
|
||||||
value['fileSec'] = TagGroup(tag, group) \
|
value["fileSec"] = TagGroup(tag, group).is_singleton().descend(raise_errors)
|
||||||
.is_singleton().descend(raise_errors)
|
elif tag == "{http://www.loc.gov/METS/}fileGrp":
|
||||||
elif tag == '{http://www.loc.gov/METS/}fileGrp':
|
|
||||||
for e in group:
|
for e in group:
|
||||||
use = e.attrib.get('USE')
|
use = e.attrib.get("USE")
|
||||||
if not use:
|
if not use:
|
||||||
raise ValueError('No USE attribute for fileGrp {}'.format(e))
|
raise ValueError("No USE attribute for fileGrp {}".format(e))
|
||||||
value[f'fileGrp-{use}-count'] = len(e)
|
value[f"fileGrp-{use}-count"] = len(e)
|
||||||
else:
|
else:
|
||||||
if raise_errors:
|
if raise_errors:
|
||||||
print(value)
|
print(value)
|
||||||
|
@ -262,6 +382,7 @@ def mets_to_dict(mets, raise_errors=True):
|
||||||
pass
|
pass
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
# TODO replace asserts by ValueError
|
# TODO replace asserts by ValueError
|
||||||
|
|
||||||
|
@ -269,23 +390,36 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
|
|
||||||
# PPN
|
# PPN
|
||||||
def get_mets_recordIdentifier(*, source="gbv-ppn"):
|
def get_mets_recordIdentifier(*, source="gbv-ppn"):
|
||||||
return (mets.xpath(f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
|
return (
|
||||||
namespaces=ns) or [None])[0].text
|
mets.xpath(
|
||||||
|
f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
|
||||||
|
namespaces=ns,
|
||||||
|
)
|
||||||
|
or [None]
|
||||||
|
)[0].text
|
||||||
|
|
||||||
ppn = get_mets_recordIdentifier()
|
ppn = get_mets_recordIdentifier()
|
||||||

    # Getting per-page/structure information is a bit different
    structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
    structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
    fileSec = mets.find('./mets:fileSec', ns)
    fileSec = mets.find("./mets:fileSec", ns)
    if structMap_PHYSICAL is None:
        # This is expected in a multivolume work or periodical!
        if any(
            structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
            for t in ["multivolume_work", "MultivolumeWork", "multivolume_manuscript", "periodical"]
            for t in [
                "multivolume_work",
                "MultivolumeWork",
                "multivolume_manuscript",
                "periodical",
            ]
        ):
            return []
        else:
            raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)")
            raise ValueError(
                "No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)"
            )
    if structMap_LOGICAL is None:
        raise ValueError("No structMap[@TYPE='LOGICAL'] found")
    if fileSec is None:

@@ -294,13 +428,14 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
    div_physSequence = structMap_PHYSICAL[0]
    assert div_physSequence.attrib.get("TYPE") == "physSequence"


    # Build a look-up table to get mets:file by @ID
    # This cuts retrieving the mets:file down to half the time.
    mets_file_by_ID = {}

    def _init_mets_file_by_ID():
        for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns):
        for f in fileSec.iterfind("./mets:fileGrp/mets:file", ns):
            mets_file_by_ID[f.attrib.get("ID")] = f

    _init_mets_file_by_ID()
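
The look-up table above is a one-time index: instead of scanning the fileSec with a fresh XPath query for every page's FILEID, each mets:file is found by a constant-time dict access. The pattern in isolation, with a made-up ID:

# Build the index once over all mets:file elements ...
index = {f.attrib.get("ID"): f for f in fileSec.iterfind("./mets:fileGrp/mets:file", ns)}
# ... then each per-page resolution is a plain dict look-up
file_ = index.get("FILE_0001_PRESENTATION")  # hypothetical @ID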

    def get_mets_file(*, ID):

@@ -312,7 +447,6 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
        return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)

    for page in div_physSequence:

        # TODO sort by ORDER?
        assert page.attrib.get("TYPE") == "page"
        page_dict = {}

@@ -326,7 +460,9 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
            file_ = get_mets_file(ID=file_id)
            assert file_ is not None
            fileGrp_USE = file_.getparent().attrib.get("USE")
            file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
            file_FLocat_href = (
                file_.xpath("mets:FLocat/@xlink:href", namespaces=ns) or [None]
            )[0]
            if file_FLocat_href is not None:
                file_FLocat_href = str(file_FLocat_href)
            page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href

@@ -343,7 +479,7 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
        # it suffices to do this the old-fashioned way.

        sm_links = mets.findall(
            f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
        )

        targets = []

@@ -378,10 +514,19 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:


@click.command()
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
@click.argument("mets_files", type=click.Path(exists=True), required=True, nargs=-1)
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
              default='mods_info_df.parquet', show_default=True)
@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
@click.option(
    "--output",
    "-o",
    "output_file",
    type=click.Path(),
    help="Output Parquet file",
    default="mods_info_df.parquet",
    show_default=True,
)
@click.option(
    "--output-page-info", type=click.Path(), help="Output page info Parquet file"
)
def process_command(mets_files: list[str], output_file: str, output_page_info: str):
    """
    A tool to convert the MODS metadata in INPUT to a pandas DataFrame.

@@ -395,18 +540,21 @@ def process_command(mets_files: list[str], output_file: str, output_page_info: str):
    """
    process(mets_files, output_file, output_page_info)
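
Since the decorator change above is purely cosmetic, the command still takes one or more METS files (or directories) plus the two output options. A quick way to exercise it without a shell, sketched with click's bundled test runner (the file names are placeholders; click.Path(exists=True) expects a real file):

from click.testing import CliRunner

runner = CliRunner()
result = runner.invoke(process_command, ["mets.xml", "-o", "mods_info_df.parquet"])
print(result.output)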


def process(mets_files: list[str], output_file: str, output_page_info: str):
    # Extend file list if directories are given
    mets_files_real: list[str] = []
    for m in mets_files:
        if os.path.isdir(m):
            logger.info('Scanning directory {}'.format(m))
            logger.info("Scanning directory {}".format(m))
            mets_files_real.extend(f.path for f in tqdm(os.scandir(m), leave=False)
                                   if f.is_file() and not f.name.startswith('.'))
            mets_files_real.extend(
                f.path
                for f in tqdm(os.scandir(m), leave=False)
                if f.is_file() and not f.name.startswith(".")
            )
        else:
            mets_files_real.append(m)


    # Prepare output files
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file)

@@ -414,28 +562,28 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
    with contextlib.suppress(FileNotFoundError):
        os.remove(output_file_sqlite3)

    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
    logger.info("Writing SQLite DB to {}".format(output_file_sqlite3))
    con = sqlite3.connect(output_file_sqlite3)

    if output_page_info:
        output_page_info_sqlite3 = output_page_info + ".sqlite3"
        logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3))
        logger.info("Writing SQLite DB to {}".format(output_page_info_sqlite3))
        with contextlib.suppress(FileNotFoundError):
            os.remove(output_page_info_sqlite3)
        con_page_info = sqlite3.connect(output_page_info_sqlite3)

    # Process METS files
    with open(output_file + '.warnings.csv', 'w') as csvfile:
    with open(output_file + ".warnings.csv", "w") as csvfile:
        csvwriter = csv.writer(csvfile)
        logger.info('Processing METS files')
        logger.info("Processing METS files")
        for mets_file in tqdm(mets_files_real, leave=True):
            try:
                root = ET.parse(mets_file).getroot()
                mets = root  # XXX .find('mets:mets', ns) does not work here
                mods = root.find('mets:dmdSec//mods:mods', ns)
                mods = root.find("mets:dmdSec//mods:mods", ns)

                with warnings.catch_warnings(record=True) as caught_warnings:
                    warnings.simplefilter('always')  # do NOT filter double occurrences
                    warnings.simplefilter("always")  # do NOT filter double occurrences

                    # MODS
                    d = flatten(mods_to_dict(mods, raise_errors=True))

@@ -445,7 +593,7 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                    for k, v in d_mets.items():
                        d[f"mets_{k}"] = v
                    # "meta"
                    d['mets_file'] = mets_file
                    d["mets_file"] = mets_file

                    # Save
                    insert_into_db(con, "mods_info", d)

@@ -453,8 +601,12 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):

                    # METS - per-page
                    if output_page_info:
                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
                        page_info_doc: list[dict] = pages_to_dict(
                            mets, raise_errors=True
                        )
                        insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
                        insert_into_db_multiple(
                            con_page_info, "page_info", page_info_doc
                        )
                        con_page_info.commit()

                if caught_warnings:

@@ -463,13 +615,15 @@ def process(mets_files: list[str], output_file: str, output_page_info: str):
                    for caught_warning in caught_warnings:
                        csvwriter.writerow([mets_file, caught_warning.message])
            except Exception as e:
                logger.exception('Exception in {}'.format(mets_file))
                logger.exception("Exception in {}".format(mets_file))

    logger.info('Writing DataFrame to {}'.format(output_file))
    logger.info("Writing DataFrame to {}".format(output_file))
    convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
    if output_page_info:
        logger.info('Writing DataFrame to {}'.format(output_page_info))
        logger.info("Writing DataFrame to {}".format(output_page_info))
        convert_db_to_parquet(con_page_info, "page_info", ["ppn", "ID"], output_page_info)
        convert_db_to_parquet(
            con_page_info, "page_info", ["ppn", "ID"], output_page_info
        )


def main():

@@ -481,5 +635,5 @@ def main():
    process_command()


if __name__ == '__main__':
if __name__ == "__main__":
    main()
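
End to end, the flow above is: rows accumulate in a scratch SQLite database (one insert per METS file), and only the final step converts each table to Parquet, so consumers only ever see the Parquet files. Reading a result back is then ordinary pandas, e.g. with the CLI's default output name:

import pandas as pd

mods_info = pd.read_parquet("mods_info_df.parquet")
print(mods_info.columns)
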
@@ -9,14 +9,17 @@ from mods4pandas.lib import flatten

TESTS_DATA_DIR = Path(__file__).parent / "data"


def dict_fromstring(x):
    return flatten(alto_to_dict(ET.fromstring(x)))


def test_Page_counts():
    """
    Elements below Layout/Page should be counted
    """
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
  <Layout>
    <Page ID="Page1" PHYSICAL_IMG_NR="1">

@@ -37,13 +40,16 @@ def test_Page_counts():
    </Page>
  </Layout>
</alto>
""")
"""
    )
    assert d['Layout_Page_TextBlock-count'] == 1
    assert d["Layout_Page_TextBlock-count"] == 1
    assert d['Layout_Page_TextLine-count'] == 3
    assert d["Layout_Page_TextLine-count"] == 3
    assert d['Layout_Page_String-count'] == 6
    assert d["Layout_Page_String-count"] == 6


def test_Tags_counts():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
  <Tags>
    <NamedEntityTag ID="PER0" LABEL="Pentlings"/>

@@ -57,11 +63,14 @@ def test_Tags_counts():
    <NamedEntityTag ID="PER10" LABEL="Jhesu Christi"/>
  </Tags>
</alto>
""")
"""
    )
    assert d['Tags_NamedEntityTag-count'] == 9
    assert d["Tags_NamedEntityTag-count"] == 9


def test_String_TAGREF_counts():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
  <Layout>
    <Page>

@@ -80,9 +89,10 @@ def test_String_TAGREF_counts():
    </Page>
  </Layout>
</alto>
""")
"""
    )
    assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
    assert d["Layout_Page_//alto:String[@TAGREFS]-count"] == 3
    assert d['Layout_Page_String-count'] == 4
    assert d["Layout_Page_String-count"] == 4


def test_dtypes(tmp_path):

@@ -100,9 +110,9 @@ def test_dtypes(tmp_path):
        r"Layout_Page_//alto:String/@WC-.*": ("Float64", None),
        r".*-count": ("Int64", None),
        r"alto_xmlns": ("object", ["str", "NoneType"]),

        r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None),
    }

    def expected_types(c):
        """Return the expected types for column c."""
        for r, types in EXPECTED_TYPES.items():

@@ -126,7 +136,8 @@ def test_dtypes(tmp_path):

        if edt == "object":
            inner_types = set(type(v).__name__ for v in df[c])
            assert all(it in einner_types for it in inner_types), \
                f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"
            assert all(
                it in einner_types for it in inner_types
            ), f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"

    check_types(alto_info_df)
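
These assertions lean on the key convention used throughout: alto_to_dict yields nested dicts and flatten joins the nesting path with underscores, so a TextBlock tally under Layout/Page surfaces as "Layout_Page_TextBlock-count". A toy stand-in for that behavior (not the real helper from mods4pandas.lib):

def flatten_keys(d, prefix=""):
    # Join nested dict keys with "_" to mimic the flattened column names above
    out = {}
    for k, v in d.items():
        if isinstance(v, dict):
            out.update(flatten_keys(v, prefix + k + "_"))
        else:
            out[prefix + k] = v
    return out

assert flatten_keys({"Layout": {"Page": {"TextBlock-count": 1}}}) == {"Layout_Page_TextBlock-count": 1}
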
@@ -6,15 +6,17 @@ from mods4pandas.lib import flatten


def dict_fromstring(x):
    """Helper function to parse a METS/MODS XML string to a flattened dict"""
    return flatten(mets_to_dict(ET.fromstring(x)))
    # XXX move to test lib


def test_fileGrp():
    """
    Elements of mets:fileGrp should be counted
    """
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mets:mets xmlns:mets="http://www.loc.gov/METS/">

  <mets:fileSec>

@@ -31,5 +33,6 @@ def test_fileGrp():
    </mets:fileGrp>
  </mets:fileSec>
</mets:mets>
""")
"""
    )
    assert d['fileSec_fileGrp-PRESENTATION-count'] == 3
    assert d["fileSec_fileGrp-PRESENTATION-count"] == 3


@@ -10,36 +10,45 @@ from mods4pandas.lib import flatten

TESTS_DATA_DIR = Path(__file__).parent / "data"


def dict_fromstring(x):
    """Helper function to parse a MODS XML string to a flattened dict"""
    return flatten(mods_to_dict(ET.fromstring(x)))


def test_single_language_languageTerm():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:language>
    <mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm>
    <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
  </mods:language>
</mods:mods>
""")
"""
    )
    assert d['language_languageTerm'] == {'ger', 'lat'}
    assert d["language_languageTerm"] == {"ger", "lat"}


def test_multitple_language_languageTerm():
    """
    Different languages MAY have multiple mods:language elements.
    See MODS-AP 2.3.1
    """
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:language><mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm></mods:language>
  <mods:language><mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm></mods:language>
</mods:mods>
""")
"""
    )
    assert d['language_languageTerm'] == {'ger', 'lat'}
    assert d["language_languageTerm"] == {"ger", "lat"}


def test_role_roleTerm():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:name type="personal" valueURI="http://d-nb.info/gnd/117357669">
    <mods:displayForm>Wurm, Mary</mods:displayForm>

@@ -51,14 +60,17 @@ def test_role_roleTerm():
    </mods:role>
  </mods:name>
</mods:mods>
""")
"""
    )
    assert d['name0_role_roleTerm'] == {'cmp'}
    assert d["name0_role_roleTerm"] == {"cmp"}


def test_multiple_role_roleTerm():
    """
    Multiple mods:role/mods:roleTerm should be merged into one column.
    """
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:name type="personal" valueURI="http://d-nb.info/gnd/117357669">
    <mods:displayForm>Wurm, Mary</mods:displayForm>

@@ -73,8 +85,10 @@ def test_multiple_role_roleTerm():
    </mods:role>
  </mods:name>
</mods:mods>
""")
"""
    )
    assert d['name0_role_roleTerm'] == {'cmp', 'aut'}
    assert d["name0_role_roleTerm"] == {"cmp", "aut"}


def test_scriptTerm():
    """

@@ -82,7 +96,8 @@ def test_scriptTerm():
    See MODS-AP 2.3.1.
    """
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:language>
    <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>

@@ -94,44 +109,59 @@ def test_scriptTerm():
    <mods:scriptTerm authority="iso15924" type="code">216</mods:scriptTerm>
  </mods:language>
</mods:mods>
""")
"""
    )
    assert d['language_scriptTerm'] == {'215', '216', '217'}
    assert d["language_scriptTerm"] == {"215", "216", "217"}


def test_recordInfo():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:recordInfo>
    <mods:recordIdentifier source="gbv-ppn">PPN610714341</mods:recordIdentifier>
  </mods:recordInfo>
</mods:mods>
""")
"""
    )
    assert d['recordInfo_recordIdentifier'] == 'PPN610714341'
    assert d["recordInfo_recordIdentifier"] == "PPN610714341"


def test_accessCondition():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:accessCondition type="use and reproduction">UNKNOWN</mods:accessCondition>
</mods:mods>
""")
"""
    )
    assert d['accessCondition-use and reproduction'] == 'UNKNOWN'
    assert d["accessCondition-use and reproduction"] == "UNKNOWN"


def test_originInfo_no_event_type():
    with pytest.warns(UserWarning) as ws:
        d = dict_fromstring("""
        d = dict_fromstring(
            """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:originInfo>
    <mods:place><mods:placeTerm type="text">Berlin</mods:placeTerm></mods:place>
  </mods:originInfo>
</mods:mods>
""")
"""
        )

    assert d == {}  # empty

    assert len(ws) == 1
    assert ws[0].message.args[0] == 'Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)'
    assert (
        ws[0].message.args[0]
        == "Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)"
    )


def test_relatedItem():
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:relatedItem type="original">
    <mods:recordInfo>

@@ -139,12 +169,14 @@ def test_relatedItem():
    </mods:recordInfo>
  </mods:relatedItem>
</mods:mods>
""")
"""
    )

    assert d['relatedItem-original_recordInfo_recordIdentifier'] == 'PPN167755803'
    assert d["relatedItem-original_recordInfo_recordIdentifier"] == "PPN167755803"

    # mods:relatedItem may also have source="dnb-ppn" recordIdentifiers:
    d = dict_fromstring("""
    d = dict_fromstring(
        """
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
  <mods:relatedItem type="original">
    <mods:recordInfo>

@@ -152,12 +184,16 @@ def test_relatedItem():
    </mods:recordInfo>
  </mods:relatedItem>
</mods:mods>
""")
"""
    )

    assert d['relatedItem-original_recordInfo_recordIdentifier-dnb-ppn'] == '1236513355'
    assert d["relatedItem-original_recordInfo_recordIdentifier-dnb-ppn"] == "1236513355"


def test_dtypes(tmp_path):
    mets_files = [p.absolute().as_posix() for p in (TESTS_DATA_DIR / "mets-mods").glob("*.xml")]
    mets_files = [
        p.absolute().as_posix() for p in (TESTS_DATA_DIR / "mets-mods").glob("*.xml")
    ]
    mods_info_df_parquet = (tmp_path / "test_dtypes_mods_info.parquet").as_posix()
    page_info_df_parquet = (tmp_path / "test_dtypes_page_info.parquet").as_posix()
    process(mets_files, mods_info_df_parquet, page_info_df_parquet)

@@ -166,7 +202,6 @@ def test_dtypes(tmp_path):

    EXPECTED_TYPES = {
        # mods_info

        r"mets_file": ("object", ["str"]),
        r"titleInfo_title": ("object", ["str"]),
        r"titleInfo_subTitle": ("object", ["str", "NoneType"]),

@@ -179,19 +214,16 @@ def test_dtypes(tmp_path):
        r"typeOfResource": ("object", ["str", "NoneType"]),
        r"accessCondition-.*": ("object", ["str", "NoneType"]),
        r"originInfo-.*": ("object", ["str", "NoneType"]),

        r".*-count": ("Int64", None),

        r"genre-.*": ("object", ["ndarray", "NoneType"]),
        r"subject-.*": ("object", ["ndarray", "NoneType"]),
        r"language_.*Term": ("object", ["ndarray", "NoneType"]),
        r"classification-.*": ("object", ["ndarray", "NoneType"]),

        # page_info

        r"fileGrp_.*_file_FLocat_href": ("object", ["str", "NoneType"]),
        r"structMap-LOGICAL_TYPE_.*": ("boolean", None),
    }

    def expected_types(c):
        """Return the expected types for column c."""
        for r, types in EXPECTED_TYPES.items():

@@ -215,8 +247,9 @@ def test_dtypes(tmp_path):

        if edt == "object":
            inner_types = set(type(v).__name__ for v in df[c])
            assert all(it in einner_types for it in inner_types), \
                f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"
            assert all(
                it in einner_types for it in inner_types
            ), f"Unexpected inner types {inner_types} for column {c} (expected {einner_types})"

    check_types(mods_info_df)
    check_types(page_info_df)
@@ -10,8 +10,8 @@ TESTS_DATA_DIR = Path(__file__).parent / "data"


def removeprefix(s, prefix):
    if sys.version_info < (3,9):
    if sys.version_info < (3, 9):
        return s[len(prefix):] if s.startswith(prefix) else s
        return s[len(prefix) :] if s.startswith(prefix) else s
    else:
        return s.removeprefix(prefix)
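
This shim exists because str.removeprefix only arrived in Python 3.9; both branches behave the same. For instance:

# Strips a matching prefix, leaves other strings untouched
assert removeprefix("structMap-LOGICAL_TYPE_title_page", "structMap-LOGICAL_TYPE_") == "title_page"
assert removeprefix("ppn", "structMap-LOGICAL_TYPE_") == "ppn"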

@@ -26,20 +26,32 @@ def test_page_info():
    assert all(p["ppn"] == "PPN821507109" for p in page_info)

    # Look closer at an interesting page
    from pprint import pprint; pprint(page_info[0])
    from pprint import pprint

    pprint(page_info[0])
    page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005")

    assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"
    assert (
        page_info_page["fileGrp_PRESENTATION_file_FLocat_href"]
        == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"
    )

    # This is a title page with an illustration, check that we correctly got this info from the
    # structMap.
    struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1)
    struct_types = sorted(
        removeprefix(k, "structMap-LOGICAL_TYPE_")
        for k, v in page_info_page.items()
        if k.startswith("structMap-LOGICAL_TYPE_") and v == 1
    )
    assert struct_types == ["illustration", "monograph", "title_page"]


def test_page_info_multivolume_work():
    """Test creation of page_info for multivolume_work"""
    mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml")
    mets = ET.parse(
        TESTS_DATA_DIR
        / "mets-mods"
        / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml"
    )
    page_info = pages_to_dict(mets)
    assert page_info == []