Include METS fileGrp counts

master
Gerber, Mike 3 years ago
parent 2399699990
commit 73333ea2e2

@ -4,7 +4,7 @@ import logging
import os import os
import re import re
import warnings import warnings
import xml.etree.ElementTree as ET from lxml import etree as ET
from itertools import groupby from itertools import groupby
from operator import attrgetter from operator import attrgetter
from typing import List from typing import List
@ -66,7 +66,7 @@ class TagGroup:
return {e.text for e in self.group} return {e.text for e in self.group}
def descend(self, raise_errors): def descend(self, raise_errors):
return mods_to_dict(self.is_singleton().group[0], raise_errors) return _to_dict(self.is_singleton().group[0], raise_errors)
def filter(self, cond, warn=None): def filter(self, cond, warn=None):
new_group = [] new_group = []
@ -175,6 +175,15 @@ def sorted_groupby(iterable, key=None):
""" """
return groupby(sorted(iterable, key=key), key=key) return groupby(sorted(iterable, key=key), key=key)
def _to_dict(root, raise_errors):
    """Convert an XML element to a nested dictionary, dispatching on namespace.

    The namespace of ``root``'s tag selects the converter: elements in the
    MODS namespace are handed to ``mods_to_dict`` and elements in the METS
    namespace to ``mets_to_dict``. ``raise_errors`` is passed through to the
    selected converter unchanged.

    Raises:
        ValueError: If the element's namespace is neither MODS nor METS.
    """
    # Resolve the namespace once; every branch below compares against it.
    namespace = ET.QName(root.tag).namespace

    if namespace == "http://www.loc.gov/mods/v3":
        return mods_to_dict(root, raise_errors)
    if namespace == "http://www.loc.gov/METS/":
        return mets_to_dict(root, raise_errors)

    raise ValueError(f"Unknown namespace {namespace}")
def mods_to_dict(mods, raise_errors=True): def mods_to_dict(mods, raise_errors=True):
"""Convert MODS metadata to a nested dictionary""" """Convert MODS metadata to a nested dictionary"""
@ -355,6 +364,48 @@ def mods_to_dict(mods, raise_errors=True):
elif tag == '{http://www.loc.gov/mods/v3}mods': elif tag == '{http://www.loc.gov/mods/v3}mods':
# XXX Ignore nested mods:mods for now (used in mods:subject) # XXX Ignore nested mods:mods for now (used in mods:subject)
pass pass
else:
if raise_errors:
raise ValueError('Unknown tag "{}"'.format(tag))
else:
pass
return value
def mets_to_dict(mets, raise_errors=True):
"""Convert METS metadata to a nested dictionary"""
# The approach taken here is to handle each element explicitly. This also means that ignored elements are ignored
# explicitly.
value = {}
# Iterate through each group of tags
for tag, group in sorted_groupby(mets, key=attrgetter('tag')):
group = list(group)
# XXX Namespaces seem to use a trailing / sometimes, sometimes not.
# (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
if tag == '{http://www.loc.gov/METS/}amdSec':
pass # TODO
elif tag == '{http://www.loc.gov/METS/}dmdSec':
pass # TODO
elif tag == '{http://www.loc.gov/METS/}metsHdr':
pass # TODO
elif tag == '{http://www.loc.gov/METS/}structLink':
pass # TODO
elif tag == '{http://www.loc.gov/METS/}structMap':
pass # TODO
elif tag == '{http://www.loc.gov/METS/}fileSec':
value['fileSec'] = TagGroup(tag, group) \
.is_singleton().descend(raise_errors)
elif tag == '{http://www.loc.gov/METS/}fileGrp':
for e in group:
use = e.attrib.get('USE')
if not use:
raise ValueError('No USE attribute for fileGrp {}'.format(e))
value[f'fileGrp-{use}-count'] = len(e)
else: else:
if raise_errors: if raise_errors:
print(value) print(value)
@ -418,13 +469,22 @@ def process(mets_files: List[str], output_file: str):
logging.info('Processing METS files') logging.info('Processing METS files')
for mets_file in tqdm(mets_files_real): for mets_file in tqdm(mets_files_real):
try: try:
dmd_sec = ET.parse(mets_file).getroot().find('mets:dmdSec', ns) root = ET.parse(mets_file).getroot()
mods = dmd_sec.find('.//mods:mods', ns) mets = root # XXX .find('mets:mets', ns) does not work here
mods = root.find('mets:dmdSec//mods:mods', ns)
with warnings.catch_warnings(record=True) as caught_warnings: with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter('always') # do NOT filter double occurrences warnings.simplefilter('always') # do NOT filter double occurrences
# MODS
d = flatten(mods_to_dict(mods, raise_errors=True)) d = flatten(mods_to_dict(mods, raise_errors=True))
# METS
d_mets = flatten(mets_to_dict(mets, raise_errors=True))
for k, v in d_mets.items():
d[f"mets_{k}"] = v
# "meta"
d['mets_file'] = mets_file d['mets_file'] = mets_file
mods_info.append(d) mods_info.append(d)
if caught_warnings: if caught_warnings:
@ -434,6 +494,7 @@ def process(mets_files: List[str], output_file: str):
csvwriter.writerow([mets_file, caught_warning.message]) csvwriter.writerow([mets_file, caught_warning.message])
except Exception as e: except Exception as e:
warnings.warn('Exception in {}:\n{}'.format(mets_file, e)) warnings.warn('Exception in {}:\n{}'.format(mets_file, e))
import traceback; traceback.print_exc()
# Convert the mods_info List[Dict] to a pandas DataFrame # Convert the mods_info List[Dict] to a pandas DataFrame
columns = [] columns = []

@ -1,3 +1,4 @@
click click
pandas pandas
tqdm tqdm
lxml

Loading…
Cancel
Save