@@ -3,7 +3,9 @@ import csv
import logging
import os
import re
import sqlite3
import warnings
import sys
from lxml import etree as ET
from itertools import groupby
from operator import attrgetter

@@ -394,7 +396,45 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
        else:
            mets_files_real.append(m)

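    # Columns of the mods_info SQLite table created so far; the schema grows as new keys appear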
    current_columns = []

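    # Dict keys are used as SQLite column names, so restrict them to characters that are safe to quote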
    def valid_column_key(k):
        if re.match("^[a-zA-Z0-9 _-]+$", k):
            return True
        else:
            return False

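    # Insert one record (a flat dict) as a row into the mods_info table, creating the table
    # and adding missing columns on the fly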
    def insert_into_db(con, d: Dict):
        # Create table if necessary
        if not current_columns:
            for k in d.keys():
                assert valid_column_key(k), f'"{k}" is not a valid column name'
                current_columns.append(k)
            columns_sql = ",".join('"' + c + '"' for c in current_columns)
            con.execute(f"CREATE TABLE mods_info({columns_sql})")

        # Add columns if necessary
        for k in d.keys():
            if k not in current_columns:
                assert valid_column_key(k), f'"{k}" is not a valid column name'
                current_columns.append(k)
                con.execute(f'ALTER TABLE mods_info ADD COLUMN "{k}"')

        # Insert
        # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces,
        # so we have to use qmark style here.
        columns = d.keys()
        columns_sql = ",".join('"' + c + '"' for c in columns)
        placeholders_sql = ",".join("?" for c in columns)
        con.execute(
            f"INSERT INTO mods_info ( {columns_sql} ) VALUES ( {placeholders_sql} )",
            [str(d[c]) for c in columns]
        )

    # Process METS files
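    # In addition to the in-memory mods_info list (converted to a pandas DataFrame below),
    # every record is also written to an SQLite database derived from the output file name.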
    output_file_sqlite3 = output_file + ".sqlite3"
    con = sqlite3.connect(output_file_sqlite3)
    with open(output_file + '.warnings.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        mods_info = []

@@ -423,9 +463,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
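            # Collect per-page information for this document separately from the per-document MODS info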
            if output_page_info:
                page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)

            mods_info.append(d)
            if output_page_info:
                page_info.extend(page_info_doc)
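            # Write the record to the SQLite database right away, committing after each record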
            insert_into_db(con, d)
            con.commit()
            #TODO
            #if output_page_info:
            #    page_info.extend(page_info_doc)

            if caught_warnings:
                # PyCharm thinks caught_warnings is not Iterable:

@@ -433,8 +475,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                for caught_warning in caught_warnings:
                    csvwriter.writerow([mets_file, caught_warning.message])
        except Exception as e:
            logger.error('Exception in {}: {}'.format(mets_file, e))
            #import traceback; traceback.print_exc()
            logger.exception('Exception in {}'.format(mets_file))

    # Convert the mods_info List[Dict] to a pandas DataFrame
    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")