🚧 Write out page_info

fix/use-temp-sqlite3
Mike Gerber 4 weeks ago
parent a1390699d4
commit b385f27391

@ -2,6 +2,7 @@ from itertools import groupby
import re import re
import warnings import warnings
from typing import List, Sequence, MutableMapping, Dict from typing import List, Sequence, MutableMapping, Dict
from collections import defaultdict
import pandas as pd import pandas as pd
import numpy as np import numpy as np
@ -328,3 +329,45 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
df = pd.DataFrame(data=data, index=index, columns=columns) df = pd.DataFrame(data=data, index=index, columns=columns)
return df return df
def valid_column_key(k):
    """Return True if k may be used as a column name, False otherwise.

    A valid key is non-empty and consists only of ASCII letters, digits,
    spaces, underscores and hyphens. Callers interpolate accepted keys
    (double-quoted) into SQL statements, so nothing else may slip through.
    """
    # fullmatch() instead of match() with "$": "$" also matches right before
    # a trailing newline, which would wrongly accept a key such as "name\n".
    return re.fullmatch(r"[a-zA-Z0-9 _-]+", k) is not None
# Column names recorded so far for each table, keyed by table name.
# insert_into_db() consults this to decide whether it has to issue CREATE TABLE
# or ALTER TABLE ... ADD COLUMN, and appends every column name it adds.
# NOTE(review): the cache is keyed by table name only (not by connection) and
# lives for the whole process - confirm that is intended.
current_columns: Dict[str, List[str]] = defaultdict(list)
def insert_into_db(con, table, d: Dict):
    """Insert the values from the dict into the table, creating columns if necessary.

    The table is created on the first insert for a given table name; keys
    not seen before are added with ALTER TABLE. All values are stored as
    their str() representation.

    Args:
        con: Database connection offering execute(sql, parameters).
        table: Table name. It is interpolated into the SQL text verbatim, so
            it must be a trusted identifier.
        d: Mapping of column name to value. An empty dict is a no-op.

    Raises:
        ValueError: If a new key is not accepted by valid_column_key().
    """
    if not d:
        # SQLite can neither create a table without columns nor insert an
        # empty column list, and there is nothing to store anyway.
        return

    def quoted(names):
        # Built outside the f-strings below: reusing the same quote character
        # inside a replacement field is a syntax error before Python 3.12.
        return ",".join(f'"{c}"' for c in names)

    known_columns = current_columns[table]
    new_columns = [k for k in d if k not in known_columns]

    # Validate everything up front (and with a real exception, since asserts
    # vanish under -O) so that a bad key changes neither database nor cache.
    for k in new_columns:
        if not valid_column_key(k):
            raise ValueError(f'"{k}" is not a valid column name')

    # The cache is only updated after the corresponding statement succeeded,
    # so it never claims columns that do not exist.
    if not known_columns:
        # Create table if necessary
        con.execute(f"CREATE TABLE {table} ({quoted(new_columns)})")
        known_columns.extend(new_columns)
    else:
        # Add columns if necessary
        for k in new_columns:
            con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"')
            known_columns.append(k)

    # Insert
    # Unfortunately, Python3's sqlite3 does not like named placeholders with
    # spaces, so we have to use qmark style here.
    columns = list(d)
    placeholders = ",".join("?" for _ in columns)
    con.execute(
        f"INSERT INTO {table}"
        f"( {quoted(columns)} )"
        "VALUES"
        f"( {placeholders} )",
        [str(d[c]) for c in columns],
    )
def insert_into_db_multiple(con, table, ld: List[Dict]):
    """Insert each dict of ld into the table via insert_into_db(), in list order.

    Args:
        con: Database connection handed through to insert_into_db().
        table: Name of the target table.
        ld: The dicts to insert, one row per dict.
    """
    for row in ld:
        insert_into_db(con, table, row)

@ -10,13 +10,14 @@ from lxml import etree as ET
from itertools import groupby from itertools import groupby
from operator import attrgetter from operator import attrgetter
from typing import Dict, List from typing import Dict, List
from collections import defaultdict
from collections.abc import MutableMapping, Sequence from collections.abc import MutableMapping, Sequence
import click import click
import pandas as pd import pandas as pd
from tqdm import tqdm from tqdm import tqdm
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple
@ -396,45 +397,16 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
else: else:
mets_files_real.append(m) mets_files_real.append(m)
current_columns = []
def valid_column_key(k):
    """Return True if k may be used as a column name, False otherwise.

    A valid key is non-empty and consists only of ASCII letters, digits,
    spaces, underscores and hyphens.
    """
    # fullmatch() instead of match() with "$": "$" also matches right before
    # a trailing newline, which would wrongly accept a key such as "name\n".
    return re.fullmatch(r"[a-zA-Z0-9 _-]+", k) is not None
def insert_into_db(con, d: Dict):
    """Insert the values from the dict into mods_info, creating columns if necessary.

    The table is created on the first insert; keys not seen before are added
    with ALTER TABLE. All values are stored as their str() representation.

    Args:
        con: Database connection offering execute(sql, parameters).
        d: Mapping of column name to value. An empty dict is a no-op.

    Raises:
        ValueError: If a new key is not accepted by valid_column_key().
    """
    if not d:
        # SQLite can neither create a table without columns nor insert an
        # empty column list, and there is nothing to store anyway.
        return

    def quoted(names):
        # Built outside the f-strings below: reusing the same quote character
        # inside a replacement field is a syntax error before Python 3.12.
        return ",".join(f'"{c}"' for c in names)

    new_columns = [k for k in d if k not in current_columns]

    # Validate everything up front (and with a real exception, since asserts
    # vanish under -O) so that a bad key changes neither database nor cache.
    for k in new_columns:
        if not valid_column_key(k):
            raise ValueError(f'"{k}" is not a valid column name')

    # The cache is only updated after the corresponding statement succeeded,
    # so it never claims columns that do not exist.
    if not current_columns:
        # Create table if necessary
        con.execute(f"CREATE TABLE mods_info({quoted(new_columns)})")
        current_columns.extend(new_columns)
    else:
        # Add columns if necessary
        for k in new_columns:
            con.execute(f'ALTER TABLE mods_info ADD COLUMN "{k}"')
            current_columns.append(k)

    # Insert
    # Unfortunately, Python3's sqlite3 does not like named placeholders with
    # spaces, so we have to use qmark style here.
    columns = list(d)
    placeholders = ",".join("?" for _ in columns)
    con.execute(
        "INSERT INTO mods_info"
        f"( {quoted(columns)} )"
        "VALUES"
        f"( {placeholders} )",
        [str(d[c]) for c in columns],
    )
# Process METS files # Process METS files
output_file_sqlite3 = output_file + ".sqlite3" output_file_sqlite3 = output_file + ".sqlite3"
con = sqlite3.connect(output_file_sqlite3) con = sqlite3.connect(output_file_sqlite3)
if output_page_info:
output_page_info_sqlite3 = output_page_info + ".sqlite3"
con_page_info = sqlite3.connect(output_page_info_sqlite3)
with open(output_file + '.warnings.csv', 'w') as csvfile: with open(output_file + '.warnings.csv', 'w') as csvfile:
csvwriter = csv.writer(csvfile) csvwriter = csv.writer(csvfile)
mods_info = [] mods_info = []
@ -463,11 +435,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
if output_page_info: if output_page_info:
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
insert_into_db(con, d) insert_into_db(con, "mods_info", d)
con.commit() con.commit()
#TODO if output_page_info:
#if output_page_info: insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
# page_info.extend(page_info_doc) con_page_info.commit()
if caught_warnings: if caught_warnings:
# PyCharm thinks caught_warnings is not Iterable: # PyCharm thinks caught_warnings is not Iterable:
@ -478,18 +450,21 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
logger.exception('Exception in {}'.format(mets_file)) logger.exception('Exception in {}'.format(mets_file))
# Convert the mods_info List[Dict] to a pandas DataFrame # Convert the mods_info List[Dict] to a pandas DataFrame
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") # TODO
# mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
# Save the DataFrame # Save the DataFrame
logger.info('Writing DataFrame to {}'.format(output_file)) # TODO
mods_info_df.to_parquet(output_file) #logger.info('Writing DataFrame to {}'.format(output_file))
#mods_info_df.to_parquet(output_file)
# Convert page_info # Convert page_info
if output_page_info: # TODO
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) # if output_page_info:
# Save the DataFrame # page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
logger.info('Writing DataFrame to {}'.format(output_page_info)) # # Save the DataFrame
page_info_df.to_parquet(output_page_info) # logger.info('Writing DataFrame to {}'.format(output_page_info))
# page_info_df.to_parquet(output_page_info)
def main(): def main():

Loading…
Cancel
Save