From 11a04916f38631a2e55192bcb4db9b25df8f384b Mon Sep 17 00:00:00 2001 From: Mike Gerber Date: Thu, 28 Nov 2024 18:27:39 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Write=20mods=5Finfo=20Parquet=20?= =?UTF-8?q?file=20again?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mods4pandas/mods4pandas.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 4427f13..4fabb52 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -399,19 +399,24 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): mets_files_real.append(m) - - # Process METS files + # Prepare output files + with contextlib.suppress(FileNotFoundError): + os.remove(output_file) output_file_sqlite3 = output_file + ".sqlite3" with contextlib.suppress(FileNotFoundError): os.remove(output_file_sqlite3) + + logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3)) con = sqlite3.connect(output_file_sqlite3) if output_page_info: output_page_info_sqlite3 = output_page_info + ".sqlite3" + logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3)) with contextlib.suppress(FileNotFoundError): os.remove(output_page_info_sqlite3) con_page_info = sqlite3.connect(output_page_info_sqlite3) + # Process METS files with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) mods_info = [] @@ -454,14 +459,12 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): except Exception as e: logger.exception('Exception in {}'.format(mets_file)) - # Convert the mods_info List[Dict] to a pandas DataFrame - # TODO - # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") - + # Convert the mods_info SQL to a pandas DataFrame + mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier") + # Save the DataFrame - # TODO - #logger.info('Writing DataFrame to {}'.format(output_file)) - #mods_info_df.to_parquet(output_file) + logger.info('Writing DataFrame to {}'.format(output_file)) + mods_info_df.to_parquet(output_file) # Convert page_info # TODO