diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py index 4427f13..4fabb52 100755 --- a/src/mods4pandas/mods4pandas.py +++ b/src/mods4pandas/mods4pandas.py @@ -399,19 +399,24 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): mets_files_real.append(m) - - # Process METS files + # Prepare output files + with contextlib.suppress(FileNotFoundError): + os.remove(output_file) output_file_sqlite3 = output_file + ".sqlite3" with contextlib.suppress(FileNotFoundError): os.remove(output_file_sqlite3) + + logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3)) con = sqlite3.connect(output_file_sqlite3) if output_page_info: output_page_info_sqlite3 = output_page_info + ".sqlite3" + logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3)) with contextlib.suppress(FileNotFoundError): os.remove(output_page_info_sqlite3) con_page_info = sqlite3.connect(output_page_info_sqlite3) + # Process METS files with open(output_file + '.warnings.csv', 'w') as csvfile: csvwriter = csv.writer(csvfile) mods_info = [] @@ -454,14 +459,12 @@ def process(mets_files: List[str], output_file: str, output_page_info: str): except Exception as e: logger.exception('Exception in {}'.format(mets_file)) - # Convert the mods_info List[Dict] to a pandas DataFrame - # TODO - # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") - + # Convert the mods_info SQL to a pandas DataFrame + mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier") + # Save the DataFrame - # TODO - #logger.info('Writing DataFrame to {}'.format(output_file)) - #mods_info_df.to_parquet(output_file) + logger.info('Writing DataFrame to {}'.format(output_file)) + mods_info_df.to_parquet(output_file) # Convert page_info # TODO