mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-10-31 17:34:13 +01:00 
			
		
		
		
	🐛 Fix converting/writing out per-page information (e.g. structure information)
This commit is contained in:
		
							parent
							
								
									1bf86bfb4c
								
							
						
					
					
						commit
						a1f333f4a4
					
				
					 2 changed files with 14 additions and 12 deletions
				
			
		
							
								
								
									
										11
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										11
									
								
								README.md
									
										
									
									
									
								
							|  | @ -10,6 +10,9 @@ instead of ordered lists for topics, etc. Parts of the tool are specific to | ||||||
| our environment/needs at the State Library Berlin and may need to be changed for | our environment/needs at the State Library Berlin and may need to be changed for | ||||||
| your library. | your library. | ||||||
| 
 | 
 | ||||||
|  | Per-page information (e.g. structure information from the METS structMap) can | ||||||
|  | be converted as well (`--output-page-info`). | ||||||
|  | 
 | ||||||
| **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame. | **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame. | ||||||
| 
 | 
 | ||||||
| Column names are derived from the corresponding ALTO elements. Some columns | Column names are derived from the corresponding ALTO elements. Some columns | ||||||
|  | @ -31,14 +34,14 @@ In this example we convert the MODS metadata contained in the METS files in | ||||||
| `mods_info_df.parquet`. This file can then be read by your data scientist using | `mods_info_df.parquet`. This file can then be read by your data scientist using | ||||||
| `pd.read_parquet()`. | `pd.read_parquet()`. | ||||||
| 
 | 
 | ||||||
| ~~~ | ``` | ||||||
| % mods4pandas /srv/data/digisam_mets-sample-300 | % mods4pandas /srv/data/digisam_mets-sample-300 | ||||||
| INFO:root:Scanning directory /srv/data/digisam_mets-sample-300 | INFO:root:Scanning directory /srv/data/digisam_mets-sample-300 | ||||||
| 301it [00:00, 19579.19it/s] | 301it [00:00, 19579.19it/s] | ||||||
| INFO:root:Processing METS files | INFO:root:Processing METS files | ||||||
| 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s] | 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s] | ||||||
| INFO:root:Writing DataFrame to mods_info_df.pkl | INFO:root:Writing DataFrame to mods_info_df.parquet | ||||||
| ~~~ | ``` | ||||||
| 
 | 
 | ||||||
| In the next example we convert the metadata from the ALTO files in the test data | In the next example we convert the metadata from the ALTO files in the test data | ||||||
| directory: | directory: | ||||||
|  | @ -56,5 +59,5 @@ Scanning directory qurator/mods4pandas/tests/data/alto/PPN715049151 | ||||||
| Scanning directory qurator/mods4pandas/tests/data/alto/749782137 | Scanning directory qurator/mods4pandas/tests/data/alto/749782137 | ||||||
| Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns | Scanning directory qurator/mods4pandas/tests/data/alto/weird-ns | ||||||
| INFO:alto4pandas:Processing ALTO files | INFO:alto4pandas:Processing ALTO files | ||||||
| INFO:alto4pandas:Writing DataFrame to alto_info_df.pkl | INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet | ||||||
| ~~~ | ~~~ | ||||||
|  |  | ||||||
|  | @ -373,8 +373,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]: | ||||||
|               default='mods_info_df.parquet', show_default=True) |               default='mods_info_df.parquet', show_default=True) | ||||||
| @click.option('--output-csv', type=click.Path(), help='Output CSV file') | @click.option('--output-csv', type=click.Path(), help='Output CSV file') | ||||||
| @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') | @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') | ||||||
| @click.option('--page-info', is_flag=True, show_default=True, default=False, help='Save page info') | @click.option('--output-page-info', type=click.Path(), help='Save page info') | ||||||
| def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, page_info: bool): | def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str): | ||||||
|     """ |     """ | ||||||
|     A tool to convert the MODS metadata in INPUT to a pandas DataFrame. |     A tool to convert the MODS metadata in INPUT to a pandas DataFrame. | ||||||
| 
 | 
 | ||||||
|  | @ -420,11 +420,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls | ||||||
|                     d['mets_file'] = mets_file |                     d['mets_file'] = mets_file | ||||||
| 
 | 
 | ||||||
|                     # METS - per-page |                     # METS - per-page | ||||||
|                     if page_info: |                     if output_page_info: | ||||||
|                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) |                         page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) | ||||||
| 
 | 
 | ||||||
|                     mods_info.append(d) |                     mods_info.append(d) | ||||||
|                     if page_info: |                     if output_page_info: | ||||||
|                         page_info.extend(page_info_doc) |                         page_info.extend(page_info_doc) | ||||||
| 
 | 
 | ||||||
|                     if caught_warnings: |                     if caught_warnings: | ||||||
|  | @ -450,12 +450,11 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls | ||||||
|         mods_info_df.to_excel(output_xlsx) |         mods_info_df.to_excel(output_xlsx) | ||||||
| 
 | 
 | ||||||
|     # Convert page_info |     # Convert page_info | ||||||
|     # XXX hardcoded filenames + other formats |     if output_page_info: | ||||||
|     if page_info: |  | ||||||
|         page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) |         page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) | ||||||
|         # Save the DataFrame |         # Save the DataFrame | ||||||
|         logger.info('Writing DataFrame to {}'.format("page_info_df.parquet")) |         logger.info('Writing DataFrame to {}'.format(output_page_info)) | ||||||
|         page_info_df.to_parquet("page_info_df.parquet") |         page_info_df.to_parquet(output_page_info) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def main(): | def main(): | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue