mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 11:24:14 +01:00 
			
		
		
		
	🚧 Use a temporary sqlite db
This commit is contained in:
		
							parent
							
								
									8d6b97f6b3
								
							
						
					
					
						commit
						a1390699d4
					
				
					 1 changed files with 46 additions and 5 deletions
				
			
		| 
						 | 
					@ -3,7 +3,9 @@ import csv
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					import sqlite3
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
from lxml import etree as ET
 | 
					from lxml import etree as ET
 | 
				
			||||||
from itertools import groupby
 | 
					from itertools import groupby
 | 
				
			||||||
from operator import attrgetter
 | 
					from operator import attrgetter
 | 
				
			||||||
| 
						 | 
					@ -394,7 +396,45 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            mets_files_real.append(m)
 | 
					            mets_files_real.append(m)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    current_columns = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def valid_column_key(k):
					        """Return True if *k* may be used as a (quoted) SQLite column name.

					        Only non-empty names made of ASCII letters, digits, spaces,
					        underscores and hyphens are accepted.
					        """
					        # fullmatch() rather than match() with "^...$": "$" also matches
					        # right before a trailing newline, which would accept "name\n".
					        return re.fullmatch(r"[a-zA-Z0-9 _-]+", k) is not None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def insert_into_db(con, d: Dict):
					        """Insert the dict *d* as one row into the ``mods_info`` table.

					        The table is created on the first call, using the keys of *d* as
					        column names. On later calls a column is added for every key that
					        has not been seen before. Every value is stored as ``str(value)``.
					        This function does not commit.

					        :param con: connection object providing ``execute()``
					        :param d: mapping of column name to value
					        :raises ValueError: if a new key is not a valid column name
					        """
					        # Column names are interpolated into the SQL text (they cannot be
					        # bound as parameters), so validate all new keys up front with a
					        # real exception: an assert would be stripped under "python -O".
					        # Validating before touching current_columns keeps it in sync
					        # with the actual table if a key is rejected.
					        new_columns = [k for k in d if k not in current_columns]
					        for k in new_columns:
					            if not valid_column_key(k):
					                raise ValueError(f'"{k}" is not a valid column name')

					        if not current_columns:
					            # Create table on first use
					            column_defs = ",".join(f'"{c}"' for c in new_columns)
					            con.execute(f"CREATE TABLE mods_info({column_defs})")
					            current_columns.extend(new_columns)
					        else:
					            # Add columns if necessary
					            for k in new_columns:
					                con.execute(f'ALTER TABLE mods_info ADD COLUMN "{k}"')
					                current_columns.append(k)

					        # Insert
					        # Unfortunately, Python3's sqlite3 does not like named placeholders
					        # with spaces, so we have to use qmark style here.
					        columns = list(d)
					        column_list = ",".join(f'"{c}"' for c in columns)
					        placeholders = ",".join("?" for _ in columns)
					        con.execute(
					            f"INSERT INTO mods_info ( {column_list} ) VALUES ( {placeholders} )",
					            [str(d[c]) for c in columns]
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Process METS files
 | 
					    # Process METS files
 | 
				
			||||||
 | 
					    output_file_sqlite3 = output_file + ".sqlite3"
 | 
				
			||||||
 | 
					    con = sqlite3.connect(output_file_sqlite3)
 | 
				
			||||||
    with open(output_file + '.warnings.csv', 'w') as csvfile:
 | 
					    with open(output_file + '.warnings.csv', 'w') as csvfile:
 | 
				
			||||||
        csvwriter = csv.writer(csvfile)
 | 
					        csvwriter = csv.writer(csvfile)
 | 
				
			||||||
        mods_info = []
 | 
					        mods_info = []
 | 
				
			||||||
| 
						 | 
					@ -423,9 +463,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
 | 
				
			||||||
                    if output_page_info:
 | 
					                    if output_page_info:
 | 
				
			||||||
                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
 | 
					                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    mods_info.append(d)
 | 
					                    insert_into_db(con, d)
 | 
				
			||||||
                    if output_page_info:
 | 
					                    con.commit()
 | 
				
			||||||
                        page_info.extend(page_info_doc)
 | 
					                    #TODO
 | 
				
			||||||
 | 
					                    #if output_page_info:
 | 
				
			||||||
 | 
					                    #    page_info.extend(page_info_doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    if caught_warnings:
 | 
					                    if caught_warnings:
 | 
				
			||||||
                        # PyCharm thinks caught_warnings is not Iterable:
 | 
					                        # PyCharm thinks caught_warnings is not Iterable:
 | 
				
			||||||
| 
						 | 
					@ -433,8 +475,7 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
 | 
				
			||||||
                        for caught_warning in caught_warnings:
 | 
					                        for caught_warning in caught_warnings:
 | 
				
			||||||
                            csvwriter.writerow([mets_file, caught_warning.message])
 | 
					                            csvwriter.writerow([mets_file, caught_warning.message])
 | 
				
			||||||
            except Exception as e:
 | 
					            except Exception as e:
 | 
				
			||||||
                logger.error('Exception in {}: {}'.format(mets_file, e))
 | 
					                logger.exception('Exception in {}'.format(mets_file))
 | 
				
			||||||
                #import traceback; traceback.print_exc()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Convert the mods_info List[Dict] to a pandas DataFrame
 | 
					    # Convert the mods_info List[Dict] to a pandas DataFrame
 | 
				
			||||||
    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
 | 
					    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue