Mirror of https://github.com/qurator-spk/modstool.git (synced 2025-11-04 11:24:14 +01:00)
🚧 Write out page_info

commit b385f27391 (parent a1390699d4)
2 changed files with 66 additions and 48 deletions
@@ -2,6 +2,7 @@ from itertools import groupby
import re
import warnings
from typing import List, Sequence, MutableMapping, Dict
from collections import defaultdict

import pandas as pd
import numpy as np
@@ -328,3 +329,45 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:

    df = pd.DataFrame(data=data, index=index, columns=columns)
    return df


def valid_column_key(k):
    if re.match("^[a-zA-Z0-9 _-]+$", k):
        return True
    else:
        return False

current_columns = defaultdict(list)

def insert_into_db(con, table, d: Dict):
    """Insert the values from the dict into the table, creating columns if necessary"""

    # Create table if necessary
    if not current_columns[table]:
        for k in d.keys():
            assert valid_column_key(k), f"\"{k}\" is not a valid column name"
            current_columns[table].append(k)
        con.execute(f"CREATE TABLE {table} ({",".join(f"\"{c}\"" for c in current_columns[table])})")

    # Add columns if necessary
    for k in d.keys():
        if not k in current_columns[table]:
            assert valid_column_key(k), f"\"{k}\" is not a valid column name"
            current_columns[table].append(k)
            con.execute(f"ALTER TABLE {table} ADD COLUMN \"{k}\"")

    # Insert
    # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
    # have use qmark style here.
    columns = d.keys()
    con.execute(
        f"INSERT INTO {table}"
        f"( {",".join(f"\"{c}\"" for c in columns)} )"
        "VALUES"
        f"( {",".join("?" for c in columns)} )",
        [str(d[c]) for c in columns]
    )

def insert_into_db_multiple(con, table, ld: List[Dict]):
    for d in ld:
        insert_into_db(con, table, d)
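The new .lib helpers build the SQLite schema lazily from the dict keys: the first insert for a table issues CREATE TABLE, a later dict with an extra key triggers ALTER TABLE ... ADD COLUMN, and all values are stored as strings via qmark placeholders. Note that the f-strings above nest double quotes inside {...} expressions, which only parses on Python 3.12+ (PEP 701); on older interpreters the joined column list would have to be built in a separate variable first. A minimal usage sketch follows; it is not part of the commit, the column values are illustrative, and it assumes insert_into_db / insert_into_db_multiple are imported from .lib:

import sqlite3

con = sqlite3.connect(":memory:")

# First insert for a table creates it, with one column per dict key.
insert_into_db(con, "mods_info", {
    "recordInfo_recordIdentifier": "PPN111111111",
    "titleInfo_title": "Some title",
})

# A later dict with an extra key adds the column; earlier rows hold NULL there.
insert_into_db(con, "mods_info", {
    "recordInfo_recordIdentifier": "PPN222222222",
    "titleInfo_title": "Another title",
    "originInfo_publisher": "Some publisher",
})
con.commit()

# insert_into_db_multiple() is just a loop over insert_into_db().
insert_into_db_multiple(con, "page_info", [
    {"ppn": "PPN111111111", "ID": "PHYS_0001"},
    {"ppn": "PPN111111111", "ID": "PHYS_0002"},
])
con.commit()

print(con.execute("SELECT * FROM mods_info").fetchall())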
@@ -10,13 +10,14 @@ from lxml import etree as ET
from itertools import groupby
from operator import attrgetter
from typing import Dict, List
from collections import defaultdict
from collections.abc import MutableMapping, Sequence

import click
import pandas as pd
from tqdm import tqdm

from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple
@@ -396,45 +397,16 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
        else:
            mets_files_real.append(m)

    current_columns = []

    def valid_column_key(k):
        if re.match("^[a-zA-Z0-9 _-]+$", k):
            return True
        else:
            return False

    def insert_into_db(con, d: Dict):
        # Create table if necessary
        if not current_columns:
            for k in d.keys():
                assert valid_column_key(k), f"\"{k}\" is not a valid column name"
                current_columns.append(k)
            con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})")

        # Add columns if necessary
        for k in d.keys():
            if not k in current_columns:
                assert valid_column_key(k), f"\"{k}\" is not a valid column name"
                current_columns.append(k)
                con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"")

        # Insert
        # Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
        # have use qmark style here.
        columns = d.keys()
        con.execute(
            "INSERT INTO mods_info"
            f"( {",".join(f"\"{c}\"" for c in columns)} )"
            "VALUES"
            f"( {",".join("?" for c in columns)} )",
            [str(d[c]) for c in columns]
        )


    # Process METS files
    output_file_sqlite3 = output_file + ".sqlite3"
    con = sqlite3.connect(output_file_sqlite3)

    if output_page_info:
        output_page_info_sqlite3 = output_page_info + ".sqlite3"
        con_page_info = sqlite3.connect(output_page_info_sqlite3)

    with open(output_file + '.warnings.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        mods_info = []
@@ -463,11 +435,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                    if output_page_info:
                        page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)

                    insert_into_db(con, d)
                    insert_into_db(con, "mods_info", d)
                    con.commit()
                    #TODO
                    #if output_page_info:
                    #    page_info.extend(page_info_doc)
                    if output_page_info:
                        insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
                        con_page_info.commit()

                    if caught_warnings:
                        # PyCharm thinks caught_warnings is not Iterable:
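With this hunk each METS document is inserted and committed individually, into the mods_info database and, when requested, into the page_info database, so an interrupted run still leaves fully committed per-document rows behind. A small inspection sketch, not part of the commit; the file name is hypothetical and stands in for whatever output_file + ".sqlite3" resolves to:

import sqlite3

con = sqlite3.connect("mods_info_df.parquet.sqlite3")  # hypothetical output_file + ".sqlite3"
done = {row[0] for row in con.execute('SELECT "recordInfo_recordIdentifier" FROM mods_info')}
print(f"{len(done)} documents already processed")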
@@ -478,18 +450,21 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
                logger.exception('Exception in {}'.format(mets_file))

    # Convert the mods_info List[Dict] to a pandas DataFrame
    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")

    # TODO
    # mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")

    # Save the DataFrame
    logger.info('Writing DataFrame to {}'.format(output_file))
    mods_info_df.to_parquet(output_file)
    # TODO
    #logger.info('Writing DataFrame to {}'.format(output_file))
    #mods_info_df.to_parquet(output_file)

    # Convert page_info
    if output_page_info:
        page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
        # Save the DataFrame
        logger.info('Writing DataFrame to {}'.format(output_page_info))
        page_info_df.to_parquet(output_page_info)
    # TODO
    # if output_page_info:
    #     page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
    #     # Save the DataFrame
    #     logger.info('Writing DataFrame to {}'.format(output_page_info))
    #     page_info_df.to_parquet(output_page_info)


def main():
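The Parquet export is commented out with TODOs for now, so only the SQLite databases are written. One possible way to reinstate it later, consistent with the TODOs, would be to read the tables back from SQLite at the end of process() and write Parquet from there. A sketch, not part of the commit, reusing the output_file / output_page_info parameters and the columns already used as index columns:

import sqlite3
import pandas as pd

# Read the mods_info table back and restore the old index before writing Parquet.
with sqlite3.connect(output_file + ".sqlite3") as con:
    mods_info_df = pd.read_sql_query(
        "SELECT * FROM mods_info", con,
        index_col="recordInfo_recordIdentifier")
mods_info_df.to_parquet(output_file)

if output_page_info:
    with sqlite3.connect(output_page_info + ".sqlite3") as con_page_info:
        page_info_df = pd.read_sql_query(
            "SELECT * FROM page_info", con_page_info,
            index_col=["ppn", "ID"])
    page_info_df.to_parquet(output_page_info)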