mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 03:14:14 +01:00 
			
		
		
		
	✨ Represent sets as arrays in the Parquet file
This commit is contained in:
		
							parent
							
								
									ebdded90d6
								
							
						
					
					
						commit
						215bfbb11f
					
				
					 2 changed files with 7 additions and 7 deletions
				
			
		| 
						 | 
				
			
			@ -24,6 +24,7 @@ EXPECTED_TYPES = {
 | 
			
		|||
        r"titleInfo_partName": ("object", ["str", "NoneType"]),
 | 
			
		||||
        r"identifier-.*": ("object", ["str", "NoneType"]),
 | 
			
		||||
        r"location_.*": ("object", ["str", "NoneType"]),
 | 
			
		||||
+        r"name\d+_.*roleTerm": ("object", ["ndarray", "NoneType"]),
 | 
			
		||||
        r"name\d+_.*": ("object", ["str", "NoneType"]),
 | 
			
		||||
        r"relatedItem-.*_recordInfo_recordIdentifier": ("object", ["str", "NoneType"]),
 | 
			
		||||
        r"typeOfResource": ("object", ["str", "NoneType"]),
 | 
			
		||||
| 
						 | 
				
			
			@ -32,11 +33,10 @@ EXPECTED_TYPES = {
 | 
			
		|||
 | 
			
		||||
        r".*-count": ("Int64", None),
 | 
			
		||||
 | 
			
		||||
        # XXX possibly sets:
 | 
			
		||||
-        r"genre-.*": ("object", ["str", "NoneType"]),
 | 
			
		||||
-        r"subject-.*": ("object", ["str", "NoneType"]),
 | 
			
		||||
-        r"language_.*Term": ("object", ["str", "NoneType"]),
 | 
			
		||||
-        r"classification-.*": ("object", ["str", "NoneType"]),
 | 
			
		||||
+        r"genre-.*": ("object", ["ndarray", "NoneType"]),
 | 
			
		||||
+        r"subject-.*": ("object", ["ndarray", "NoneType"]),
 | 
			
		||||
+        r"language_.*Term": ("object", ["ndarray", "NoneType"]),
 | 
			
		||||
+        r"classification-.*": ("object", ["ndarray", "NoneType"]),
 | 
			
		||||
 | 
			
		||||
        # page_info
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,6 @@
 | 
			
		|||
from __future__ import annotations
 | 
			
		||||
 | 
			
		||||
+import ast
 | 
			
		||||
from itertools import groupby
 | 
			
		||||
import re
 | 
			
		||||
import warnings
 | 
			
		||||
| 
						 | 
				
			
			@ -383,8 +384,7 @@ def convert_db_to_parquet(con, table, index_col, output_file):
 | 
			
		|||
        elif column_type == "bool":
 | 
			
		||||
            df[c] = df[c].map({"True": True, "False": False}).astype("boolean")
 | 
			
		||||
        elif column_type == "set":
 | 
			
		||||
-            # TODO WIP
 | 
			
		||||
-            continue
 | 
			
		||||
+            df[c] = df[c].apply(lambda s: list(ast.literal_eval(s)) if s else None)
 | 
			
		||||
        else:
 | 
			
		||||
            raise NotImplementedError(f"Column {c}: type {column_type} not implemented yet.")
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue