mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
🧹 Extract a function to convert list[dict] to a DataFrame
This commit is contained in:
parent
5c2dfa8505
commit
968572168e
2 changed files with 27 additions and 10 deletions
|
@ -1,8 +1,9 @@
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from typing import List, Sequence, MutableMapping
|
from typing import List, Sequence, MutableMapping, Dict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):
|
||||||
|
|
||||||
return dict(items)
|
return dict(items)
|
||||||
|
|
||||||
|
|
||||||
|
def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
    """
    Convert the given list of dicts to a Pandas DataFrame.

    The keys of the dicts make the columns. Columns appear in the order in
    which their keys are first encountered while walking ``data_list``. A dict
    that lacks a given key contributes ``None`` for that column.

    Parameters
    ----------
    data_list
        The rows, one dict per row.
    index_column
        Key whose value in each dict becomes that row's index label. The key
        is also kept as a regular column.

    Returns
    -------
    pd.DataFrame
        One row per dict in ``data_list``, in the same order.

    Raises
    ------
    KeyError
        If any dict in ``data_list`` does not contain ``index_column``.
    """

    # Build columns from keys. A dict keeps insertion order and offers O(1)
    # membership, so this de-duplicates the keys in first-seen order without
    # scanning a growing list for every key.
    columns = list(dict.fromkeys(c for m in data_list for c in m))

    # Build data table; .get() fills keys missing from a row with None
    data = [[m.get(c) for c in columns] for m in data_list]

    # Build index; intentionally a plain lookup so a row without the index
    # key fails loudly with KeyError instead of getting a None label
    index = [m[index_column] for m in data_list]

    return pd.DataFrame(data=data, index=index, columns=columns)
|
||||||
|
|
|
@ -14,7 +14,7 @@ import click
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .lib import sorted_groupby, TagGroup, ns, flatten
|
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
#import traceback; traceback.print_exc()
|
#import traceback; traceback.print_exc()
|
||||||
|
|
||||||
# Convert the mods_info List[Dict] to a pandas DataFrame
|
# Convert the mods_info List[Dict] to a pandas DataFrame
|
||||||
columns = []
|
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
|
||||||
for m in mods_info:
|
|
||||||
for c in m.keys():
|
|
||||||
if c not in columns:
|
|
||||||
columns.append(c)
|
|
||||||
data = [[m.get(c) for c in columns] for m in mods_info]
|
|
||||||
index = [m['recordInfo_recordIdentifier'] for m in mods_info] # PPN
|
|
||||||
mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
|
|
||||||
|
|
||||||
# Pickle the DataFrame
|
# Pickle the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format(output_file))
|
logger.info('Writing DataFrame to {}'.format(output_file))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue