gpn23-recipes/1_clean_json.py

65 lines
2.1 KiB
Python
Raw Normal View History

2025-06-01 23:11:38 +02:00
import json
2025-06-02 09:48:27 +02:00
import re
2025-06-01 23:11:38 +02:00
2025-06-02 01:36:04 +02:00
# Source export and the destination for its cleaned copy.
input_file = 'export_2025-06-02.json'
output_file = 'export_2025-06-02_clean.json'

# Parse the whole export into memory.
with open(input_file, encoding='utf-8') as source:
    data = json.loads(source.read())

# Rewrite the export in place with 2-space indentation so the raw and the
# cleaned file share the same layout.
with open(input_file, 'w', encoding='utf-8') as target:
    json.dump(data, target, ensure_ascii=False, indent=2)
2025-06-01 23:11:38 +02:00
# Leading zero quantity plus unit (e.g. "0 g "); it carries no information,
# so it is stripped and only the ingredient text that follows is kept.
_ZERO_QUANTITY_PREFIX = re.compile(r"^0 (g|kg|Milliliter|None) ")


def _clean_ingredient(ingredient):
    """Return one ingredient string with its quantity and unit tidied up."""
    cleaned = (
        ingredient
        .replace('g / Gramm', 'g')
        .replace('kg / Kilogramm', 'kg')
        # Whole-number quantities appear as floats ("100.0 g"); drop the ".0".
        .replace('.0 ', ' ')
    )
    # Must run after the ".0" removal so that "0.0 g ..." is caught as "0 g ...".
    return _ZERO_QUANTITY_PREFIX.sub("", cleaned)


def normalize_ingredients(recipe):
    """Clean up the ``recipeIngredient`` list of a recipe in place.

    Each ingredient string is rewritten as follows:

    * ``"g / Gramm"`` becomes ``"g"`` and ``"kg / Kilogramm"`` becomes ``"kg"``.
    * A ``".0 "`` is reduced to a single space (``"100.0 g"`` -> ``"100 g"``).
    * A leading ``"0 <unit> "`` (unit one of g, kg, Milliliter, None) is removed.

    Entries that are exactly the string ``"None"`` afterwards are dropped.

    Args:
        recipe: Mapping for one recipe. A missing ``recipeIngredient`` key or
            a ``None`` value under it leaves the recipe untouched.

    Returns:
        The same ``recipe`` object that was passed in.
    """
    ingredients = recipe.get('recipeIngredient')
    if ingredients is None:
        # Key absent, or present with a JSON null: nothing to normalize.
        return recipe

    recipe['recipeIngredient'] = [
        cleaned
        for cleaned in map(_clean_ingredient, ingredients)
        if cleaned != "None"
    ]
    return recipe
2025-06-02 12:32:45 +02:00
def normalize_instructions(recipe):
    """Drop empty steps from ``recipeInstructions`` in place.

    When the value is a list, a step is kept if it is a mapping with a truthy
    ``text`` entry, or if it is a non-mapping value that is itself truthy
    (for example a non-empty plain-string step). ``None`` steps and steps
    without text are removed. A non-list value is not filtered.

    If nothing remains (an empty list, or an empty/``None`` value), the
    ``recipeInstructions`` key is removed from the recipe altogether.

    Args:
        recipe: Mapping for one recipe; it may lack ``recipeInstructions``.

    Returns:
        The same ``recipe`` object that was passed in.
    """
    if 'recipeInstructions' not in recipe:
        return recipe

    steps = recipe['recipeInstructions']
    if isinstance(steps, list):
        # Only mappings are asked for their 'text'; calling .get() on a plain
        # string or None step would raise AttributeError.
        steps = [
            step
            for step in steps
            if (step.get('text') if isinstance(step, dict) else step)
        ]
        recipe['recipeInstructions'] = steps

    if not steps:
        del recipe['recipeInstructions']
    return recipe
def normalize_recipe(recipe):
    """Run every cleanup pass over one recipe and return the result.

    The ingredient pass runs first; whatever it returns is handed to the
    instruction pass, whose return value is the final result.
    """
    return normalize_instructions(normalize_ingredients(recipe))
2025-06-01 23:11:38 +02:00
# A top-level JSON array is cleaned element by element; any other top-level
# value is passed to normalize_recipe as a single recipe.
cleaned_data = (
    list(map(normalize_recipe, data))
    if isinstance(data, list)
    else normalize_recipe(data)
)

# Persist the cleaned recipes as indented UTF-8 JSON.
with open(output_file, 'w', encoding='utf-8') as out:
    json.dump(cleaned_data, out, ensure_ascii=False, indent=2)

print(f"Cleaned data written to '{output_file}'")