💩 Add a script fix-calamari1-model to fix regexen in 1.0 models

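This is a workaround for Calamari 1.0 models whose "replacements" regexes carry the global flag (?u) at the end of the pattern instead of at the front. Example: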

❯ fix-calamari1-model ~/.local/share/ocrd-resources/ocrd-calamari-recognize/qurator-gt4histocr-1.0
0.ckpt.json fixed.
1.ckpt.json fixed.
2.ckpt.json fixed.
3.ckpt.json fixed.
4.ckpt.json fixed.
test-python-3.11
Mike Gerber 7 months ago
parent 0f92b524da
commit 3cf4887cb4

@ -0,0 +1,39 @@
import re
import json
import click
from glob import glob
from copy import deepcopy
from ocrd_calamari.util import working_directory
def _move_global_flags_to_front(model):
    """Rewrite, in place, every replacement regex that ends with ``(?u)``.

    ``model`` is the mapping stored under the ``"model"`` key of a checkpoint
    JSON file. Each dict value may carry a ``"children"`` list, and each child
    may carry a ``"replacements"`` list whose entries have an ``"old"`` regex.
    Non-dict values are skipped.
    """
    for section in model.values():
        if not isinstance(section, dict):
            continue
        for child in section.get("children", []):
            for replacement in child.get("replacements", []):
                # Since Python 3.11 the ``re`` module only accepts global
                # inline flags at the very start of a pattern, so a trailing
                # "(?u)" is moved to the front.
                replacement["old"] = re.sub(
                    r"^(.*)\(\?u\)$", r"(?u)\1", replacement["old"]
                )


# ``click.command()`` is called with parentheses so that the decorator also
# works on Click releases older than 8.1, which do not accept the bare form.
@click.command()
@click.argument('checkpoint_dir')
def fix_calamari1_model(checkpoint_dir):
    """
    Fix old Calamari 1 models.

    This currently means fixing regexen in "replacements" to have their global flags
    in front of the rest of the regex.
    """
    # Every *.json file directly inside checkpoint_dir is inspected; a file is
    # only rewritten when its content actually changed. One status line
    # ("<name> fixed." / "<name> unchanged.") is printed per file.
    with working_directory(checkpoint_dir):
        # sorted() makes the processing and reporting order deterministic.
        for fn in sorted(glob("*.json")):
            # JSON text is UTF-8; do not depend on the platform default.
            with open(fn, "r", encoding="utf-8") as fp:
                j = json.load(fp)
            old_j = deepcopy(j)

            # JSON files without a "model" mapping are not checkpoint
            # descriptions; leave them alone instead of aborting the run.
            model = j.get("model") if isinstance(j, dict) else None
            if isinstance(model, dict):
                _move_global_flags_to_front(model)

            if j == old_j:
                print(f"{fn} unchanged.")
            else:
                with open(fn, "w", encoding="utf-8") as fp:
                    json.dump(j, fp, indent=2)
                print(f"{fn} fixed.")

@ -0,0 +1,14 @@
import os
class working_directory:
    """Run the body of a ``with`` statement inside another directory.

    On entry the process's current working directory is remembered and
    changed to ``wd``; on exit the remembered directory is restored, whether
    or not the body raised.
    """

    def __init__(self, wd):
        """Store the directory to switch into; nothing is changed yet."""
        self.wd = wd

    def __enter__(self):
        """Remember the current directory, then change into ``self.wd``."""
        previous = os.getcwd()
        self.old_wd = previous
        os.chdir(self.wd)
        return None

    def __exit__(self, etype, value, traceback):
        """Change back to the remembered directory; exceptions propagate."""
        os.chdir(self.old_wd)
        return None

@ -25,6 +25,7 @@ setup(
entry_points={
'console_scripts': [
'ocrd-calamari-recognize=ocrd_calamari.cli:ocrd_calamari_recognize',
'fix-calamari1-model=ocrd_calamari.fix_calamari1_model:fix_calamari1_model',
]
},
)

Loading…
Cancel
Save