From fff42533524d3ee91d298a2873ae19957304be2e Mon Sep 17 00:00:00 2001
From: vahidrezanezhad
Date: Tue, 3 Feb 2026 20:20:20 +0100
Subject: [PATCH] generate or update list of characters in the case of cnn-rnn ocr training

---
Reviewer note: the content of the new file was revised during review
(explicit UTF-8 I/O, label files closed via `with`, `--out` made required,
docstrings added); the blob id on its `index` line was not recomputed.

 src/eynollah/training/cli.py                  |  2 +
 ...te_or_update_cnn_rnn_ocr_character_list.py | 65 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 src/eynollah/training/generate_or_update_cnn_rnn_ocr_character_list.py

diff --git a/src/eynollah/training/cli.py b/src/eynollah/training/cli.py
index 3718275..862d212 100644
--- a/src/eynollah/training/cli.py
+++ b/src/eynollah/training/cli.py
@@ -10,6 +10,7 @@ from .inference import main as inference_cli
 from .train import ex
 from .extract_line_gt import linegt_cli
 from .weights_ensembling import main as ensemble_cli
+from .generate_or_update_cnn_rnn_ocr_character_list import main as update_ocr_characters_cli
 
 @click.command(context_settings=dict(
     ignore_unknown_options=True,
@@ -28,3 +29,4 @@ main.add_command(inference_cli, 'inference')
 main.add_command(train_cli, 'train')
 main.add_command(linegt_cli, 'export_textline_images_and_text')
 main.add_command(ensemble_cli, 'ensembling')
+main.add_command(update_ocr_characters_cli, 'generate_or_update_cnn_rnn_ocr_character_list')
diff --git a/src/eynollah/training/generate_or_update_cnn_rnn_ocr_character_list.py b/src/eynollah/training/generate_or_update_cnn_rnn_ocr_character_list.py
new file mode 100644
index 0000000..6d1028c
--- /dev/null
+++ b/src/eynollah/training/generate_or_update_cnn_rnn_ocr_character_list.py
@@ -0,0 +1,65 @@
+import json
+import logging
+import os
+
+import click
+import numpy as np
+
+
+def run_character_list_update(dir_labels, out, current_character_list):
+    """Generate or update the character list used for CNN-RNN OCR training.
+
+    Collects every character occurring in the first line of each ``*.txt``
+    label file in ``dir_labels``, merges them with the characters of an
+    existing list (if given) and writes the sorted result as JSON to ``out``.
+
+    Args:
+        dir_labels: directory containing the ``*.txt`` label files.
+        out: path of the file the JSON character list is written to.
+        current_character_list: optional path of an existing JSON character
+            list to extend; a falsy value starts from an empty set.
+    """
+    if current_character_list:
+        with open(current_character_list, 'r', encoding='utf-8') as f_name:
+            characters = set(json.load(f_name))
+    else:
+        characters = set()
+
+    for fname in os.listdir(dir_labels):
+        if not fname.endswith('.txt'):
+            continue
+        # Only the first line of each label file is considered; read it as
+        # UTF-8 and close the file deterministically.
+        with open(os.path.join(dir_labels, fname), 'r', encoding='utf-8') as f_label:
+            label = f_label.readline().rstrip('\n')
+        characters.update(label)
+
+    with open(out, 'w', encoding='utf-8') as f_name:
+        json.dump(sorted(characters), f_name)
+
+
+@click.command()
+@click.option(
+    "--dir_labels",
+    "-dl",
+    help="directory of labels which are txt files",
+    type=click.Path(exists=True, file_okay=False),
+    required=True,
+)
+@click.option(
+    "--current_character_list",
+    "-ccl",
+    help="existing character list (a JSON-encoded list in a text file) to be updated with the characters of a set of labels",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    required=False,
+)
+@click.option(
+    "--out",
+    "-o",
+    help="output text file where the generated or updated character list will be written as a JSON-encoded list",
+    type=click.Path(exists=False, file_okay=True, dir_okay=False),
+    required=True,
+)
+def main(dir_labels, out, current_character_list):
+    """Generate or update the character list for CNN-RNN OCR training."""
+    run_character_list_update(dir_labels, out, current_character_list)