From d7a3f4cec6d0d3397de5c5efca9be7d73b957c16 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 30 Apr 2026 16:54:26 +0200 Subject: [PATCH 1/4] =?UTF-8?q?training:=20add=20cfg=20param=20`reload=5Fw?= =?UTF-8?q?eights`=20for=20building=20but=20loading=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduce `config_params` key `reload_weights` - add respective section for all model types: - build fresh model from code - load existing weights from `dir_of_start_model` - save to `dir_output` under same basename as existing model (but without optimizer and metrics; which does not work currently) - exit immediately (i.e. no actual training) - reorder so reload_weights is after compilation but before data loading --- src/eynollah/training/models.py | 1 - src/eynollah/training/train.py | 140 ++++++++++++++++++++------------ 2 files changed, 89 insertions(+), 52 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index a95ba7e..3494249 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -15,7 +15,6 @@ from tensorflow.keras.layers import ( Embedding, Flatten, Input, - Lambda, Layer, LayerNormalization, LSTM, diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 39dac1d..de998fd 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -355,6 +355,7 @@ def config_params(): dir_output = None # Directory where the augmented training data and the model checkpoints will be saved. pretraining = False # Set to true to (down)load pretrained weights of ResNet50 encoder. save_interval = None # frequency for writing model checkpoints (positive integer for number of batches saved under "model_step_{batch:04d}", otherwise epoch saved under "model_{epoch:02d}") + reload_weights = False # Set true to build new model from config, load weights from dir_of_start_model, save under dir_output and exit. 
continue_training = False # Whether to continue training an existing model. if continue_training: dir_of_start_model = '' # Directory of model checkpoint to load to continue training. (E.g. if you already trained for 3 epochs, set "dir_of_start_model=dir_output/model_03".) @@ -378,6 +379,7 @@ def run(_config, weight_decay, learning_rate, continue_training, + reload_weights, save_interval, augmentation, # dependent config keys need a default, @@ -452,43 +454,6 @@ def run(_config, dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') - if not data_is_provided: - # first create a directory in output for both training and evaluations - # in order to flow data from these directories. - if os.path.isdir(dir_train_flowing): - os.system('rm -rf ' + dir_train_flowing) - os.makedirs(dir_train_flowing) - - if os.path.isdir(dir_eval_flowing): - os.system('rm -rf ' + dir_eval_flowing) - os.makedirs(dir_eval_flowing) - - os.mkdir(dir_flow_train_imgs) - os.mkdir(dir_flow_train_labels) - - os.mkdir(dir_flow_eval_imgs) - os.mkdir(dir_flow_eval_labels) - - # writing patches into a sub-folder in order to be flowed from directory. 
- def gen(dir_img, dir_lab, dir_flow_imgs, dir_flow_labs, augmentation=True): - indexer = 0 - for img, lab in tqdm(preprocess_imgs(_config, - dir_img, - dir_lab, - augmentation=augmentation), - desc="data_is_provided"): - fname = 'img_%d.png' % indexer - cv2.imwrite(os.path.join(dir_flow_imgs, fname), img) - cv2.imwrite(os.path.join(dir_flow_labs, fname), lab) - indexer += 1 - gen(*get_dirs_or_files(dir_train), - dir_flow_train_imgs, - dir_flow_train_labels) - gen(*get_dirs_or_files(dir_eval), - dir_flow_eval_imgs, - dir_flow_eval_labels, - augmentation=False) - if weighted_loss: weights = np.zeros(n_classes) if data_is_provided: @@ -594,6 +559,52 @@ def run(_config, optimizer=Adam(learning_rate=learning_rate), metrics=metrics) + if reload_weights: + model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() + dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) + model.save(dir_save, include_optimizer=False) + with open(os.path.join(dir_save, "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON + _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) + return + + if not data_is_provided: + # first create a directory in output for both training and evaluations + # in order to flow data from these directories. + if os.path.isdir(dir_train_flowing): + os.system('rm -rf ' + dir_train_flowing) + os.makedirs(dir_train_flowing) + + if os.path.isdir(dir_eval_flowing): + os.system('rm -rf ' + dir_eval_flowing) + os.makedirs(dir_eval_flowing) + + os.mkdir(dir_flow_train_imgs) + os.mkdir(dir_flow_train_labels) + + os.mkdir(dir_flow_eval_imgs) + os.mkdir(dir_flow_eval_labels) + + # writing patches into a sub-folder in order to be flowed from directory. 
+ def gen(dir_img, dir_lab, dir_flow_imgs, dir_flow_labs, augmentation=True): + indexer = 0 + for img, lab in tqdm(preprocess_imgs(_config, + dir_img, + dir_lab, + augmentation=augmentation), + desc="data_is_provided"): + fname = 'img_%d.png' % indexer + cv2.imwrite(os.path.join(dir_flow_imgs, fname), img) + cv2.imwrite(os.path.join(dir_flow_labs, fname), lab) + indexer += 1 + gen(*get_dirs_or_files(dir_train), + dir_flow_train_imgs, + dir_flow_train_labels) + gen(*get_dirs_or_files(dir_eval), + dir_flow_eval_imgs, + dir_flow_eval_labels, + augmentation=False) + def _to_cv2float(img): # rgb→bgr and uint8→float, as expected by Eynollah models return tf.cast(tf.reverse(img, [-1]), tf.float32) / 255 @@ -701,8 +712,25 @@ def run(_config, image_width=input_width, n_classes=n_classes, max_seq=max_len) + #initial_learning_rate = 1e-4 + #decay_steps = int (n_epochs * ( len_dataset / n_batch )) + #alpha = 0.01 + #lr_schedule = 1e-4 + #tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate, decay_steps, alpha) + opt = Adam(learning_rate=learning_rate) + model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer + #print(model.summary()) + if reload_weights: + model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() + dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) + model.save(dir_save, include_optimizer=False) + with open(os.path.join(dir_save, "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON + _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) + return + # todo: use Dataset.map() on Dataset.list_files() def get_dataset(dir_img, dir_lab): def gen(): @@ -726,14 +754,6 @@ def run(_config, train_ds = get_dataset(*get_dirs_or_files(dir_train)) valdn_ds = get_dataset(*get_dirs_or_files(dir_eval)) - #initial_learning_rate = 1e-4 - #decay_steps = int (n_epochs * ( len_dataset / n_batch )) - #alpha = 0.01 - #lr_schedule = 
1e-4 - #tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate, decay_steps, alpha) - opt = Adam(learning_rate=learning_rate) - model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), EarlyStopping(verbose=1, patience=3, restore_best_weights=False, start_from_epoch=3), SaveWeightsAfterSteps(0, dir_output, _config)] @@ -762,6 +782,15 @@ def run(_config, optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? metrics=['accuracy', F1Score(average='macro', name='f1')]) + if reload_weights: + model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() + dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) + model.save(dir_save, include_optimizer=False) + with open(os.path.join(dir_save, "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON + _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) + return + list_classes = list(classification_classes_name.values()) data_args = dict(label_mode="categorical", class_names=list_classes, @@ -805,6 +834,21 @@ def run(_config, weight_decay, pretraining) + #f1score_tot = [0] + model.compile(loss="binary_crossentropy", + #optimizer=SGD(learning_rate=0.01, momentum=0.9), + optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? 
+ metrics=['accuracy']) + + if reload_weights: + model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() + dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) + model.save(dir_save, include_optimizer=False) + with open(os.path.join(dir_save, "config.json"), "w") as fp: + json.dump(_config, fp) # encode dict into JSON + _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) + return + + dir_flow_train_imgs = os.path.join(dir_train, 'images') + dir_flow_train_labels = os.path.join(dir_train, 'labels') @@ -815,12 +859,6 @@ def run(_config, num_rows = len(classes) #ls_test = os.listdir(dir_flow_train_labels) - #f1score_tot = [0] - model.compile(loss="binary_crossentropy", - #optimizer=SGD(learning_rate=0.01, momentum=0.9), - optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? - metrics=['accuracy']) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: From 2747385f89bfb52db638a37457c1dfae0a27f246 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 7 May 2026 17:15:15 +0200 Subject: [PATCH 2/4] remove unused deprecation-warning-causing biopython dependency --- requirements.txt | 1 - src/eynollah/utils/utils_ocr.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 53d1e39..d79853f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,4 @@ tensorflow tf-keras # avoid keras 3 (also needs TF_USE_LEGACY_KERAS=1) numba <= 0.58.1 scikit-image -biopython tabulate diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 928c164..93d1137 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -7,7 +7,6 @@ import tensorflow as tf from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from PIL import Image, ImageDraw, ImageFont -from Bio import pairwise2 from 
.resize import resize_image @@ -503,8 +502,3 @@ def return_rnn_cnn_ocr_of_given_textlines(image, ocr_textline_in_textregion.append(text_textline) ocr_all_textlines.append(ocr_textline_in_textregion) return ocr_all_textlines - -def biopython_align(str1, str2): - alignments = pairwise2.align.globalms(str1, str2, 2, -1, -2, -2) - best_alignment = alignments[0] # Get the best alignment - return best_alignment.seqA, best_alignment.seqB From 34a9d458ce7723006d6e5ccb48045d396738d254 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 7 May 2026 18:09:27 +0200 Subject: [PATCH 3/4] training deps: use sacred fork w/o pkg_resources, pin tf/tf_keras, protobuf packages to work with tensorflow_addons --- train/requirements.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/train/requirements.txt b/train/requirements.txt index 6f23d76..090bc50 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -1,7 +1,10 @@ -sacred +ocrd-fork-sacred >= 0.8.7.post1 seaborn numpy tqdm imutils scipy -tensorflow-addons # for connected_components +tensorflow-addons # for connected_components, depublished and only compatible with tensorflow < 2.16 +tensorflow < 2.16 # for tensorflow-addons, so only needed in training +tf_keras < 2.16 # for tensorflow-addons, so only needed in training +protobuf < 5 # for tensorflow-addons, so only needed in training From a0bf1b51f4b10716f69d46fa5ad517ae9008eadf Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 7 May 2026 19:30:29 +0200 Subject: [PATCH 4/4] makefile to reload models --- src/eynollah/training/reload-models-v0.8.mk | 48 +++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/eynollah/training/reload-models-v0.8.mk diff --git a/src/eynollah/training/reload-models-v0.8.mk b/src/eynollah/training/reload-models-v0.8.mk new file mode 100644 index 0000000..b7a38dd --- /dev/null +++ b/src/eynollah/training/reload-models-v0.8.mk @@ -0,0 +1,48 @@ +SHELL = bash -e + +MODELS_SRC = models_eynollah +MODELS_DST = 
reloaded/models_eynollah + + +# $(MODELS_DST)/eynollah-binarization_20210425 \ +# $(MODELS_DST)/eynollah-column-classifier_20210425 \ +# $(MODELS_DST)/eynollah-enhancement_20210425 \ +# $(MODELS_DST)/eynollah-main-regions-aug-rotation_20210425 \ +# $(MODELS_DST)/eynollah-main-regions-aug-scaling_20210425 \ +# $(MODELS_DST)/eynollah-main-regions-ensembled_20210425 \ +# $(MODELS_DST)/eynollah-main-regions_20220314 \ +# $(MODELS_DST)/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18 \ +# $(MODELS_DST)/eynollah-tables_20210319 \ +# $(MODELS_DST)/model_eynollah_ocr_cnnrnn_20250930 \ + +RELOADABLE_MODELS = \ + $(MODELS_DST)/model_eynollah_page_extraction_20250915 \ + $(MODELS_DST)/model_eynollah_reading_order_20250824 \ + $(MODELS_DST)/modelens_e_l_all_sp_0_1_2_3_4_171024 \ + $(MODELS_DST)/modelens_full_lay_1__4_3_091124 \ + $(MODELS_DST)/modelens_table_0t4_201124 \ + $(MODELS_DST)/modelens_textline_0_1__2_4_16092024 + +all: $(RELOADABLE_MODELS) + +$(MODELS_DST)/%: $(MODELS_SRC)/% + mkdir -p $@ + test -e $&1 | tee $(notdir $<).log + cp $