From a1449da1d189887fff1683815b753216390452c8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 May 2026 03:32:19 +0200 Subject: [PATCH 01/32] Revert "fix model loading in mb_ro and ocr" This reverts commit 218a95e6a0c8881ca1919b3f00d8b15475d03e1f. --- src/eynollah/eynollah_ocr.py | 12 ++++++------ src/eynollah/mb_ro_on_layout.py | 5 +++-- .../model_zoo/.nfs00000002feddea7d00000031 | Bin 20480 -> 0 bytes src/eynollah/model_zoo/model_zoo.py | 8 ++------ 4 files changed, 11 insertions(+), 14 deletions(-) delete mode 100644 src/eynollah/model_zoo/.nfs00000002feddea7d00000031 diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 1b49077..3c918e5 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -65,14 +65,14 @@ class Eynollah_ocr: self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size if tr_ocr: - self.model_zoo.load_models('trocr_processor') - self.model_zoo.load_models(['ocr', 'tr']) + self.model_zoo.load_model('trocr_processor') + self.model_zoo.load_model('ocr', 'tr') self.model_zoo.get('ocr').to(self.device) else: - self.model_zoo.load_models('ocr') - self.model_zoo.load_models('num_to_char') - self.model_zoo.load_models('characters') - self.end_character = len(self.model_zoo.get('characters')) + 2 + self.model_zoo.load_model('ocr', '') + self.model_zoo.load_model('num_to_char') + self.model_zoo.load_model('characters') + self.end_character = len(self.model_zoo.get('characters', list)) + 2 @property def device(self): diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index b0b5910..22fe97b 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -19,6 +19,7 @@ import statistics os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf +from tensorflow.keras.models import Model from .model_zoo import EynollahModelZoo from .utils.resize import resize_image @@ -49,7 +50,7 @@ class machine_based_reading_order_on_layout: except: self.logger.warning("no GPU device available") - self.model_zoo.load_models('reading_order') + self.model_zoo.load_model('reading_order') def read_xml(self, xml_file): tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) @@ -675,7 +676,7 @@ class machine_based_reading_order_on_layout: tot_counter += 1 batch.append(j) if tot_counter % inference_bs == 0 or tot_counter == len(ij_list): - y_pr = self.model_zoo.get('reading_order').predict(input_1 , verbose='0') + y_pr = self.model_zoo.get('reading_order', Model).predict(input_1 , verbose='0') for jb, j in enumerate(batch): if y_pr[jb][0]>=0.5: post_list.append(j) diff --git a/src/eynollah/model_zoo/.nfs00000002feddea7d00000031 b/src/eynollah/model_zoo/.nfs00000002feddea7d00000031 deleted file mode 100644 index c7dd87d9c2ff0b51a6c1e942f4d82b5b4ac0da25..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeHPYm6jS6)phLD>I#IcRopbiVHLHw2jwLXbz*quf z35+E$mcUp7V+o8UFqXjomjsgDiPk64r4v-Q7S;W2h5JeB`4j5iDSSUsJzt~l4;Q|x z$MJ0}fw2U}5*SNhEP=5E#u6AyU@U>L1jZ5=OJFR4u>{@%3AioGqS*Z*3iz@AujBt$ z&$O&R0?z=C0QUkn1GB(Yz!XpiP6b{#!?L~y+yUGUd=a<-h=B8ebAYpff1YkxPXkW@ zj{-jg?gHk4tATTYXHK)M2Y~g!YwyA{@MGW>;5uLq*bUggnZOyqE2mo4?}3Ma?*MlL ztAG!D5P0^Tcm~b{eshXty$Jjqcm%i`xCv+h2Z3F{YbRUQpMeK}1ZV*NdWU5_2;2oM z1J?m_z)s*|;Q6`V&A@X=oIC{F0=U2ypaT3ILVp6d3qT6S`Y3Q3@Dvg; z_W?feKHyc1%hSM9z)t}&PL{>g6?4B`7f&tTW`5|l9C1dXW{N~S=|=&hlD0!7cq><3 zzqI?kmYeVz%1@>RhzwrL$%7Y7d|wQs!MvTJv`+kXqs6$+*2#yZ75UiW%U+XD%iD1q zWz%eL#9Ll738R`|*AEvLc~lp?=c^N6g03VrSLwM)v%_1Jd9-pM>T`=qWUrLVxr?Yw zzC&}3sf*{WY;o>;?Lr}&2d#L;OFFfxqsUb!8@XE4LXNG7$KDP9L$=mrN)anEm0<`S zKXSChab6ZEla*DjH1}gZqQ^Ais9MyNTy;pNAk!?dTv2q?i#>7vsL5+7L(qjJ$_Tw3 zMuuDF1PqL{?*$y;g`YEScgQSH1R;$uBFkI{Ml3@n}5T8T2Iy=zs`J8^v zXMPJwmtM%X^QPHR1}H9DAlW<2oKD&kV$8e-$9wU$7Wa75ZhJlt+%7lszN~yhs*;tH zuNT}H#w41wuMfR|sE%qEszwSigtYC&j=K`|hRvut z$u>ZbGCi`W23;U5K4`N_`dh0uJ8tAQ6CTCYI;#f#u9Jigz0A&Qst0iig7iDK7kN^X zq;+$3CE8^-@^FgJ2vX}0@o*q=gV={%SA~xxgC4J|Do0sDssX;BiY}B^$%54|?IMn9 z-ZGyQlC!8Wx^&}0IZ38QQo4GBErg*D_Q_BiRg&B*(!*3Z-#t;n)XpnnW)RFuD5T|t z%RGv_7LVC_y$qzUEYSt^=M4!r)mIvoB&rE*l3b*kY{Ni@{xVd_j0MpWgydRhW91BB z3Tl--l1(xTx&cPkPP8fXhFusljLU^_1+x-5pvPMmFA#Y%8j})sMkrU2HmS1geJWd{ z2UZ-*R`!n!Q#6!|Om){JoK z7w9UPwXe_#(3&&8j`%1n2;R3i?={2TptfUX=eC2h2b|q|cFfE=`}fXl-*0H8T`H(- zyRw@O>M~_`Su4aGBg@KHy*O#GBkIFZo9)LctIztfsVO1ODP0dbP0G*=qlh;Xtdn$u z*J)P_56Wf>?&V;p+O{AT*D^XH3Q1nkNBV(DMfP%5xRrNQ>}#M1B2%|bGB>H3>d*?H0lYQ!ZpOvj}r0%L>L-bY^&lSrN)$)*3?bOlJq|;fd7)#uRzF067P5rDt4}>eH8G?6K zXEe(7bkLQPg&Qrk(h-wfwu3H(F`BY^Y{HR|-+Ne2eQ!a3CoPm8XwvGpdJu%)H7Wkr`}dC`uD=yH0_+27z&XI#z_W@8Srn!@{a+J0=EG%umyMtvHN|%9l&RS8Q?77WyI|F0Q0~e;A-Fk;61=U z5X1ioxEr_u7yvtg2Jivk-N2KG;hzA$1>6X11&$$x|0?hm;M0Hy%mKTAb->BMZxP4; z3|Ix$11}?f|2A+ZFbhlrRp3nEZ;0Wa1Re&y0o)7>fNO!hz#ibkz&$AYD-A1T$DMxC3Ri-fSv;sb`A)JOa*BMI z0-4-UH0&ilz;cyYududt+}KUvMG_qZ0jelM35p=Gat-}B+`*a&;(kQ#6->9Aizi`R z<@y})He%nPi9i!^>*2%M-W@&+nuBf_nT(2F8^{q#`%HDv2TMyRF9X;}pSQcobQS;n zr_*1^#HCar6Hn^;)m;?(`#x_K%2I*qL4=4e3% zx;AuGXv=>|KK0Hl-9Yx2su9k-&1BOB7v(kq39#^_fXa zNEeRb%Fu?lx)z5rp;6*z7z>6F(bo*+iOS|cQ|O6I_?dbz!0B)ZuP)&M=3mPm$eg@g~BXNZ9i|+xL|UiZP_ALqnl+HgYs6Pgl)Ssb(xC z5l~}HzL`j^*P3C__7)uq<&BIe_skGn2GWdV=3G8aRA-f_g0&(>43#!Be02~#gXs^2 z`%VUH?XG2nEcBFmCSsTv*~3Do4CB;Ax{wZ&p|;ReF$guv@l97+7$Lr?GHHkXq}PYD zJi%yMHJG;;WY-?q&RQ*bT?8d)2suA$hpp9;68lx86C%8jvr!X4$b-3(%|oXhs54V3 z-_NuNv{Pt<7^YhK+Ce;9$G~)Z7~AQcE=;d!pq%1VEaGNqw&-4F4JGRA5OU^WFhk?X zqa9LVMaK*Ys*SSXZe(GbCXItYD2ZGzz^avFKHrp_Uz9|I@^mwkYGmscaX=T=4z(Pt2(vSgVC^Ig5iW1OCkGfhH0cy8RN-YBlm4EN>c86B^2IF zW;4+Y+5`p;%H?|SCJwoNpU*@Qw({ClDY?p#d_zYoB0B^<_eeD(qY((=*$i ztYlI$6Qnx%wnR(hW|bHtaZ_%bzUVBa?uwCg8suk0t;8${rK3na6mRUVyToad&_0gS z?gwHwr{MdW($^y^qb5DnxwJ!B9wSrLArpy8g&wa*TmXNMc0J1ukSL2fE6CqCFtjkM z((>vmQd=fP{Qq8rV4p`0fZ~6BzW+tU`9A?}1AJf?upW3na2oIm#r?pYz+s>PTmn!& z;046|&jUXKz7E_1aNts43vfPgHt-~J0FMA)1AO2T;C00RzXToxZUsW%8sI|UuZaJD z4crN|fE~aw#QU_r|7D;FTmhT}{0cGuy+8tN0seyc{ujU(fFr==z-7Ssz+;H@KLa#? z*Ad%44tx{vfR6xY0M8?i|2}XNK(YNWvB*4djI&ca=`}ki1t^+m$=K(K3 z#`gjw>sv|QrHck#Ut$MZ4X~MG`m5BzHB>`y23xv@zK>Q7RYj|Yu0VH8IQ2Ep%35q# z*#_47jXV~#IG$Gb!tRK!+3X|Z*mc5%>yZwQr)k>JE=#kcmOEKpJq-8g1-CQniQ>c0p4a{(sO_1j6K!nw7Zxl9p3^%&u z9T;Bo%#Cyn^6go=BSVf{D93;nN^Gl0`WY{QYHB8pDP5%1a+PPvj1np-kl>bREk8=1E8S2c-RWobayc02QC0 z6AnlNvHk8cZ|xQvIkcC=uH3P6E5$s>Op_Jx)OLip4ovl@ouo_A| zO$8XXjUs_%F`nj4!Mo8bc}YCf7g1Gjgz3G#wHwgwQ6JJB;*256buulGaTZd#${wWq zxIxIS*n1FLYjhL}Q`Pl+N`{s9vYP#tYh+PuPnr_gkXl6@V3DyM<@QYbai&>ieD?M6 zANX{oE0%AbNDRFyj2l(@Mpbelc&4)+{a`6T`mH)b#7WzhV~(kVJvj_&fDEY!rBy>MWQILI zuF1#z1SiTF8G*U7P)AURkyh$*D!OEqPhy8q2W|pX_{*+fZ{Z%zRC)!5JNp)~;b7OvTzMTH2YAOPkoWr2HQKAk94(I;(#LIMG zb(Ad0#&3sI)}Fx&5qFm?-Kk-B-5j+$vLMK2wjN7{6Z0~&TT!HfugD_vWisKEd5(L& zm(;3_Ap=TPNnfZOX^Uh|*QjUDJ83Flty-^|!g+b5^2)zfqia-;HesAzXApZhq?amR zLG5zRk($pUU1Mox@)N8aYAGw4H!$U5$~NYUVa+6;c*x6=hb#On+ZN3@T;_gsYP0sH zGb4JJJ58aI6PvV6yGa`b_*#+jYDV%zp>oU3N`ZQEa^lM>a9RoDaR2#fR#+sxcHxFH*{ z$=84f^7bHh^(RZv6+0<^KBs diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index 9611388..fffd389 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -94,12 +94,8 @@ class EynollahModelZoo: elif model_category.endswith('_patched'): load_args[0] = model_category[:-8] load_kwargs["patched"] = True - spec = self.specs.get(model_category, load_args[1] if len(load_args) > 1 else '') - if spec.type in ['Keras'] and spec.category != 'ocr': - ret[model_category] = Predictor(self.logger, self) - ret[model_category].load_model(*load_args, **load_kwargs, device=device) - else: - ret[model_category] = self.load_model(*load_args, **load_kwargs, device=device) + ret[model_category] = Predictor(self.logger, self) + ret[model_category].load_model(*load_args, **load_kwargs, device=device) self._loaded.update(ret) return self._loaded From 7e8b9311d3eb96b320b928088c4b9a3a88e882cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 May 2026 03:32:37 +0200 Subject: [PATCH 02/32] Revert "test_model_zoo: fix calls" This reverts commit 5a98f55be365e740c31aabf05f339749b6e2c6fd. --- tests/test_model_zoo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model_zoo.py b/tests/test_model_zoo.py index 9d37431..2042b28 100644 --- a/tests/test_model_zoo.py +++ b/tests/test_model_zoo.py @@ -6,11 +6,11 @@ def test_trocr1( model_zoo = EynollahModelZoo(model_dir) try: from transformers import TrOCRProcessor, VisionEncoderDecoderModel - model_zoo.load_models('trocr_processor') - proc = model_zoo.get('trocr_processor') + model_zoo.load_model('trocr_processor') + proc = model_zoo.get('trocr_processor', TrOCRProcessor) assert isinstance(proc, TrOCRProcessor) - model_zoo.load_models(['ocr', 'tr']) - model = model_zoo.get('ocr') + model_zoo.load_model('ocr', 'tr') + model = model_zoo.get('ocr', VisionEncoderDecoderModel) assert isinstance(model, VisionEncoderDecoderModel) except ImportError: pass From 98e6fbbcbbe5c1ebc8f9729cbcd6c8864ad389ac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 May 2026 11:30:39 +0200 Subject: [PATCH 03/32] mbreorder: make work again, re-use Eynollah base class --- src/eynollah/cli/cli_readingorder.py | 11 ++++++++--- src/eynollah/mb_ro_on_layout.py | 29 ++++++++++++++++------------ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/eynollah/cli/cli_readingorder.py b/src/eynollah/cli/cli_readingorder.py index 0f44b7f..eed9fb9 100644 --- a/src/eynollah/cli/cli_readingorder.py +++ b/src/eynollah/cli/cli_readingorder.py @@ -20,14 +20,19 @@ import click type=click.Path(exists=True, file_okay=False), required=True, ) +@click.option( + "--device", + "-D", + help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", +) @click.pass_context -def readingorder_cli(ctx, input, dir_in, out): +def readingorder_cli(ctx, input, dir_in, out, device): """ Generate ReadingOrder with a ML model """ - from ..mb_ro_on_layout import machine_based_reading_order_on_layout + from ..mb_ro_on_layout import Reorder assert bool(input) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." - orderer = machine_based_reading_order_on_layout(model_zoo=ctx.obj.model_zoo) + orderer = Reorder(model_zoo=ctx.obj.model_zoo, device=device) orderer.run(xml_filename=input, dir_in=dir_in, dir_out=out, diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index 22fe97b..5725ba1 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -21,6 +21,7 @@ os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.models import Model +from .eynollah import Eynollah from .model_zoo import EynollahModelZoo from .utils.resize import resize_image from .utils.contour import ( @@ -34,23 +35,27 @@ DPI_THRESHOLD = 298 KERNEL = np.ones((5, 5), np.uint8) -class machine_based_reading_order_on_layout: +class Reorder(Eynollah): def __init__( - self, - *, - model_zoo: EynollahModelZoo, - logger : Optional[logging.Logger] = None, + self, + *, + model_zoo: EynollahModelZoo, + logger : Optional[logging.Logger] = None, + device: str = '', ): self.logger = logger or logging.getLogger('eynollah.mbreorder') self.model_zoo = model_zoo - try: - for device in tf.config.list_physical_devices('GPU'): - tf.config.experimental.set_memory_growth(device, True) - except: - self.logger.warning("no GPU device available") - self.model_zoo.load_model('reading_order') + self.setup_models(device=device) + + def setup_models(self, device=''): + loadable = ['reading_order'] + self.model_zoo.load_models(*loadable, device=device) + for model in loadable: + self.logger.debug("model %s has input shape %s", model, + self.model_zoo.get(model).input_shape) + def read_xml(self, xml_file): tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) @@ -676,7 +681,7 @@ class machine_based_reading_order_on_layout: tot_counter += 1 batch.append(j) if tot_counter % inference_bs == 0 or tot_counter == len(ij_list): - y_pr = self.model_zoo.get('reading_order', Model).predict(input_1 , verbose='0') + y_pr = self.model_zoo.get('reading_order').predict(input_1, verbose=0) for jb, j in enumerate(batch): if y_pr[jb][0]>=0.5: post_list.append(j) From ded668a2562d2dc59646554a06338303cf2a6034 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 18:17:43 +0200 Subject: [PATCH 04/32] =?UTF-8?q?model=5Fzoo:=20fix=20clash=20between=20Pr?= =?UTF-8?q?edictor=20and=20direct=20(OCR)=20use-cases=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `load_models()`: uniformly handle arg types - `load_model()`: move handling of non-model categories to `load_models()` - `load_model()`: move SavedModel preference over HDF5 to `model_path()` - `_load_ocr_model()`: add user-selected device handling and reporting for Torch (as for TF) - `_load_ocr_model()`: move (TF-based) CNN-RNN case to `load_model()` (including Keras layer mapping) - `shutdown()`: only apply `shutdown()` to Predictor model types --- src/eynollah/model_zoo/model_zoo.py | 144 +++++++++++++++++----------- 1 file changed, 87 insertions(+), 57 deletions(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index fffd389..7f3cd6c 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -70,6 +70,9 @@ class EynollahModelZoo: model_path = Path(self.model_basedir).joinpath(spec.filename) else: model_path = Path(spec.filename) + if model_path.suffix == '.h5' and Path(model_path.stem).exists(): + # prefer SavedModel over HDF5 format if it exists + model_path = Path(model_path.stem) return model_path def load_models( @@ -82,28 +85,50 @@ class EynollahModelZoo: """ ret = {} # cannot use self._loaded here, yet – first spawn all predictors for load_args in all_load_args: + load_kwargs = dict(device=device) if isinstance(load_args, str): - model_category = load_args - load_args = [model_category] + model_category, model_variant = load_args, "" + elif len(load_args) > 2: + # for calls to self.model_path + self.override_models(load_args) + # for calls to Predictor.load_model + model_category, model_variant, model_path = load_args + load_kwargs["model_variant"] = model_variant + load_kwargs["model_path_override"] = model_path else: - model_category = load_args[0] - load_kwargs = {} + model_category, model_variant = load_args + load_kwargs["model_variant"] = model_variant + if model_category.endswith('_resized'): - load_args[0] = model_category[:-8] + model_category = model_category[:-8] load_kwargs["resized"] = True elif model_category.endswith('_patched'): - load_args[0] = model_category[:-8] + model_category = model_category[:-8] load_kwargs["patched"] = True - ret[model_category] = Predictor(self.logger, self) - ret[model_category].load_model(*load_args, **load_kwargs, device=device) + + if model_category == 'ocr': + model = self._load_ocr_model(variant=model_variant, device=device) + elif model_category == 'num_to_char': + model = self._load_num_to_char() + elif model_category == 'characters': + model = self._load_characters() + elif model_category == 'trocr_processor': + from transformers import TrOCRProcessor + model_path = self.model_path(model_category, model_variant) + model = TrOCRProcessor.from_pretrained(model_path) + else: + model = Predictor(self.logger, self) + model.load_model(model_category, **load_kwargs) + + ret[model_category] = model self._loaded.update(ret) return self._loaded def load_model( - self, - model_category: str, - model_variant: str = '', - model_path_override: Optional[str] = None, + self, + model_category: str, + model_variant: str = '', + model_path_override: Optional[str] = None, patched: bool = False, resized: bool = False, device: str = '', @@ -117,6 +142,7 @@ class EynollahModelZoo: import tensorflow as tf from tensorflow.keras.models import load_model + from tensorflow.keras.models import Model as KerasModel from ..patch_encoder import ( PatchEncoder, @@ -162,38 +188,33 @@ class EynollahModelZoo: if model_path_override: self.override_models((model_category, model_variant, model_path_override)) model_path = self.model_path(model_category, model_variant) - if model_path.suffix == '.h5' and Path(model_path.stem).exists(): - # prefer SavedModel over HDF5 format if it exists - model_path = Path(model_path.stem) - if model_category == 'ocr': - model = self._load_ocr_model(variant=model_variant) - elif model_category == 'num_to_char': - model = self._load_num_to_char() - elif model_category == 'characters': - model = self._load_characters() - elif model_category == 'trocr_processor': - from transformers import TrOCRProcessor - model = TrOCRProcessor.from_pretrained(model_path) + try: + # avoid wasting VRAM on non-transformer models + model = load_model(model_path, compile=False) + except Exception as e: + self.logger.error(e) + model = load_model( + model_path, compile=False, + custom_objects=dict(PatchEncoder=PatchEncoder, + Patches=Patches)) + assert isinstance(model, KerasModel) + model._name = model_category + if resized: + model = wrap_layout_model_resized(model) + model._name = model_category + '_resized' + elif patched: + model = wrap_layout_model_patched(model) + model._name = model_category + '_patched' else: - try: - # avoid wasting VRAM on non-transformer models - model = load_model(model_path, compile=False) - except Exception as e: - self.logger.error(e) - model = load_model( - model_path, compile=False, - custom_objects=dict(PatchEncoder=PatchEncoder, - Patches=Patches)) - model._name = model_category - if resized: - model = wrap_layout_model_resized(model) - model._name = model_category + '_resized' - elif patched: - model = wrap_layout_model_patched(model) - model._name = model_category + '_patched' - else: - model.jit_compile = True - model.make_predict_function() + model.jit_compile = True + + if model_category == 'ocr': + model = KerasModel( + model.get_layer(name="image").input, # type: ignore + model.get_layer(name="dense2").output, # type: ignore + ) + + model.make_predict_function() return model def get(self, model_category: str) -> Predictor: @@ -201,26 +222,34 @@ class EynollahModelZoo: raise ValueError(f'Model "{model_category}" not previously loaded with "load_model(..)"') return self._loaded[model_category] - def _load_ocr_model(self, variant: str) -> AnyModel: + def _load_ocr_model(self, variant: str, device: str = "") -> AnyModel: """ Load OCR model """ - from tensorflow.keras.models import Model as KerasModel - from tensorflow.keras.models import load_model - - ocr_model_dir = self.model_path('ocr', variant) + model_dir = self.model_path('ocr', variant) if variant == 'tr': from transformers import VisionEncoderDecoderModel - ret = VisionEncoderDecoderModel.from_pretrained(ocr_model_dir) + import torch + ret = VisionEncoderDecoderModel.from_pretrained(model_dir) assert isinstance(ret, VisionEncoderDecoderModel) + dev = torch.device('cpu') + if not device and torch.cuda.is_available(): + device = 'GPU' # try + if device and device.startswith('GPU'): + try: + dev = torch.device('cuda', int(device[3:] or 0)) + name = torch.cuda.get_device_name(dev) + self.logger.info("using GPU %s (%s) for model ocr:tr", dev, name) + except: + self.logger.exception("cannot configure GPU device") + dev = torch.device('cpu') + if dev.type == 'cuda': + ret.to(dev) + else: + self.logger.warning("no GPU device available") return ret - else: - ocr_model = load_model(ocr_model_dir, compile=False) - assert isinstance(ocr_model, KerasModel) - return KerasModel( - ocr_model.get_layer(name="image").input, # type: ignore - ocr_model.get_layer(name="dense2").output, # type: ignore - ) + + return self.load_model('ocr', model_variant=variant, device=device) def _load_characters(self) -> List[str]: """ @@ -273,5 +302,6 @@ class EynollahModelZoo: """ if hasattr(self, '_loaded') and getattr(self, '_loaded'): for needle in list(self._loaded.keys()): - self._loaded[needle].shutdown() + if isinstance(self._loaded[needle], Predictor): + self._loaded[needle].shutdown() del self._loaded[needle] From cd62f13872419deb3c8740e8d5ded6a21cdec3c9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 18:31:18 +0200 Subject: [PATCH 05/32] =?UTF-8?q?eynollah=5Focr:=20make=20work=20again,=20?= =?UTF-8?q?re-use=20Eynollah=20base=20class=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - re-use Eynollah base class - use `ModelZoo.load_models()` instead of `load_model()` - pass in `device` init kwarg, delegate to `ModelZoo.load_models()` - `device`: return Torch device at loaded model tensors instead of ad-hoc selection - make numeric init kwargs non-optional (only numeric) --- src/eynollah/cli/cli_ocr.py | 10 ++++++-- src/eynollah/eynollah_ocr.py | 48 ++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/eynollah/cli/cli_ocr.py b/src/eynollah/cli/cli_ocr.py index 406af61..f9b74c8 100644 --- a/src/eynollah/cli/cli_ocr.py +++ b/src/eynollah/cli/cli_ocr.py @@ -66,6 +66,10 @@ import click "--min_conf_value_of_textline_text", "-min_conf", help="minimum OCR confidence value. Text lines with a confidence value lower than this threshold will not be included in the output XML file.", +@click.option( + "--device", + "-D", + help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", ) @click.pass_context def ocr_cli( @@ -81,18 +85,20 @@ def ocr_cli( do_not_mask_with_textline_contour, batch_size, min_conf_value_of_textline_text, + device, ): """ Recognize text with a CNN/RNN or transformer ML model. """ - assert bool(image) ^ bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both." + assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both." from ..eynollah_ocr import Eynollah_ocr eynollah_ocr = Eynollah_ocr( model_zoo=ctx.obj.model_zoo, tr_ocr=tr_ocr, do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, batch_size=batch_size, - min_conf_value_of_textline_text=min_conf_value_of_textline_text) + min_conf_value_of_textline_text=min_conf_value_of_textline_text, + device=device) eynollah_ocr.run(overwrite=overwrite, dir_in=dir_in, dir_in_bin=dir_in_bin, diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 3c918e5..4470671 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -14,16 +14,17 @@ from cv2.typing import MatLike from xml.etree import ElementTree as ET from PIL import Image, ImageDraw import numpy as np -from eynollah.model_zoo import EynollahModelZoo -from eynollah.utils.font import get_font -from eynollah.utils.xml import etree_namespace_for_element_tag try: import torch except ImportError: torch = None +from .eynollah import Eynollah +from .model_zoo import EynollahModelZoo from .utils import is_image_filename +from .utils.font import get_font +from .utils.xml import etree_namespace_for_element_tag from .utils.resize import resize_image from .utils.utils_ocr import ( break_curved_line_into_small_pieces_and_then_merge, @@ -44,45 +45,44 @@ class EynollahOcrResult: cropped_lines_region_indexer: List total_bb_coordinates:List -class Eynollah_ocr: +class Eynollah_ocr(Eynollah): def __init__( self, *, model_zoo: EynollahModelZoo, tr_ocr=False, - batch_size: Optional[int]=None, + batch_size: int=0, do_not_mask_with_textline_contour: bool=False, - min_conf_value_of_textline_text : Optional[float]=None, + min_conf_value_of_textline_text : float=0.3, logger: Optional[Logger]=None, + device: str = '', ): self.tr_ocr = tr_ocr # masking for OCR and GT generation, relevant for skewed lines and bounding boxes self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour self.logger = logger if logger else getLogger('eynollah.ocr') - self.model_zoo = model_zoo - self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3 - self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size + self.min_conf_value_of_textline_text = min_conf_value_of_textline_text + self.b_s = batch_size or 2 if tr_ocr else 8 - if tr_ocr: - self.model_zoo.load_model('trocr_processor') - self.model_zoo.load_model('ocr', 'tr') - self.model_zoo.get('ocr').to(self.device) + self.model_zoo = model_zoo + self.setup_models(device=device) + + def setup_models(self, device=''): + if self.tr_ocr: + self.model_zoo.load_models('trocr_processor', + ('ocr', 'tr'), + device=device) else: - self.model_zoo.load_model('ocr', '') - self.model_zoo.load_model('num_to_char') - self.model_zoo.load_model('characters') - self.end_character = len(self.model_zoo.get('characters', list)) + 2 + self.model_zoo.load_models('ocr', + 'num_to_char', + 'characters', + device=device) + self.end_character = len(self.model_zoo.get('characters')) + 2 @property def device(self): - assert torch - if torch.cuda.is_available(): - self.logger.info("Using GPU acceleration") - return torch.device("cuda:0") - else: - self.logger.info("Using CPU processing") - return torch.device("cpu") + return self.model_zoo.get('ocr').device def run_trocr( self, From 7ed1a1ebac0c4b34db02b254c3dbb5c3d639ed9c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 18:34:56 +0200 Subject: [PATCH 06/32] CLIs: allow `-h` and show defaults uniformly, harmonise help, drop remaining redundant negative options --- src/eynollah/cli/cli_binarize.py | 4 +++- src/eynollah/cli/cli_enhance.py | 4 +++- src/eynollah/cli/cli_extract_images.py | 32 +++++++++++++++----------- src/eynollah/cli/cli_ocr.py | 21 +++++++++++------ src/eynollah/cli/cli_readingorder.py | 4 +++- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/eynollah/cli/cli_binarize.py b/src/eynollah/cli/cli_binarize.py index f0e56f5..d544a67 100644 --- a/src/eynollah/cli/cli_binarize.py +++ b/src/eynollah/cli/cli_binarize.py @@ -1,6 +1,8 @@ import click -@click.command() +@click.command(context_settings=dict( + help_option_names=['-h', '--help'], + show_default=True)) @click.option( '--patches/--no-patches', default=True, diff --git a/src/eynollah/cli/cli_enhance.py b/src/eynollah/cli/cli_enhance.py index 517e1e8..42b1d41 100644 --- a/src/eynollah/cli/cli_enhance.py +++ b/src/eynollah/cli/cli_enhance.py @@ -1,6 +1,8 @@ import click -@click.command() +@click.command(context_settings=dict( + help_option_names=['-h', '--help'], + show_default=True)) @click.option( "--image", "-i", diff --git a/src/eynollah/cli/cli_extract_images.py b/src/eynollah/cli/cli_extract_images.py index 0add5b5..acd31f1 100644 --- a/src/eynollah/cli/cli_extract_images.py +++ b/src/eynollah/cli/cli_extract_images.py @@ -1,6 +1,8 @@ import click -@click.command() +@click.command(context_settings=dict( + help_option_names=['-h', '--help'], + show_default=True)) @click.option( "--image", "-i", @@ -30,36 +32,40 @@ import click @click.option( "--save_images", "-si", - help="if a directory is given, images in documents will be cropped and saved there", + help="if a directory is given, cropped images of pages will be saved there", type=click.Path(exists=True, file_okay=False), ) @click.option( - "--enable-plotting/--disable-plotting", - "-ep/-noep", + "--enable-plotting", + "-ep", is_flag=True, - help="If set, will plot intermediary files and images", + help="plot intermediary diagnostic images to files", ) @click.option( - "--input_binary/--input-RGB", - "-ib/-irgb", + "--input_binary", + "-ib", is_flag=True, - help="In general, eynollah uses RGB as input but if the input document is very dark, very bright or for any other reason you can turn on input binarization. When this flag is set, eynollah will binarize the RGB input document, you should always provide RGB images to eynollah.", + help="In general, eynollah uses RGB as input, but if the input document is very dark, very bright or for any other reason you can turn on internal binarization here. When set, eynollah will binarize the RGB input document first.", ) @click.option( - "--ignore_page_extraction/--extract_page_included", - "-ipe/-epi", + "--ignore_page_extraction", + "-ipe", is_flag=True, - help="if this parameter set to true, this tool would ignore page extraction", + help="ignore page extraction (cropping via page frame detection model)", ) @click.option( "--num_col_upper", "-ncu", - help="lower limit of columns in document image", + default=0, + type=click.IntRange(min=0), + help="lower limit of columns in document image; 0 means autodetected from model", ) @click.option( "--num_col_lower", "-ncl", - help="upper limit of columns in document image", + default=0, + type=click.IntRange(min=0), + help="upper limit of columns in document image; 0 means autodetected from model", ) @click.pass_context def extract_images_cli( diff --git a/src/eynollah/cli/cli_ocr.py b/src/eynollah/cli/cli_ocr.py index f9b74c8..99e03c5 100644 --- a/src/eynollah/cli/cli_ocr.py +++ b/src/eynollah/cli/cli_ocr.py @@ -1,6 +1,8 @@ import click -@click.command() +@click.command(context_settings=dict( + help_option_names=['-h', '--help'], + show_default=True)) @click.option( "--image", "-i", @@ -16,7 +18,7 @@ import click @click.option( "--dir_in_bin", "-dib", - help=("directory of binarized images (in addition to --dir_in for RGB images; filename stems must match the RGB image files, with '.png' \n Perform prediction using both RGB and binary images. (This does not necessarily improve results, however it may be beneficial for certain document images."), + help=("directory of binarized images (in addition to --dir_in for RGB images; filename stems must match the RGB image files, with '.png'. \n Perform prediction using both RGB and binary images. (This may improve results for certain document images.)"), type=click.Path(exists=True, file_okay=False), ) @click.option( @@ -47,25 +49,30 @@ import click ) @click.option( "--tr_ocr", - "-trocr/-notrocr", + "-trocr", is_flag=True, - help="if this parameter set to true, transformer ocr will be applied, otherwise cnn_rnn model.", + help="use transformer OCR (instead of classic CNN-RNN) model", ) @click.option( "--do_not_mask_with_textline_contour", - "-nmtc/-mtc", + "-nmtc", is_flag=True, - help="if this parameter set to true, cropped textline images will not be masked with textline contour.", + help="skip masking each cropped textline image with its corresponding textline contour", ) @click.option( "--batch_size", "-bs", + default=0, + type=click.IntRange(min=0), help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", ) @click.option( "--min_conf_value_of_textline_text", "-min_conf", - help="minimum OCR confidence value. Text lines with a confidence value lower than this threshold will not be included in the output XML file.", + default=0.3, + type=click.FloatRange(min=0.0, max=1.0), + help="minimum OCR confidence threshold. Text lines with a lower confidence value will not be included in the output XML file.", +) @click.option( "--device", "-D", diff --git a/src/eynollah/cli/cli_readingorder.py b/src/eynollah/cli/cli_readingorder.py index eed9fb9..9bb7092 100644 --- a/src/eynollah/cli/cli_readingorder.py +++ b/src/eynollah/cli/cli_readingorder.py @@ -1,6 +1,8 @@ import click -@click.command() +@click.command(context_settings=dict( + help_option_names=['-h', '--help'], + show_default=True)) @click.option( "--input", "-i", From 21ecb043f763045aabc043805acb4d39da0316c9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 18:41:21 +0200 Subject: [PATCH 07/32] CLIs: move `--device` option to group level --- src/eynollah/cli/cli.py | 9 ++++++++- src/eynollah/cli/cli_binarize.py | 9 ++------- src/eynollah/cli/cli_enhance.py | 9 ++------- src/eynollah/cli/cli_layout.py | 8 +------- src/eynollah/cli/cli_ocr.py | 9 ++------- src/eynollah/cli/cli_readingorder.py | 10 +++------- 6 files changed, 18 insertions(+), 36 deletions(-) diff --git a/src/eynollah/cli/cli.py b/src/eynollah/cli/cli.py index ace3f1c..2a4c8d1 100644 --- a/src/eynollah/cli/cli.py +++ b/src/eynollah/cli/cli.py @@ -15,6 +15,7 @@ class EynollahCliCtx: Holds options relevant for all eynollah subcommands """ model_zoo: EynollahModelZoo + device: str = '' log_level : Union[str, None] = 'INFO' @@ -35,6 +36,11 @@ class EynollahCliCtx: type=(str, str, str), multiple=True, ) +@click.option( + "--device", + "-D", + help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", +) @click.option( "--log_level", "-l", @@ -42,7 +48,7 @@ class EynollahCliCtx: help="Override log level globally to this", ) @click.pass_context -def main(ctx, model_basedir, model_overrides, log_level): +def main(ctx, model_basedir, model_overrides, device, log_level): """ eynollah - Document Layout Analysis, Image Enhancement, OCR """ @@ -58,6 +64,7 @@ def main(ctx, model_basedir, model_overrides, log_level): # Initialize CLI context ctx.obj = EynollahCliCtx( model_zoo=model_zoo, + device=device, log_level=log_level, ) diff --git a/src/eynollah/cli/cli_binarize.py b/src/eynollah/cli/cli_binarize.py index d544a67..82209be 100644 --- a/src/eynollah/cli/cli_binarize.py +++ b/src/eynollah/cli/cli_binarize.py @@ -33,11 +33,6 @@ import click help="overwrite (instead of skipping) if output xml exists", is_flag=True, ) -@click.option( - "--device", - "-D", - help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", -) @click.pass_context def binarize_cli( ctx, @@ -46,14 +41,14 @@ def binarize_cli( dir_in, output, overwrite, - device, ): """ Binarize images with a ML model """ from ..sbb_binarize import SbbBinarizer assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." - binarizer = SbbBinarizer(model_zoo=ctx.obj.model_zoo, device=device) + binarizer = SbbBinarizer(model_zoo=ctx.obj.model_zoo, + device=ctx.obj.device) binarizer.run( image_filename=input_image, use_patches=patches, diff --git a/src/eynollah/cli/cli_enhance.py b/src/eynollah/cli/cli_enhance.py index 42b1d41..bcb8263 100644 --- a/src/eynollah/cli/cli_enhance.py +++ b/src/eynollah/cli/cli_enhance.py @@ -48,13 +48,8 @@ import click is_flag=True, help="save the enhanced image in original image size", ) -@click.option( - "--device", - "-D", - help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", -) @click.pass_context -def enhance_cli(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower, save_org_scale, device): +def enhance_cli(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower, save_org_scale): """ Enhance image """ @@ -62,10 +57,10 @@ def enhance_cli(ctx, image, out, overwrite, dir_in, num_col_upper, num_col_lower from ..image_enhancer import Enhancer enhancer = Enhancer( model_zoo=ctx.obj.model_zoo, + device=ctx.obj.device, num_col_upper=num_col_upper, num_col_lower=num_col_lower, save_org_scale=save_org_scale, - device=device, ) enhancer.run(overwrite=overwrite, dir_in=dir_in, diff --git a/src/eynollah/cli/cli_layout.py b/src/eynollah/cli/cli_layout.py index 417b202..0a083d5 100644 --- a/src/eynollah/cli/cli_layout.py +++ b/src/eynollah/cli/cli_layout.py @@ -172,11 +172,6 @@ import click type=click.FloatRange(min=0), help="abort when number of failed images exceeds this value (if >=1) or ratio of failed over total images exceeds this value (if <1); 0 means ignore failures", ) -@click.option( - "--device", - "-D", - help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", -) @click.pass_context def layout_cli( ctx, @@ -207,7 +202,6 @@ def layout_cli( ignore_page_extraction, num_jobs, halt_fail, - device, ): """ Detect Layout (with optional image enhancement and reading order detection) @@ -223,7 +217,7 @@ def layout_cli( assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." eynollah = Eynollah( model_zoo=ctx.obj.model_zoo, - device=device, + device=ctx.obj.device, enable_plotting=enable_plotting, allow_enhancement=allow_enhancement, curved_line=curved_line, diff --git a/src/eynollah/cli/cli_ocr.py b/src/eynollah/cli/cli_ocr.py index 99e03c5..daeccbe 100644 --- a/src/eynollah/cli/cli_ocr.py +++ b/src/eynollah/cli/cli_ocr.py @@ -73,11 +73,6 @@ import click type=click.FloatRange(min=0.0, max=1.0), help="minimum OCR confidence threshold. Text lines with a lower confidence value will not be included in the output XML file.", ) -@click.option( - "--device", - "-D", - help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", -) @click.pass_context def ocr_cli( ctx, @@ -92,7 +87,6 @@ def ocr_cli( do_not_mask_with_textline_contour, batch_size, min_conf_value_of_textline_text, - device, ): """ Recognize text with a CNN/RNN or transformer ML model. @@ -101,11 +95,12 @@ def ocr_cli( from ..eynollah_ocr import Eynollah_ocr eynollah_ocr = Eynollah_ocr( model_zoo=ctx.obj.model_zoo, + device=ctx.obj.device, tr_ocr=tr_ocr, do_not_mask_with_textline_contour=do_not_mask_with_textline_contour, batch_size=batch_size, min_conf_value_of_textline_text=min_conf_value_of_textline_text, - device=device) + ) eynollah_ocr.run(overwrite=overwrite, dir_in=dir_in, dir_in_bin=dir_in_bin, diff --git a/src/eynollah/cli/cli_readingorder.py b/src/eynollah/cli/cli_readingorder.py index 9bb7092..ac52e38 100644 --- a/src/eynollah/cli/cli_readingorder.py +++ b/src/eynollah/cli/cli_readingorder.py @@ -22,19 +22,15 @@ import click type=click.Path(exists=True, file_okay=False), required=True, ) -@click.option( - "--device", - "-D", - help="placement of computations in predictors for each model type; if none (by default), will try to use first available GPU or fall back to CPU; set string to force using a device (e.g. 'GPU0', 'GPU1' or 'CPU'). Can also be a comma-separated list of model category to device mappings (e.g. 'col_classifier:CPU,page:GPU0,*:GPU1')", -) @click.pass_context -def readingorder_cli(ctx, input, dir_in, out, device): +def readingorder_cli(ctx, input, dir_in, out): """ Generate ReadingOrder with a ML model """ from ..mb_ro_on_layout import Reorder assert bool(input) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." - orderer = Reorder(model_zoo=ctx.obj.model_zoo, device=device) + orderer = Reorder(model_zoo=ctx.obj.model_zoo, + device=ctx.obj.device) orderer.run(xml_filename=input, dir_in=dir_in, dir_out=out, From 1ed633bc254ce76e6422d7496661ab5a173e5551 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 19:02:43 +0200 Subject: [PATCH 08/32] test_model_zoo: adapt (`load_models` instead of `load_model`) --- tests/test_model_zoo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model_zoo.py b/tests/test_model_zoo.py index 2042b28..341bc21 100644 --- a/tests/test_model_zoo.py +++ b/tests/test_model_zoo.py @@ -6,11 +6,11 @@ def test_trocr1( model_zoo = EynollahModelZoo(model_dir) try: from transformers import TrOCRProcessor, VisionEncoderDecoderModel - model_zoo.load_model('trocr_processor') - proc = model_zoo.get('trocr_processor', TrOCRProcessor) + model_zoo.load_models('trocr_processor', + ('ocr', 'tr')) + proc = model_zoo.get('trocr_processor') assert isinstance(proc, TrOCRProcessor) - model_zoo.load_model('ocr', 'tr') - model = model_zoo.get('ocr', VisionEncoderDecoderModel) + model = model_zoo.get('ocr') assert isinstance(model, VisionEncoderDecoderModel) except ImportError: pass From 87cce6c9636ff4a0c726fb2be0bbdc37b3838a32 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 19:03:32 +0200 Subject: [PATCH 09/32] CLI tests: add opt-in envvar `EYNOLLAH_OPTIONS` for device selection, model directory etc. --- tests/cli_tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/cli_tests/conftest.py b/tests/cli_tests/conftest.py index 601d76b..2e1501c 100644 --- a/tests/cli_tests/conftest.py +++ b/tests/cli_tests/conftest.py @@ -1,4 +1,5 @@ from typing import List +import os import pytest import logging @@ -31,6 +32,8 @@ def run_eynollah_ok_and_check_logs( subcommand, *args ] + if 'EYNOLLAH_OPTIONS' in os.environ: + args = os.environ['EYNOLLAH_OPTIONS'].split() + args if pytestconfig.getoption('verbose') > 0: args = ['-l', 'DEBUG'] + args caplog.set_level(logging.INFO) From be4fe8c263ed219e8d3df08e53e271d68556e7ff Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 19:04:37 +0200 Subject: [PATCH 10/32] contour: drop unused functions depending on `rotation_image_new()` --- src/eynollah/utils/contour.py | 90 +---------------------------------- src/eynollah/utils/rotate.py | 4 -- 2 files changed, 1 insertion(+), 93 deletions(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f1a7a8e..1dbead1 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -11,7 +11,7 @@ from shapely.geometry.polygon import orient from shapely import set_precision, affinity from shapely.ops import unary_union, nearest_points -from .rotate import rotate_image, rotation_image_new +from .rotate import rotate_image def contours_in_same_horizon(cy_main_hor): """ @@ -120,94 +120,6 @@ def return_contours_of_interested_region(region_pre_p, label, min_area=0.0002, d dilate=dilate) return contours_imgs -def do_work_of_contours_in_image(contour, index_r_con, img, slope_first): - img_copy = np.zeros(img.shape[:2], dtype=np.uint8) - img_copy = cv2.fillPoly(img_copy, pts=[contour], color=1) - - img_copy = rotation_image_new(img_copy, -slope_first) - _, thresh = cv2.threshold(img_copy, 0, 255, 0) - - cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) - cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0]) - - return cont_int[0], index_r_con - -def get_textregion_contours_in_org_image_multi(cnts, img, slope_first, map=map): - if not len(cnts): - return [], [] - results = map(partial(do_work_of_contours_in_image, - img=img, - slope_first=slope_first, - ), - cnts, range(len(cnts))) - return tuple(zip(*results)) - -def get_textregion_contours_in_org_image(cnts, img, slope_first): - cnts_org = [] - # print(cnts,'cnts') - for i in range(len(cnts)): - img_copy = np.zeros(img.shape[:2], dtype=np.uint8) - img_copy = cv2.fillPoly(img_copy, pts=[cnts[i]], color=1) - - # plt.imshow(img_copy) - # plt.show() - - # print(img.shape,'img') - img_copy = rotation_image_new(img_copy, -slope_first) - ##print(img_copy.shape,'img_copy') - # plt.imshow(img_copy) - # plt.show() - - _, thresh = cv2.threshold(img_copy, 0, 255, 0) - - cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) - cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0]) - # print(np.shape(cont_int[0])) - cnts_org.append(cont_int[0]) - - return cnts_org - -def get_textregion_confidences_old(cnts, img, slope_first): - zoom = 3 - img = cv2.resize(img, (img.shape[1] // zoom, - img.shape[0] // zoom), - interpolation=cv2.INTER_NEAREST) - cnts_org = [] - for cnt in cnts: - img_copy = np.zeros(img.shape[:2], dtype=np.uint8) - img_copy = cv2.fillPoly(img_copy, pts=[cnt // zoom], color=1) - - img_copy = rotation_image_new(img_copy, -slope_first).astype(np.uint8) - _, thresh = cv2.threshold(img_copy, 0, 255, 0) - - cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) - cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0]) - cnts_org.append(cont_int[0] * zoom) - - return cnts_org - -def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first, confidence_matrix): - img_copy = np.zeros(img.shape[:2], dtype=np.uint8) - img_copy = cv2.fillPoly(img_copy, pts=[contour_par], color=1) - confidence_matrix_mapped_with_contour = confidence_matrix * img_copy - confidence_contour = np.sum(confidence_matrix_mapped_with_contour) / float(np.sum(img_copy)) - - img_copy = rotation_image_new(img_copy, -slope_first).astype(np.uint8) - _, thresh = cv2.threshold(img_copy, 0, 255, 0) - - cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - if len(cont_int)==0: - cont_int = [contour_par] - confidence_contour = 0 - else: - cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) - cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0]) - return cont_int[0], index_r_con, confidence_contour - def get_region_confidences(cnts, confidence_matrix): if not len(cnts): return [] diff --git a/src/eynollah/utils/rotate.py b/src/eynollah/utils/rotate.py index 6651c4e..e45a438 100644 --- a/src/eynollah/utils/rotate.py +++ b/src/eynollah/utils/rotate.py @@ -2,10 +2,6 @@ import math import cv2 -def rotation_image_new(img, thetha): - rotated = rotate_image(img, thetha) - return rotate_max_area_new(img, rotated, thetha) - def rotate_image(img_patch, slope): (h, w) = img_patch.shape[:2] center = (w // 2, h // 2) From 17b311441a30cd3599b9414be8a734922aa6077d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 20:02:40 +0200 Subject: [PATCH 11/32] model_zoo: also parse comma/colon syntax for `device` in Torch case --- src/eynollah/model_zoo/model_zoo.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index 7f3cd6c..f1d8824 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -154,7 +154,7 @@ class EynollahModelZoo: try: gpus = tf.config.list_physical_devices('GPU') if device: - if ',' in device: + if ':' in device: for spec in device.split(','): cat, dev = spec.split(':') if fnmatchcase(model_category, cat): @@ -235,6 +235,12 @@ class EynollahModelZoo: dev = torch.device('cpu') if not device and torch.cuda.is_available(): device = 'GPU' # try + if device and ':' in device: + for spec in device.split(','): + cat, dev = spec.split(':') + if fnmatchcase('ocr', cat): + device = dev + break if device and device.startswith('GPU'): try: dev = torch.device('cuda', int(device[3:] or 0)) From f329e10a805b57f18c454981757948e52dcabf9d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 May 2026 20:04:41 +0200 Subject: [PATCH 12/32] test_layout: rm ignored `--allow_scaling` option --- tests/cli_tests/test_layout.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/cli_tests/test_layout.py b/tests/cli_tests/test_layout.py index 7cbe013..503aeac 100644 --- a/tests/cli_tests/test_layout.py +++ b/tests/cli_tests/test_layout.py @@ -6,11 +6,12 @@ from ocrd_models.constants import NAMESPACES as NS "options", [ [], # defaults - #["--allow_scaling", "--curved-line"], - ["--allow_scaling", "--curved-line", "--full-layout"], - ["--allow_scaling", "--curved-line", "--full-layout", "--reading_order_machine_based"], + #["--curved-line"], + ["--curved-line", "--full-layout"], + ["--curved-line", "--full-layout", "--reading_order_machine_based"], # -ep ... - # -eoi ... + # --input_binary + # --ignore_page_extraction # --skip_layout_and_reading_order ], ids=str) def test_run_eynollah_layout_filename( From 481c286da9522d1117cc57f1775423e833076325 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 May 2026 02:08:14 +0200 Subject: [PATCH 13/32] ModelZoo.load_model: no XLA compilation --- src/eynollah/model_zoo/model_zoo.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index f1d8824..054552a 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -35,7 +35,7 @@ class EynollahModelZoo: self._overrides = [] if model_overrides: self.override_models(*model_overrides) - self._loaded: Dict[str, Predictor] = {} + self._loaded: Dict[str, Union[Predictor, AnyModel]] = {} @property def model_overrides(self): @@ -197,6 +197,7 @@ class EynollahModelZoo: model_path, compile=False, custom_objects=dict(PatchEncoder=PatchEncoder, Patches=Patches)) + model.make_predict_function() assert isinstance(model, KerasModel) model._name = model_category if resized: @@ -206,7 +207,10 @@ class EynollahModelZoo: model = wrap_layout_model_patched(model) model._name = model_category + '_patched' else: - model.jit_compile = True + # increases required VRAM, does not always work + # (depending on CUDA/libcudnn/TF version): + #model.jit_compile = True + pass if model_category == 'ocr': model = KerasModel( @@ -214,10 +218,9 @@ class EynollahModelZoo: model.get_layer(name="dense2").output, # type: ignore ) - model.make_predict_function() return model - def get(self, model_category: str) -> Predictor: + def get(self, model_category: str) -> Union[Predictor, AnyModel]: if model_category not in self._loaded: raise ValueError(f'Model "{model_category}" not previously loaded with "load_model(..)"') return self._loaded[model_category] From ffe5cdc5197b7e9c11e77b10969647ae8b1e2a75 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 May 2026 02:09:49 +0200 Subject: [PATCH 14/32] ModelZoo.shutdown: drop extra `del` (already done by `shutdown()`) --- src/eynollah/model_zoo/model_zoo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index 054552a..3de8b6b 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -313,4 +313,3 @@ class EynollahModelZoo: for needle in list(self._loaded.keys()): if isinstance(self._loaded[needle], Predictor): self._loaded[needle].shutdown() - del self._loaded[needle] From 9efce5e9f2b5afb3c7cf1c44f4d262e383c737fd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 May 2026 03:16:15 +0200 Subject: [PATCH 15/32] Predictor.shutdown: use `join()` instead of `terminate()` --- src/eynollah/predictor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/eynollah/predictor.py b/src/eynollah/predictor.py index e1159e7..3c6890e 100644 --- a/src/eynollah/predictor.py +++ b/src/eynollah/predictor.py @@ -194,17 +194,18 @@ class Predictor(mp.context.SpawnProcess): def shutdown(self): # do not terminate from forked processor instances - if mp.parent_process() is None: + if not hasattr(self, 'model'): self.stopped.set() + self.join() self.taskq.close() self.taskq.cancel_join_thread() self.resultq.close() self.resultq.cancel_join_thread() self.logq.close() - self.terminate() + #self.terminate() else: del self.model def __del__(self): - #self.logger.debug(f"deinit of {self} in {mp.current_process().name}") + #self.logger.debug(f"deinit of {self.name} in {mp.current_process().name}") self.shutdown() From 86adaf299ade201b178fe851c7b4f884a680fc0c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 19 May 2026 03:17:31 +0200 Subject: [PATCH 16/32] =?UTF-8?q?training.models.transformer=5Fblock:=20tf?= =?UTF-8?q?.reshape=20=E2=86=92=20Keras=20Reshape=20layer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/training/models.py | 9 ++++----- src/eynollah/training/reload-models-v0.8.mk | 7 ++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 3494249..f700d14 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -309,11 +309,10 @@ def transformer_block(img, # Skip connection 2. encoded_patches = Add()([x3, x2]) - encoded_patches = tf.reshape(encoded_patches, - [-1, - img.shape[1], - img.shape[2], - projection_dim // (patchsize_x * patchsize_y)]) + encoded_patches = Reshape(target_shape=(img.shape[1], + img.shape[2], + projection_dim // (patchsize_x * patchsize_y)), + name="reshape_patches")(encoded_patches) return encoded_patches def vit_resnet50_unet(num_patches, diff --git a/src/eynollah/training/reload-models-v0.8.mk b/src/eynollah/training/reload-models-v0.8.mk index b7a38dd..07be7cf 100644 --- a/src/eynollah/training/reload-models-v0.8.mk +++ b/src/eynollah/training/reload-models-v0.8.mk @@ -26,16 +26,17 @@ RELOADABLE_MODELS = \ all: $(RELOADABLE_MODELS) $(MODELS_DST)/%: $(MODELS_SRC)/% - mkdir -p $@ test -e $&1 | tee $(notdir $<).log - cp $ Date: Tue, 19 May 2026 03:20:24 +0200 Subject: [PATCH 17/32] =?UTF-8?q?reload=5Fweights:=20`save()`=20=E2=86=92?= =?UTF-8?q?=20`export()`=20w/=20`serve()`=20inference?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/model_zoo/model_zoo.py | 12 +++++------- src/eynollah/training/train.py | 9 ++++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index 3de8b6b..815663e 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -191,14 +191,12 @@ class EynollahModelZoo: try: # avoid wasting VRAM on non-transformer models model = load_model(model_path, compile=False) - except Exception as e: - self.logger.error(e) - model = load_model( - model_path, compile=False, - custom_objects=dict(PatchEncoder=PatchEncoder, - Patches=Patches)) + assert isinstance(model, KerasModel) model.make_predict_function() - assert isinstance(model, KerasModel) + except ValueError: + model = tf.saved_model.load(model_path) + model.predict_on_batch = model.serve + model.input_shape = model.signatures.get('serving_default').inputs[0].shape model._name = model_category if resized: model = wrap_layout_model_resized(model) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index de998fd..00ed6ee 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -562,7 +562,8 @@ def run(_config, if reload_weights: model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) - model.save(dir_save, include_optimizer=False) + #model.save(dir_save, include_optimizer=False) + model.export(dir_save) with open(os.path.join(dir_save, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) @@ -725,7 +726,8 @@ def run(_config, if reload_weights: model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) - model.save(dir_save, include_optimizer=False) + #model.save(dir_save, include_optimizer=False) + model.export(dir_save) with open(os.path.join(dir_save, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) @@ -843,7 +845,8 @@ def run(_config, if reload_weights: model.load_weights(dir_of_start_model).assert_existing_objects_matched().expect_partial() dir_save = os.path.join(dir_output, os.path.basename(os.path.normpath(dir_of_start_model))) - model.save(dir_save, include_optimizer=False) + #model.save(dir_save, include_optimizer=False) + model.export(dir_save) with open(os.path.join(dir_save, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON _log.info("reloaded model from %s to %s", dir_of_start_model, dir_save) From 3de1407d1811d1c3135a3f353ef3260947ab3a93 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 02:38:20 +0200 Subject: [PATCH 18/32] drop unnecessary TF / Torch imports --- src/eynollah/cli/__init__.py | 4 ---- src/eynollah/extract_images.py | 7 ------- src/eynollah/eynollah_imports.py | 13 ------------- src/eynollah/eynollah_ocr.py | 4 ---- src/eynollah/mb_ro_on_layout.py | 4 ---- src/eynollah/model_zoo/model_zoo.py | 4 ++++ src/eynollah/ocrd_cli.py | 6 ++---- 7 files changed, 6 insertions(+), 36 deletions(-) delete mode 100644 src/eynollah/eynollah_imports.py diff --git a/src/eynollah/cli/__init__.py b/src/eynollah/cli/__init__.py index 43ed046..1584fa5 100644 --- a/src/eynollah/cli/__init__.py +++ b/src/eynollah/cli/__init__.py @@ -1,7 +1,3 @@ -# NOTE: For predictable order of imports of torch/shapely/tensorflow -# this must be the first import of the CLI! -from ..eynollah_imports import imported_libs - from .cli import main from .cli_binarize import binarize_cli from .cli_enhance import enhance_cli diff --git a/src/eynollah/extract_images.py b/src/eynollah/extract_images.py index 7a7e3f6..40476a3 100644 --- a/src/eynollah/extract_images.py +++ b/src/eynollah/extract_images.py @@ -9,7 +9,6 @@ import os import time from typing import Optional from pathlib import Path -import tensorflow as tf import numpy as np import cv2 @@ -64,12 +63,6 @@ class EynollahImageExtractor(Eynollah): t_start = time.time() - try: - for device in tf.config.list_physical_devices('GPU'): - tf.config.experimental.set_memory_growth(device, True) - except: - self.logger.warning("no GPU device available") - self.logger.info("Loading models...") self.setup_models() self.logger.info(f"Model initialization complete ({time.time() - t_start:.1f}s)") diff --git a/src/eynollah/eynollah_imports.py b/src/eynollah/eynollah_imports.py deleted file mode 100644 index 496406c..0000000 --- a/src/eynollah/eynollah_imports.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -Load libraries with possible race conditions once. This must be imported as the first module of eynollah. -""" -import os -os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 - -from ocrd_utils import tf_disable_interactive_logs -from torch import * -tf_disable_interactive_logs() -import tensorflow.keras -from shapely import * -imported_libs = True -__all__ = ['imported_libs'] diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 4470671..77ad98f 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -14,10 +14,6 @@ from cv2.typing import MatLike from xml.etree import ElementTree as ET from PIL import Image, ImageDraw import numpy as np -try: - import torch -except ImportError: - torch = None from .eynollah import Eynollah diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index 5725ba1..6c0477b 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -17,10 +17,6 @@ import cv2 import numpy as np import statistics -os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 -import tensorflow as tf -from tensorflow.keras.models import Model - from .eynollah import Eynollah from .model_zoo import EynollahModelZoo from .utils.resize import resize_image diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index 815663e..ec35a80 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -269,6 +269,10 @@ class EynollahModelZoo: """ Load decoder for OCR """ + os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 + from ocrd_utils import tf_disable_interactive_logs + tf_disable_interactive_logs() + from tensorflow.keras.layers import StringLookup characters = self._load_characters() diff --git a/src/eynollah/ocrd_cli.py b/src/eynollah/ocrd_cli.py index acd8d4e..effecb2 100644 --- a/src/eynollah/ocrd_cli.py +++ b/src/eynollah/ocrd_cli.py @@ -1,10 +1,8 @@ -# NOTE: For predictable order of imports of torch/shapely/tensorflow -# this must be the first import of the CLI! -from .eynollah_imports import imported_libs -from .processor import EynollahProcessor from click import command from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from .processor import EynollahProcessor + @command() @ocrd_cli_options def main(*args, **kwargs): From 7f2bf715df02911325dea68228dca33dd9137fa7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 02:39:59 +0200 Subject: [PATCH 19/32] ModelZoo.load_model: fix loading exported vs saved models --- src/eynollah/model_zoo/model_zoo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index ec35a80..a1f9a24 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -191,9 +191,8 @@ class EynollahModelZoo: try: # avoid wasting VRAM on non-transformer models model = load_model(model_path, compile=False) - assert isinstance(model, KerasModel) model.make_predict_function() - except ValueError: + except (AttributeError, ValueError): model = tf.saved_model.load(model_path) model.predict_on_batch = model.serve model.input_shape = model.signatures.get('serving_default').inputs[0].shape From 94a5e9da149967b4f3a54c87da7108035d1dd236 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 02:41:19 +0200 Subject: [PATCH 20/32] ModelZoo.load_model: avoid attempting to load exported models as Keras models (which causes a warning), but switch to TF-Serving import right away --- src/eynollah/model_zoo/model_zoo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index a1f9a24..b97911a 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -189,7 +189,8 @@ class EynollahModelZoo: self.override_models((model_category, model_variant, model_path_override)) model_path = self.model_path(model_category, model_variant) try: - # avoid wasting VRAM on non-transformer models + if model_path.is_dir() and not (model_path / "keras_metadata.pb").exists(): + raise ValueError() model = load_model(model_path, compile=False) model.make_predict_function() except (AttributeError, ValueError): From bf7ec0233df245ff14b18472fdaa2cb2bda51a1e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 02:43:34 +0200 Subject: [PATCH 21/32] =?UTF-8?q?ModelZoo.load=5Fmodel:=20use=20`memory=5F?= =?UTF-8?q?limit`=20instead=20of=20`memory=5Fgrowth`=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - growth strategy is more flexible, but uses much more VRAM - limit strategy needs to be calibrated to models (currently fixed), and batch size, but needs much less VRAM and is faster --- src/eynollah/model_zoo/model_zoo.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index b97911a..c63a58d 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -169,7 +169,23 @@ class EynollahModelZoo: gpus = gpus[:1] # TF will always use first allowable tf.config.set_visible_devices(gpus, 'GPU') for device in gpus: - tf.config.experimental.set_memory_growth(device, True) + # tf.config.experimental.set_memory_growth(device, True) + # dynamic growth never frees memory (to avoid fragmentation), + # so the VRAM requirements end up much larger than feasible + # (for small GPUs); so try hard (calibrated) limits instead: + tf.config.set_logical_device_configuration( + device, + [tf.config.LogicalDeviceConfiguration(memory_limit={ + "binarization": 868, # due to bs 5 + "enhancement": 980, # due to bs 3 + "col_classifier": 210, + "page": 618, + "textline": 1680, # 954 for bs 1 + "region_1_2": 1580, + "region_fl_np": 1756, + "table": 1818, + "reading_order": 632, + }[model_category])]) vendor_name = ( tf.config.experimental.get_device_details(device) .get('device_name', 'unknown')) From f9f9130dbbb4c755d96e56f9855d9871db592806 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 03:21:36 +0200 Subject: [PATCH 22/32] do_order_of_regions: remove redundant+overcautious assertion --- src/eynollah/eynollah.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index c632941..9db47ce 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1148,7 +1148,6 @@ class Eynollah: boxes, textline_mask_tot ): - assert np.any(textline_mask_tot) self.logger.debug("enter do_order_of_regions") contours_only_text_parent = ensure_array(contours_only_text_parent) contours_only_text_parent_h = ensure_array(contours_only_text_parent_h) From d50bd7c650fe6413efe5d70bfdc235716d22e5d7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 14:20:51 +0200 Subject: [PATCH 23/32] trocr: avoid warnings by passing `clean_up_tokenization_spaces=False` --- src/eynollah/eynollah_ocr.py | 50 ++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 77ad98f..4371453 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -139,11 +139,14 @@ class Eynollah_ocr(Eynollah): cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')( + imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, skip_special_tokens=True) + generated_ids_merged, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) extracted_texts = extracted_texts + generated_text_merged @@ -162,11 +165,14 @@ class Eynollah_ocr(Eynollah): cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')( + imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, skip_special_tokens=True) + generated_ids_merged, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) extracted_texts = extracted_texts + generated_text_merged @@ -182,11 +188,14 @@ class Eynollah_ocr(Eynollah): cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values + pixel_values_merged = self.model_zoo.get('trocr_processor')( + imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, skip_special_tokens=True) + generated_ids_merged, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) extracted_texts = extracted_texts + generated_text_merged @@ -194,22 +203,23 @@ class Eynollah_ocr(Eynollah): cropped_lines.append(img_crop) cropped_lines_meging_indexing.append(0) indexer_b_s+=1 - + if indexer_b_s==self.b_s: imgs = cropped_lines[:] cropped_lines = [] indexer_b_s = 0 - - pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values + + pixel_values_merged = self.model_zoo.get('trocr_processor')( + imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( pixel_values_merged.to(self.device)) generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, skip_special_tokens=True) - + generated_ids_merged, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) + extracted_texts = extracted_texts + generated_text_merged - - - + indexer_text_region = indexer_text_region +1 if indexer_b_s!=0: @@ -217,9 +227,14 @@ class Eynollah_ocr(Eynollah): cropped_lines = [] indexer_b_s = 0 - pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_zoo.get('ocr').generate(pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode(generated_ids_merged, skip_special_tokens=True) + pixel_values_merged = self.model_zoo.get('trocr_processor')( + imgs, return_tensors="pt").pixel_values + generated_ids_merged = self.model_zoo.get('ocr').generate( + pixel_values_merged.to(self.device)) + generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( + generated_ids_merged, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) extracted_texts = extracted_texts + generated_text_merged @@ -750,6 +765,7 @@ class Eynollah_ocr(Eynollah): indexer_textregion = indexer_textregion + 1 ET.register_namespace("",page_ns) + self.logger.info("output filename: '%s'", out_file_ocr) page_tree.write(out_file_ocr, xml_declaration=True, method='xml', encoding="utf-8", default_namespace=None) def run( From 1d67e65f11ad5266ba27262d38b4c49a7a864714 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 15:48:21 +0200 Subject: [PATCH 24/32] =?UTF-8?q?trocr:=20simplify,=20batch=20over=20entir?= =?UTF-8?q?e=20page=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - batching over entire page instead of region-wise (underfilling batches) - avoid copied redundant code --- src/eynollah/eynollah_ocr.py | 201 +++++++------------------------- src/eynollah/utils/utils_ocr.py | 6 + 2 files changed, 51 insertions(+), 156 deletions(-) diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 4371453..747d2f5 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -14,6 +14,7 @@ from cv2.typing import MatLike from xml.etree import ElementTree as ET from PIL import Image, ImageDraw import numpy as np +from ocrd_utils import polygon_from_points, xywh_from_polygon from .eynollah import Eynollah @@ -31,6 +32,7 @@ from .utils.utils_ocr import ( preprocess_and_resize_image_for_ocrcnn_model, return_textlines_split_if_needed, rotate_image_with_padding, + batched, ) # TODO: refine typing @@ -90,143 +92,55 @@ class Eynollah_ocr(Eynollah): ) -> EynollahOcrResult: total_bb_coordinates = [] - - cropped_lines = [] cropped_lines_region_indexer = [] cropped_lines_meging_indexing = [] - extracted_texts = [] - indexer_text_region = 0 - indexer_b_s = 0 - - for nn in page_tree.getroot().iter(f'{{{page_ns}}}TextRegion'): - for child_textregion in nn: - if child_textregion.tag.endswith("TextLine"): - - for child_textlines in child_textregion: - if child_textlines.tag.endswith("Coords"): - cropped_lines_region_indexer.append(indexer_text_region) - p_h=child_textlines.attrib['points'].split(' ') - textline_coords = np.array( [ [int(x.split(',')[0]), - int(x.split(',')[1]) ] - for x in p_h] ) - x,y,w,h = cv2.boundingRect(textline_coords) - - total_bb_coordinates.append([x,y,w,h]) - - h2w_ratio = h/float(w) - - img_poly_on_img = np.copy(img) - mask_poly = np.zeros(img.shape) - mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1)) - - mask_poly = mask_poly[y:y+h, x:x+w, :] - img_crop = img_poly_on_img[y:y+h, x:x+w, :] - img_crop[mask_poly==0] = 255 - - self.logger.debug("processing %d lines for '%s'", - len(cropped_lines), nn.attrib['id']) - if h2w_ratio > 0.1: - cropped_lines.append(resize_image(img_crop, - tr_ocr_input_height_and_width, - tr_ocr_input_height_and_width) ) - cropped_lines_meging_indexing.append(0) - indexer_b_s+=1 - if indexer_b_s==self.b_s: - imgs = cropped_lines[:] - cropped_lines = [] - indexer_b_s = 0 - - pixel_values_merged = self.model_zoo.get('trocr_processor')( - imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_zoo.get('ocr').generate( - pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, - skip_special_tokens=True, - clean_up_tokenization_spaces=False) - - extracted_texts = extracted_texts + generated_text_merged - - else: - splited_images, _ = return_textlines_split_if_needed(img_crop, None) - #print(splited_images) - if splited_images: - cropped_lines.append(resize_image(splited_images[0], - tr_ocr_input_height_and_width, - tr_ocr_input_height_and_width)) - cropped_lines_meging_indexing.append(1) - indexer_b_s+=1 - - if indexer_b_s==self.b_s: - imgs = cropped_lines[:] - cropped_lines = [] - indexer_b_s = 0 - - pixel_values_merged = self.model_zoo.get('trocr_processor')( - imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_zoo.get('ocr').generate( - pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, - skip_special_tokens=True, - clean_up_tokenization_spaces=False) - - extracted_texts = extracted_texts + generated_text_merged - - - cropped_lines.append(resize_image(splited_images[1], - tr_ocr_input_height_and_width, - tr_ocr_input_height_and_width)) - cropped_lines_meging_indexing.append(-1) - indexer_b_s+=1 - - if indexer_b_s==self.b_s: - imgs = cropped_lines[:] - cropped_lines = [] - indexer_b_s = 0 - - pixel_values_merged = self.model_zoo.get('trocr_processor')( - imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_zoo.get('ocr').generate( - pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, - skip_special_tokens=True, - clean_up_tokenization_spaces=False) - - extracted_texts = extracted_texts + generated_text_merged - - else: - cropped_lines.append(img_crop) - cropped_lines_meging_indexing.append(0) - indexer_b_s+=1 + for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)): + for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)): + cropped_lines_region_indexer.append(n_region) - if indexer_b_s==self.b_s: - imgs = cropped_lines[:] - cropped_lines = [] - indexer_b_s = 0 + coords = line.find('{%s}Coords' % page_ns) + if coords is None: + self.logger.warning("region '%s' line '%s' has no Coords", region.attrib['id'], line.attrib['id']) + continue + poly = np.array(polygon_from_points(coords.attrib['points'])).astype(int) + cont = poly[:, np.newaxis] + xywh = xywh_from_polygon(poly) + x, y, w, h = xywh['x'], xywh['y'], xywh['w'], xywh['h'] - pixel_values_merged = self.model_zoo.get('trocr_processor')( - imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_zoo.get('ocr').generate( - pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, - skip_special_tokens=True, - clean_up_tokenization_spaces=False) + total_bb_coordinates.append([x, y, w, h]) - extracted_texts = extracted_texts + generated_text_merged + img_crop = img[y: y + h, x: x + w] + mask_poly = np.zeros(img_crop.shape[:2], dtype=np.uint8) + mask_poly = cv2.fillPoly(mask_poly, pts=[cont - [x, y]], color=1) + img_crop[mask_poly == 0] = 255 # FIXME: or median color? - indexer_text_region = indexer_text_region +1 + if h > 0.1 * w: + cropped_lines.append(resize_image(img_crop, + tr_ocr_input_height_and_width, + tr_ocr_input_height_and_width) ) + cropped_lines_meging_indexing.append(0) + else: + splited_images, _ = return_textlines_split_if_needed(img_crop, None) + if splited_images: + cropped_lines.append(resize_image(splited_images[0], + tr_ocr_input_height_and_width, + tr_ocr_input_height_and_width)) + cropped_lines_meging_indexing.append(1) + cropped_lines.append(resize_image(splited_images[1], + tr_ocr_input_height_and_width, + tr_ocr_input_height_and_width)) + cropped_lines_meging_indexing.append(-1) + else: + cropped_lines.append(img_crop) + cropped_lines_meging_indexing.append(0) - if indexer_b_s!=0: - imgs = cropped_lines[:] - cropped_lines = [] - indexer_b_s = 0 - + + self.logger.debug("processing %d lines for %d regions", + len(cropped_lines), len(set(cropped_lines_region_indexer))) + for imgs in batched(cropped_lines, self.b_s): pixel_values_merged = self.model_zoo.get('trocr_processor')( imgs, return_tensors="pt").pixel_values generated_ids_merged = self.model_zoo.get('ocr').generate( @@ -235,40 +149,15 @@ class Eynollah_ocr(Eynollah): generated_ids_merged, skip_special_tokens=True, clean_up_tokenization_spaces=False) - extracted_texts = extracted_texts + generated_text_merged - - ####extracted_texts = [] - ####n_iterations = math.ceil(len(cropped_lines) / self.b_s) - - ####for i in range(n_iterations): - ####if i==(n_iterations-1): - ####n_start = i*self.b_s - ####imgs = cropped_lines[n_start:] - ####else: - ####n_start = i*self.b_s - ####n_end = (i+1)*self.b_s - ####imgs = cropped_lines[n_start:n_end] - ####pixel_values_merged = self.model_zoo.get('trocr_processor')(imgs, return_tensors="pt").pixel_values - ####generated_ids_merged = self.model_ocr.generate( - #### pixel_values_merged.to(self.device)) - ####generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - #### generated_ids_merged, skip_special_tokens=True) - - ####extracted_texts = extracted_texts + generated_text_merged - del cropped_lines gc.collect() extracted_texts_merged = [extracted_texts[ind] - if cropped_lines_meging_indexing[ind]==0 - else extracted_texts[ind]+" "+extracted_texts[ind+1] - if cropped_lines_meging_indexing[ind]==1 - else None - for ind in range(len(cropped_lines_meging_indexing))] - - extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None] - #print(extracted_texts_merged, len(extracted_texts_merged)) + if cropped_lines_meging_indexing[ind] == 0 + else extracted_texts[ind] + " " + extracted_texts[ind + 1] + for ind in range(len(cropped_lines_meging_indexing)) + if cropped_lines_meging_indexing[ind] >= 0] return EynollahOcrResult( extracted_texts_merged=extracted_texts_merged, diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 93d1137..6914fee 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -1,5 +1,6 @@ import math import copy +from itertools import islice import numpy as np import cv2 @@ -502,3 +503,8 @@ def return_rnn_cnn_ocr_of_given_textlines(image, ocr_textline_in_textregion.append(text_textline) ocr_all_textlines.append(ocr_textline_in_textregion) return ocr_all_textlines + +def batched(iterable, n): + iterator = iter(iterable) + while batch := tuple(islice(iterator, n)): + yield batch From f3649adbf24eb6b4d189846d67eeed88f153ea06 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 17:23:11 +0200 Subject: [PATCH 25/32] trocr: apply `do_not_mask_with_textline_contour` here, too --- src/eynollah/eynollah_ocr.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index 747d2f5..f1b155b 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -113,9 +113,10 @@ class Eynollah_ocr(Eynollah): total_bb_coordinates.append([x, y, w, h]) img_crop = img[y: y + h, x: x + w] - mask_poly = np.zeros(img_crop.shape[:2], dtype=np.uint8) - mask_poly = cv2.fillPoly(mask_poly, pts=[cont - [x, y]], color=1) - img_crop[mask_poly == 0] = 255 # FIXME: or median color? + if not self.do_not_mask_with_textline_contour: + mask_poly = np.zeros(img_crop.shape[:2], dtype=np.uint8) + mask_poly = cv2.fillPoly(mask_poly, pts=[cont - [x, y]], color=1) + img_crop[mask_poly == 0] = 255 # FIXME: or median color? if h > 0.1 * w: cropped_lines.append(resize_image(img_crop, From 000e4ac8d8b66f874b0423c627c9bdccab880b57 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 17:25:39 +0200 Subject: [PATCH 26/32] trocr: extract confidence, too --- src/eynollah/eynollah_ocr.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index f1b155b..faeb042 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -90,12 +90,14 @@ class Eynollah_ocr(Eynollah): page_ns, tr_ocr_input_height_and_width, ) -> EynollahOcrResult: + import torch total_bb_coordinates = [] cropped_lines = [] cropped_lines_region_indexer = [] cropped_lines_meging_indexing = [] extracted_texts = [] + extracted_confs = [] for n_region, region in enumerate(page_tree.getroot().iter('{%s}TextRegion' % page_ns)): for n_line, line in enumerate(region.iter('{%s}TextLine' % page_ns)): @@ -142,15 +144,20 @@ class Eynollah_ocr(Eynollah): self.logger.debug("processing %d lines for %d regions", len(cropped_lines), len(set(cropped_lines_region_indexer))) for imgs in batched(cropped_lines, self.b_s): - pixel_values_merged = self.model_zoo.get('trocr_processor')( + pixel_values = self.model_zoo.get('trocr_processor')( imgs, return_tensors="pt").pixel_values - generated_ids_merged = self.model_zoo.get('ocr').generate( - pixel_values_merged.to(self.device)) - generated_text_merged = self.model_zoo.get('trocr_processor').batch_decode( - generated_ids_merged, + output = self.model_zoo.get('ocr').generate( + pixel_values.to(self.device), + output_scores=True, + return_dict_in_generate=True) + conf = torch.max(torch.softmax(torch.cat( + output.scores, dim=0), dim=1), dim=1).values.tolist() + text = self.model_zoo.get('trocr_processor').batch_decode( + output.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False) - extracted_texts = extracted_texts + generated_text_merged + extracted_confs.extend(conf) + extracted_texts.extend(text) del cropped_lines gc.collect() @@ -159,10 +166,15 @@ class Eynollah_ocr(Eynollah): else extracted_texts[ind] + " " + extracted_texts[ind + 1] for ind in range(len(cropped_lines_meging_indexing)) if cropped_lines_meging_indexing[ind] >= 0] + extracted_confs_merged = [extracted_confs[ind] + if cropped_lines_meging_indexing[ind] == 0 + else 0.5 * (extracted_confs[ind] + extracted_confs[ind + 1]) + for ind in range(len(cropped_lines_meging_indexing)) + if cropped_lines_meging_indexing[ind] >= 0] return EynollahOcrResult( extracted_texts_merged=extracted_texts_merged, - extracted_conf_value_merged=None, + extracted_conf_value_merged=extracted_confs_merged, cropped_lines_region_indexer=cropped_lines_region_indexer, total_bb_coordinates=total_bb_coordinates, ) @@ -618,6 +630,7 @@ class Eynollah_ocr(Eynollah): has_textline = False for child_textregion in nn: + # FIXME: should remove Word level, if it already exists if child_textregion.tag.endswith("TextLine"): is_textline_text = False From 074753a98e647b83c99358034a610e1f5364c79f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 17:25:53 +0200 Subject: [PATCH 27/32] ModelZoo: fix Torch device selection --- src/eynollah/model_zoo/model_zoo.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index c63a58d..be41d2a 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -247,9 +247,9 @@ class EynollahModelZoo: if variant == 'tr': from transformers import VisionEncoderDecoderModel import torch - ret = VisionEncoderDecoderModel.from_pretrained(model_dir) - assert isinstance(ret, VisionEncoderDecoderModel) - dev = torch.device('cpu') + model = VisionEncoderDecoderModel.from_pretrained(model_dir) + assert isinstance(model, VisionEncoderDecoderModel) + device0 = torch.device('cpu') if not device and torch.cuda.is_available(): device = 'GPU' # try if device and ':' in device: @@ -260,17 +260,17 @@ class EynollahModelZoo: break if device and device.startswith('GPU'): try: - dev = torch.device('cuda', int(device[3:] or 0)) - name = torch.cuda.get_device_name(dev) - self.logger.info("using GPU %s (%s) for model ocr:tr", dev, name) + device0 = torch.device('cuda', int(device[3:] or 0)) + name = torch.cuda.get_device_name(device0) + self.logger.info("using GPU %s (%s) for model ocr:tr", device0, name) except: self.logger.exception("cannot configure GPU device") - dev = torch.device('cpu') - if dev.type == 'cuda': - ret.to(dev) + device0 = torch.device('cpu') + if device0.type == 'cuda': + model.to(device0) else: self.logger.warning("no GPU device available") - return ret + return model return self.load_model('ocr', model_variant=variant, device=device) From ea41dcae1d401ac2b4b74403d4cc515d8da6c4ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 17:52:27 +0200 Subject: [PATCH 28/32] trocr: use beam search instead of greedy decoding --- src/eynollah/eynollah_ocr.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py index faeb042..b94853b 100644 --- a/src/eynollah/eynollah_ocr.py +++ b/src/eynollah/eynollah_ocr.py @@ -90,7 +90,6 @@ class Eynollah_ocr(Eynollah): page_ns, tr_ocr_input_height_and_width, ) -> EynollahOcrResult: - import torch total_bb_coordinates = [] cropped_lines = [] @@ -148,10 +147,16 @@ class Eynollah_ocr(Eynollah): imgs, return_tensors="pt").pixel_values output = self.model_zoo.get('ocr').generate( pixel_values.to(self.device), + # beam search instead of greedy decoding: + num_beams=4, + # also return probability output_scores=True, return_dict_in_generate=True) - conf = torch.max(torch.softmax(torch.cat( - output.scores, dim=0), dim=1), dim=1).values.tolist() + if output.sequences_scores is not None: + # log-prob averaged over length + conf = output.sequences_scores.exp().clamp(0.0, 1.0).tolist() + else: + conf = [1.0] * len(output.sequences) text = self.model_zoo.get('trocr_processor').batch_decode( output.sequences, skip_special_tokens=True, From f3a93983c0848bc02785a24656b9524f90dfd22a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 22:50:13 +0200 Subject: [PATCH 29/32] ModelZoo: add `ocr` key for `memory_limit` --- src/eynollah/model_zoo/model_zoo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index be41d2a..2bac7f3 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -185,6 +185,7 @@ class EynollahModelZoo: "region_fl_np": 1756, "table": 1818, "reading_order": 632, + "ocr": 850, }[model_category])]) vendor_name = ( tf.config.experimental.get_device_details(device) From 0836230c6b29384e7ecb6700d92573518dac64ef Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 21 May 2026 22:50:53 +0200 Subject: [PATCH 30/32] utils_ocr: avoid module-level import of TF --- src/eynollah/utils/utils_ocr.py | 8 +++++++- tests/cli_tests/test_ocr.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 6914fee..817406c 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -4,7 +4,9 @@ from itertools import islice import numpy as np import cv2 -import tensorflow as tf +# avoid module-level import: +# import tensorflow as tf +# (wait for tf-keras and logging setup in ModelZoo.load_model) from scipy.signal import find_peaks from scipy.ndimage import gaussian_filter1d from PIL import Image, ImageDraw, ImageFont @@ -13,6 +15,8 @@ from .resize import resize_image def decode_batch_predictions(pred, num_to_char, max_len = 128): + import tensorflow as tf + # input_len is the product of the batch size and the # number of time steps. input_len = np.ones(pred.shape[0]) * pred.shape[1] @@ -40,6 +44,8 @@ def decode_batch_predictions(pred, num_to_char, max_len = 128): def distortion_free_resize(image, img_size): + import tensorflow as tf + w, h = img_size image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) diff --git a/tests/cli_tests/test_ocr.py b/tests/cli_tests/test_ocr.py index 6bf3080..cf34e06 100644 --- a/tests/cli_tests/test_ocr.py +++ b/tests/cli_tests/test_ocr.py @@ -30,7 +30,7 @@ def test_run_eynollah_ocr_filename( '-o', str(outfile.parent), ] + options, [ - # FIXME: ocr has no logging! + 'output filename:' ] ) assert outfile.exists() @@ -57,7 +57,7 @@ def test_run_eynollah_ocr_directory( '-o', str(outdir), ], [ - # FIXME: ocr has no logging! + 'output filename:' ] ) assert len(list(outdir.iterdir())) == 2 From 26afc5ddab34c2d0c966a706f8b283b891280209 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 22 May 2026 12:35:44 +0200 Subject: [PATCH 31/32] ModelZoo: ensure exported TensorShape is converted to plain tuple --- src/eynollah/model_zoo/model_zoo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eynollah/model_zoo/model_zoo.py b/src/eynollah/model_zoo/model_zoo.py index 2bac7f3..d5e69a2 100644 --- a/src/eynollah/model_zoo/model_zoo.py +++ b/src/eynollah/model_zoo/model_zoo.py @@ -207,13 +207,14 @@ class EynollahModelZoo: model_path = self.model_path(model_category, model_variant) try: if model_path.is_dir() and not (model_path / "keras_metadata.pb").exists(): + # short-cut to avoid warning for exported models raise ValueError() model = load_model(model_path, compile=False) model.make_predict_function() except (AttributeError, ValueError): model = tf.saved_model.load(model_path) model.predict_on_batch = model.serve - model.input_shape = model.signatures.get('serving_default').inputs[0].shape + model.input_shape = tuple(model.signatures.get('serving_default').inputs[0].shape) model._name = model_category if resized: model = wrap_layout_model_resized(model) From 9801129aa6da83af1562fd14b47a37b67011de5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 22 May 2026 12:37:07 +0200 Subject: [PATCH 32/32] estimate_skew_contours: ensure retval is always float --- src/eynollah/utils/contour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 1dbead1..eda60e9 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -330,7 +330,7 @@ def estimate_skew_contours(contours): if not np.any(usable): raise ValueError("not enough contours with consistent length") if np.count_nonzero(usable) == 1: - return angle_in[usable] + return angle_in[usable][0] # 4. there is no way to distinguish between +90 and -89.9 here, # so map to [0,180] when calculating averages, then map back to [-90,90] # (we don't want -90 and +89 to average zero, or +1 and +179 to average 90)