models: split into layout, extra and ocr

layout: everything that is neither OCR nor extra
ocr: TrOCR and CNN-RNN models
extra: obsolete or niche models
This commit is contained in:
kba 2025-11-26 19:45:58 +01:00
parent 000af16a47
commit 095b36c389

View file

@ -4,7 +4,7 @@ from .specs import EynollahModelSpec, EynollahModelSpecSet
ZENODO = "https://zenodo.org/records/17295988/files" ZENODO = "https://zenodo.org/records/17295988/files"
MODELS_VERSION = "v0_7_0" MODELS_VERSION = "v0_7_0"
def dist_url(dist_name: str) -> str: def dist_url(dist_name: str="layout") -> str:
return f'{ZENODO}/models_{dist_name}_{MODELS_VERSION}.zip' return f'{ZENODO}/models_{dist_name}_{MODELS_VERSION}.zip'
DEFAULT_MODEL_SPECS = EynollahModelSpecSet([ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
@ -14,7 +14,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
variant='', variant='',
filename="models_eynollah/eynollah-enhancement_20210425", filename="models_eynollah/eynollah-enhancement_20210425",
dists=['enhancement', 'layout', 'ci'], dists=['enhancement', 'layout', 'ci'],
dist_url=dist_url("enhancement"), dist_url=dist_url(),
type='Keras', type='Keras',
), ),
@ -23,7 +23,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
variant='hybrid', variant='hybrid',
filename="models_eynollah/eynollah-binarization-hybrid_20230504/model_bin_hybrid_trans_cnn_sbb_ens", filename="models_eynollah/eynollah-binarization-hybrid_20230504/model_bin_hybrid_trans_cnn_sbb_ens",
dists=['layout', 'binarization', ], dists=['layout', 'binarization', ],
dist_url=dist_url("binarization"), dist_url=dist_url(),
type='Keras', type='Keras',
), ),
@ -32,7 +32,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
variant='20210309', variant='20210309',
filename="models_eynollah/eynollah-binarization_20210309", filename="models_eynollah/eynollah-binarization_20210309",
dists=['binarization'], dists=['binarization'],
dist_url=dist_url("binarization"), dist_url=dist_url("extra"),
type='Keras', type='Keras',
), ),
@ -41,7 +41,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
variant='', variant='',
filename="models_eynollah/eynollah-binarization_20210425", filename="models_eynollah/eynollah-binarization_20210425",
dists=['binarization'], dists=['binarization'],
dist_url=dist_url("binarization"), dist_url=dist_url("extra"),
type='Keras', type='Keras',
), ),
@ -49,7 +49,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="col_classifier", category="col_classifier",
variant='', variant='',
filename="models_eynollah/eynollah-column-classifier_20210425", filename="models_eynollah/eynollah-column-classifier_20210425",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -58,7 +58,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="page", category="page",
variant='', variant='',
filename="models_eynollah/model_eynollah_page_extraction_20250915", filename="models_eynollah/model_eynollah_page_extraction_20250915",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -67,7 +67,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="region", category="region",
variant='', variant='',
filename="models_eynollah/eynollah-main-regions-ensembled_20210425", filename="models_eynollah/eynollah-main-regions-ensembled_20210425",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -76,7 +76,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="region", category="region",
variant='extract_only_images', variant='extract_only_images',
filename="models_eynollah/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18", filename="models_eynollah/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -85,7 +85,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="region", category="region",
variant='light', variant='light',
filename="models_eynollah/eynollah-main-regions_20220314", filename="models_eynollah/eynollah-main-regions_20220314",
dist_url=dist_url("layout"), dist_url=dist_url(),
help="early layout", help="early layout",
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
@ -95,7 +95,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="region_p2", category="region_p2",
variant='', variant='',
filename="models_eynollah/eynollah-main-regions-aug-rotation_20210425", filename="models_eynollah/eynollah-main-regions-aug-rotation_20210425",
dist_url=dist_url("layout"), dist_url=dist_url(),
help="early layout, non-light, 2nd part", help="early layout, non-light, 2nd part",
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
@ -110,7 +110,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
#filename="models_eynollah/modelens_1_2_4_5_early_lay_1_2_spaltige", #filename="models_eynollah/modelens_1_2_4_5_early_lay_1_2_spaltige",
#filename="models_eynollah/model_3_eraly_layout_no_patches_1_2_spaltige", #filename="models_eynollah/model_3_eraly_layout_no_patches_1_2_spaltige",
filename="models_eynollah/modelens_e_l_all_sp_0_1_2_3_4_171024", filename="models_eynollah/modelens_e_l_all_sp_0_1_2_3_4_171024",
dist_url=dist_url("layout"), dist_url=dist_url("all"),
dists=['layout'], dists=['layout'],
help="early layout, light, 1-or-2-column", help="early layout, light, 1-or-2-column",
type='Keras', type='Keras',
@ -126,7 +126,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
#'filename="models_eynollah/modelens_full_lay_1_2_221024", #'filename="models_eynollah/modelens_full_lay_1_2_221024",
#'filename="models_eynollah/eynollah-full-regions-1column_20210425", #'filename="models_eynollah/eynollah-full-regions-1column_20210425",
filename="models_eynollah/modelens_full_lay_1__4_3_091124", filename="models_eynollah/modelens_full_lay_1__4_3_091124",
dist_url=dist_url("layout"), dist_url=dist_url(),
help="full layout / no patches", help="full layout / no patches",
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
@ -146,7 +146,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
# filename="models_eynollah/modelens_full_layout_24_till_28", # filename="models_eynollah/modelens_full_layout_24_till_28",
# filename="models_eynollah/model_2_full_layout_new_trans", # filename="models_eynollah/model_2_full_layout_new_trans",
filename="models_eynollah/modelens_full_lay_1__4_3_091124", filename="models_eynollah/modelens_full_lay_1__4_3_091124",
dist_url=dist_url("layout"), dist_url=dist_url(),
help="full layout / with patches", help="full layout / with patches",
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
@ -161,7 +161,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
#filename="models_eynollah/model_mb_ro_aug_ens_8", #filename="models_eynollah/model_mb_ro_aug_ens_8",
#filename="models_eynollah/model_ens_reading_order_machine_based", #filename="models_eynollah/model_ens_reading_order_machine_based",
filename="models_eynollah/model_eynollah_reading_order_20250824", filename="models_eynollah/model_eynollah_reading_order_20250824",
dist_url=dist_url("reading_order"), dist_url=dist_url(),
dists=['layout', 'reading_order'], dists=['layout', 'reading_order'],
type='Keras', type='Keras',
), ),
@ -176,7 +176,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
#filename="models_eynollah/modelens_textline_9_12_13_14_15", #filename="models_eynollah/modelens_textline_9_12_13_14_15",
#filename="models_eynollah/eynollah-textline_20210425", #filename="models_eynollah/eynollah-textline_20210425",
filename="models_eynollah/modelens_textline_0_1__2_4_16092024", filename="models_eynollah/modelens_textline_0_1__2_4_16092024",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -186,7 +186,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
variant='light', variant='light',
#filename="models_eynollah/eynollah-textline_light_20210425", #filename="models_eynollah/eynollah-textline_light_20210425",
filename="models_eynollah/modelens_textline_0_1__2_4_16092024", filename="models_eynollah/modelens_textline_0_1__2_4_16092024",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -195,7 +195,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="table", category="table",
variant='', variant='',
filename="models_eynollah/eynollah-tables_20210319", filename="models_eynollah/eynollah-tables_20210319",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -204,7 +204,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="table", category="table",
variant='light', variant='light',
filename="models_eynollah/modelens_table_0t4_201124", filename="models_eynollah/modelens_table_0t4_201124",
dist_url=dist_url("layout"), dist_url=dist_url(),
dists=['layout'], dists=['layout'],
type='Keras', type='Keras',
), ),
@ -250,7 +250,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="ocr", category="ocr",
variant='tr', variant='tr',
filename="models_eynollah/model_eynollah_ocr_trocr_20250919", filename="models_eynollah/model_eynollah_ocr_trocr_20250919",
dist_url=dist_url("trocr"), dist_url=dist_url("ocr"),
help='much slower transformer-based', help='much slower transformer-based',
dists=['trocr'], dists=['trocr'],
type='Keras', type='Keras',
@ -260,7 +260,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="trocr_processor", category="trocr_processor",
variant='', variant='',
filename="models_eynollah/model_eynollah_ocr_trocr_20250919", filename="models_eynollah/model_eynollah_ocr_trocr_20250919",
dist_url=dist_url("trocr"), dist_url=dist_url("ocr"),
dists=['trocr'], dists=['trocr'],
type='TrOCRProcessor', type='TrOCRProcessor',
), ),
@ -269,7 +269,7 @@ DEFAULT_MODEL_SPECS = EynollahModelSpecSet([
category="trocr_processor", category="trocr_processor",
variant='htr', variant='htr',
filename="models_eynollah/microsoft/trocr-base-handwritten", filename="models_eynollah/microsoft/trocr-base-handwritten",
dist_url=dist_url("trocr"), dist_url=dist_url("extra"),
dists=['trocr'], dists=['trocr'],
type='TrOCRProcessor', type='TrOCRProcessor',
), ),