2020-11-30 15:40:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								# Eynollah
  
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								>  Document Layout Analysis (segmentation) using pre-trained models and heuristics
  
						 
					
						
							
								
									
										
										
										
											2023-04-05 10:40:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-03-31 03:18:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								[PyPI package](https://pypi.org/project/eynollah/)
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:11:51 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								[CircleCI build status](https://circleci.com/gh/qurator-spk/eynollah)
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								[GitHub Actions test workflow](https://github.com/qurator-spk/eynollah/actions/workflows/test-eynollah.yml)
							 
						 
					
						
							
								
									
										
										
										
											2023-03-31 03:19:44 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								[License: Apache 2.0](https://opensource.org/license/apache-2-0/)
							 
						 
					
						
							
								
									
										
										
										
											2020-11-20 12:49:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-12-16 15:52:37 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								## Features
  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								*  Support for up to 10 segmentation classes:  
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:33:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  *  background, [page border ](https://ocr-d.de/en/gt-guidelines/trans/lyRand.html ), [text region ](https://ocr-d.de/en/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_TextRegionType.html ), [text line ](https://ocr-d.de/en/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_TextLineType.html ), [header ](https://ocr-d.de/en/gt-guidelines/trans/lyUeberschrift.html ), [image ](https://ocr-d.de/en/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_ImageRegionType.html ), [separator ](https://ocr-d.de/en/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_SeparatorRegionType.html ), [marginalia ](https://ocr-d.de/en/gt-guidelines/trans/lyMarginalie.html ), [initial ](https://ocr-d.de/en/gt-guidelines/trans/lyInitiale.html ), [table ](https://ocr-d.de/en/gt-guidelines/trans/lyTabellen.html )
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								*  Support for various image optimization operations: 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  *  cropping (border detection), binarization, deskewing, dewarping, scaling, enhancing, resizing
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								*  Text line segmentation to bounding boxes or polygons (contours) including for curved lines and vertical text 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								*  Detection of reading order 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								*  Output in [PAGE-XML ](https://github.com/PRImA-Research-Lab/PAGE-XML ) 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:25:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								*  [OCR-D ](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor ) interface 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								## Installation
  
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								Python versions `3.7-3.10`  with Tensorflow `>=2.4`  are currently supported.
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								For (limited) GPU support the [matching ](https://www.tensorflow.org/install/source#gpu ) CUDA toolkit `>=10.1`  needs to be installed.
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								You can either install via 
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								pip install eynollah
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								or clone the repository, enter it and install (editable) with
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								git clone git@github.com:qurator-spk/eynollah.git
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								cd eynollah; pip install -e .
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
									
										
										
										
											2022-09-13 16:40:44 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								Alternatively, you can run `make install`  or `make install-dev`  for editable installation.
							 
						 
					
						
							
								
									
										
										
										
											2022-09-13 16:40:44 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 12:47:06 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								## Models
  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								Pre-trained models can be downloaded from [qurator-data.de ](https://qurator-data.de/eynollah/ ).
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								In case you want to train your own model to use with Eynollah, have a look at [sbb_pixelwise_segmentation ](https://github.com/qurator-spk/sbb_pixelwise_segmentation ). 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								## Usage
  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								The command-line interface can be called like this:
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								```sh
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 02:48:42 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								eynollah \
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  -i <image file> \
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  -o <output directory> \
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  -m <path to directory containing model files> \
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     [OPTIONS]
							 
						 
					
						
							
								
									
										
										
										
											2022-09-13 17:19:19 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2022-09-13 21:48:21 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								The following options can be used to further configure the processing:
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| option   |      description      |
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								|----------|:-------------|
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 02:39:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-fl`   | full layout analysis including all steps and segmentation classes |
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								| `-light`  | lighter and faster but simpler method for main region detection and deskewing |
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-tab`  | apply table detection |
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 02:39:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-ae`   | apply enhancement (the resulting image is saved to the output directory) |
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-as`   | apply scaling |
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 02:39:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-cl`   | apply contour detection for curved text lines instead of bounding boxes |
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-ib`   | apply binarization (the resulting image is saved to the output directory)  |
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								| `-ep`   | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) |
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								| `-ho`   | ignore headers for reading order detection |
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								| `-di <directory>`   | process all images in a directory in batch mode |
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 02:39:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-si <directory>`   | save image regions detected to this directory |
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:13:07 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-sd <directory>`   | save deskewed image to this directory |
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								| `-sl <directory>`   | save layout prediction as plot to this directory |
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 03:21:24 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-sp <directory>`   | save cropped page image to this directory |
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 02:39:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								| `-sa <directory>`   | save all (plot, enhanced/binary image, layout) to this directory |
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 13:24:13 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								If no option is set, the tool will perform layout detection of main regions (background, text, images, separators and marginals).
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 12:47:06 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								The tool produces better quality output when RGB images are used as input rather than greyscale or binarized images.
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 02:39:18 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								#### Use as OCR-D processor
  
						 
					
						
							
								
									
										
										
										
											2020-11-20 12:49:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 13:24:13 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								Eynollah ships with a command-line interface that can be used as an [OCR-D](https://ocr-d.de) processor.
							 
						 
					
						
							
								
									
										
										
										
											2020-11-20 12:49:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 13:24:13 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								In this case, the source image file group with (preferably) RGB images should be used as input like this:
							 
						 
					
						
							
								
									
										
										
										
											2020-11-20 12:49:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 13:24:13 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								ocrd-eynollah-segment -I OCR-D-IMG -O SEG-LINE -P models
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 12:47:06 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								Any image referenced by `@imageFilename`  in PAGE-XML is passed on directly to Eynollah as a processor, so that e.g.
							 
						 
					
						
							
								
									
										
										
										
											2020-11-20 12:49:27 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2023-04-14 13:24:13 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								ocrd-eynollah-segment -I OCR-D-IMG-BIN -O SEG-LINE -P models
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								```
							 
						 
					
						
							
								
									
										
										
										
											2022-04-04 21:13:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    
							 
						 
					
						
							
								
									
										
										
										
											2023-05-13 12:47:06 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								uses the original (RGB) image despite any binarization that may have occurred in previous OCR-D processing steps.