Mirror of https://github.com/qurator-spk/eynollah.git (synced 2025-08-02 22:59:53 +02:00)

Commit 1a1170ab5d: Merge 2996fc8b30 into 6b8893b188
7 changed files with 84 additions and 40 deletions
Makefile (2 changes)

@@ -86,7 +86,7 @@ smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
 	eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_eynollah
 	test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml
 	# binarize:
-	eynollah binarization -m $(CURDIR)/default-2021-03-09 $< $(TMPDIR)/$(<F)
+	eynollah binarization -m $(CURDIR)/default-2021-03-09 -i $< -o $(TMPDIR)/$(<F)
 	test -s $(TMPDIR)/$(<F)
 	@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
 	$(RM) -r $(TMPDIR)
README.md (53 changes)

@@ -19,7 +19,8 @@
 * Output in [PAGE-XML](https://github.com/PRImA-Research-Lab/PAGE-XML)
 * [OCR-D](https://github.com/qurator-spk/eynollah#use-as-ocr-d-processor) interface
 
-:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome.
+:warning: Development is currently focused on achieving the best possible quality of results for a wide variety of
+historical documents and therefore processing can be very slow. We aim to improve this, but contributions are welcome.
 
 ## Installation
 Python `3.8-3.11` with Tensorflow `<2.13` on Linux are currently supported.
@@ -42,7 +43,7 @@ cd eynollah; pip install -e .
 Alternatively, you can run `make install` or `make install-dev` for editable installation.
 
 ## Models
-Pre-trained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah).
+Pretrained models can be downloaded from [qurator-data.de](https://qurator-data.de/eynollah/) or [huggingface](https://huggingface.co/SBB?search_models=eynollah).
 
 For documentation on methods and models, have a look at [`models.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/models.md).
 
@@ -50,10 +51,20 @@ For documentation on methods and models, have a look at [`models.md`](https://gi
 In case you want to train your own model with Eynollah, have a look at [`train.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/train.md).
 
 ## Usage
-The command-line interface can be called like this:
+Eynollah supports four use cases: layout analysis (segmentation), binarization, text recognition (OCR),
+and (trainable) reading order detection.
+
+### Layout Analysis
+The layout analysis module is responsible for detecting layouts, identifying text lines, and determining reading order
+using both heuristic methods or a machine-based reading order detection model.
+
+Note that there are currently two supported ways for reading order detection: either as part of layout analysis based
+on image input, or, currently under development, for given layout analysis results based on PAGE-XML data as input.
+
+The command-line interface for layout analysis can be called like this:
 
 ```sh
-eynollah \
+eynollah layout \
 -i <single image file> | -di <directory containing image files> \
 -o <output directory> \
 -m <directory containing model files> \
@@ -66,6 +77,7 @@ The following options can be used to further configure the processing:
 |-------------------|:-------------------------------------------------------------------------------|
 | `-fl` | full layout analysis including all steps and segmentation classes |
 | `-light` | lighter and faster but simpler method for main region detection and deskewing |
+| `-tll` | this indicates the light textline and should be passed with light version |
 | `-tab` | apply table detection |
 | `-ae` | apply enhancement (the resulting image is saved to the output directory) |
 | `-as` | apply scaling |
@@ -80,11 +92,38 @@ The following options can be used to further configure the processing:
 | `-sp <directory>` | save cropped page image to this directory |
 | `-sa <directory>` | save all (plot, enhanced/binary image, layout) to this directory |
 
-If no option is set, the tool performs layout detection of main regions (background, text, images, separators and marginals).
+If no option is set, the tool performs layout detection of main regions (background, text, images, separators
+and marginals).
 The best output quality is produced when RGB images are used as input rather than greyscale or binarized images.
 
-#### Use as OCR-D processor
+### Binarization
+The binarization module performs document image binarization using pretrained pixelwise segmentation models.
+
+The command-line interface for binarization of single image can be called like this:
+
+```sh
+eynollah binarization \
+-m <directory containing model files> \
+<single image file> \
+<output image>
+```
+
+and for flowing from a directory like this:
+
+```sh
+eynollah binarization \
+-m <path to directory containing model files> \
+-di <directory containing image files> \
+-do <output directory>
+```
+
+### OCR
+Under development
+
+### Machine-based-reading-order
+Under development
+
+#### Use as OCR-D processor
 Eynollah ships with a CLI interface to be used as [OCR-D](https://ocr-d.de) [processor](https://ocr-d.de/en/spec/cli),
 formally described in [`ocrd-tool.json`](https://github.com/qurator-spk/eynollah/tree/main/src/eynollah/ocrd-tool.json).
 
@@ -92,7 +131,6 @@ In this case, the source image file group with (preferably) RGB images should be
 
 ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models 2022-04-05
 
-
 If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows:
 - existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results)
 - existing annotation (and respective `AlternativeImage`s) are partially _ignored_:
@@ -103,7 +141,6 @@ If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynol
 (because some other preprocessing step was in effect like `denoised`), then
 the output PAGE-XML will be based on that as new top-level (`@imageFilename`)
 
-
 ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models 2022-04-05
 
 Still, in general, it makes more sense to add other workflow steps **after** Eynollah.
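For orientation, the `eynollah layout` command documented in the README hunk above can also be driven from Python. A minimal sketch, assuming the `eynollah` CLI is on `PATH`; all file and directory names here are placeholders, not part of the commit:

```python
# Sketch only: calling the layout CLI from Python; all paths are placeholders.
import subprocess

subprocess.run(
    [
        "eynollah", "layout",
        "-i", "page.tif",            # single input image (or use -di for a directory)
        "-o", "layout_out/",         # output directory for the PAGE-XML results
        "-m", "models_eynollah",     # directory containing the model files
    ],
    check=True,  # raise CalledProcessError on a non-zero exit code
)
```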
@@ -48,8 +48,7 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i
 @main.command()
 @click.option('--patches/--no-patches', default=True, help='by enabling this parameter you let the model to see the image in patches.')
 @click.option('--model_dir', '-m', type=click.Path(exists=True, file_okay=False), required=True, help='directory containing models for prediction')
-@click.argument('input_image', required=False)
-@click.argument('output_image', required=False)
+@click.option("--input-image", "-i", help="input image", type=click.Path(exists=True, dir_okay=False))
 @click.option(
     "--dir_in",
     "-di",
@@ -57,16 +56,14 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i
     type=click.Path(exists=True, file_okay=False),
 )
 @click.option(
-    "--dir_out",
-    "-do",
-    help="directory for output images",
-    type=click.Path(exists=True, file_okay=False),
+    "--output",
+    "-o",
+    help="output image (if using -i) or output image directory (if using -di)",
+    type=click.Path(file_okay=True, dir_okay=True),
 )
-def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out):
-    assert (dir_out is None) == (dir_in is None), "Options -di and -do are mutually dependent"
-    assert (input_image is None) == (output_image is None), "INPUT_IMAGE and OUTPUT_IMAGE are mutually dependent"
-    assert (dir_in is None) != (input_image is None), "Specify either -di and -do options, or INPUT_IMAGE and OUTPUT_IMAGE"
-    SbbBinarizer(model_dir).run(image_path=input_image, use_patches=patches, save=output_image, dir_in=dir_in, dir_out=dir_out)
+def binarization(patches, model_dir, input_image, dir_in, output):
+    assert (dir_in is None) != (input_image is None), "Specify either -di and or -i not both"
+    SbbBinarizer(model_dir).run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in)
 
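The hunks above (presumably the CLI module, e.g. `src/eynollah/cli.py`; the file name is not captured in this excerpt) replace the positional `INPUT_IMAGE`/`OUTPUT_IMAGE` arguments with `-i`/`-o` options. A minimal in-process invocation sketch using click's test runner, assuming the command object is importable as shown and the referenced paths exist:

```python
# Sketch only: exercising the refactored binarization command in-process.
# The import path and all file/directory names are assumptions.
from click.testing import CliRunner

from eynollah.cli import binarization  # assumed module path

runner = CliRunner()
result = runner.invoke(binarization, [
    "-m", "models_binarization",   # directory containing the binarization models
    "-i", "page.tif",              # new: single input image as an option, not an argument
    "-o", "page.bin.png",          # new: output image path via -o
])
print(result.exit_code, result.output)
```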
@@ -4343,12 +4343,12 @@ class Eynollah:
             polygons_lines_xml = []
             contours_tables = []
             ocr_all_textlines = None
-            conf_contours_textregions =None
+            conf_contours_textregions = [0]
             pcgts = self.writer.build_pagexml_no_full_layout(
                 cont_page, page_coord, order_text_new, id_of_texts_tot,
                 all_found_textline_polygons, page_coord, polygons_of_images, polygons_of_marginals,
                 all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals,
-                cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions)
+                cont_page, polygons_lines_xml, contours_tables, ocr_all_textlines, conf_contours_textregions, self.skip_layout_and_reading_order)
             return pcgts
 
         #print("text region early -1 in %.1fs", time.time() - t0)
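The `[0]` placeholder matters because the writer indexes the confidence list per detected region (see the `conf=conf_contours_textregion[mm]` line in the writer hunk further below); with `None`, that lookup would fail as soon as a region exists, presumably the single whole-page region in this skip-layout path. A minimal illustration, not project code:

```python
# Minimal illustration (not project code): why a placeholder list is safer than None.
conf_contours_textregions = None
try:
    conf_contours_textregions[0]          # what the writer effectively does per region
except TypeError as err:
    print(err)                            # 'NoneType' object is not subscriptable

conf_contours_textregions = [0]           # placeholder confidence, as in this commit
print(conf_contours_textregions[0])       # 0
```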
@@ -314,8 +314,8 @@ class SbbBinarizer:
         prediction_true = prediction_true.astype(np.uint8)
         return prediction_true[:,:,0]
 
-    def run(self, image=None, image_path=None, save=None, use_patches=False, dir_in=None, dir_out=None):
-        print(dir_in,'dir_in')
+    def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None):
+        # print(dir_in,'dir_in')
         if not dir_in:
             if (image is not None and image_path is not None) or \
                 (image is None and image_path is None):
@@ -343,8 +343,8 @@ class SbbBinarizer:
             kernel = np.ones((5, 5), np.uint8)
             img_last[:, :][img_last[:, :] > 0] = 255
             img_last = (img_last[:, :] == 0) * 255
-            if save:
-                cv2.imwrite(save, img_last)
+            if output:
+                cv2.imwrite(output, img_last)
             return img_last
         else:
             ls_imgs = os.listdir(dir_in)
@@ -374,4 +374,4 @@ class SbbBinarizer:
                 img_last[:, :][img_last[:, :] > 0] = 255
                 img_last = (img_last[:, :] == 0) * 255
 
-                cv2.imwrite(os.path.join(dir_out,image_stem+'.png'), img_last)
+                cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last)
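With `save` and `dir_out` collapsed into a single `output` parameter, `run()` can also be called directly from Python. A minimal sketch, assuming the class is importable under the module path shown and that the model and image paths exist; all paths are placeholders:

```python
# Sketch only: programmatic use of the renamed run() signature.
# Import path and all local paths are assumptions.
from eynollah.sbb_binarize import SbbBinarizer  # assumed module path

binarizer = SbbBinarizer("models_binarization")

# Single image: `output` is the output image path (formerly `save`).
binarizer.run(image_path="page.tif", output="page.bin.png", use_patches=True)

# Directory mode: `output` doubles as the output directory (formerly `dir_out`).
binarizer.run(dir_in="pages/", output="pages_bin/", use_patches=True)
```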
@@ -168,7 +168,7 @@ class EynollahXmlWriter():
         with open(self.output_filename, 'w') as f:
             f.write(to_xml(pcgts))
 
-    def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion):
+    def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines, conf_contours_textregion, skip_layout_reading_order=False):
         self.logger.debug('enter build_pagexml_no_full_layout')
 
         # create the file structure
@@ -184,7 +184,7 @@ class EynollahXmlWriter():
 
         for mm in range(len(found_polygons_text_region)):
             textregion = TextRegionType(id=counter.next_region_id, type_='paragraph',
-                Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord), conf=conf_contours_textregion[mm]),
+                Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord, skip_layout_reading_order), conf=conf_contours_textregion[mm]),
                 )
             #textregion.set_conf(conf_contours_textregion[mm])
             page.add_TextRegion(textregion)
@@ -303,10 +303,20 @@ class EynollahXmlWriter():
 
         return pcgts
 
-    def calculate_polygon_coords(self, contour, page_coord):
+    def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False):
         self.logger.debug('enter calculate_polygon_coords')
         coords = ''
         for value_bbox in contour:
+            if skip_layout_reading_order:
+                if len(value_bbox) == 2:
+                    coords += str(int((value_bbox[0]) / self.scale_x))
+                    coords += ','
+                    coords += str(int((value_bbox[1]) / self.scale_y))
+                else:
+                    coords += str(int((value_bbox[0][0]) / self.scale_x))
+                    coords += ','
+                    coords += str(int((value_bbox[0][1]) / self.scale_y))
+            else:
                 if len(value_bbox) == 2:
                     coords += str(int((value_bbox[0] + page_coord[2]) / self.scale_x))
                     coords += ','
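Pulled out of the class, the new branch reduces to the following standalone sketch. The helper name is hypothetical; the `page_coord[2]` x offset is taken from the hunk above, while the index used for the y offset in the non-skip branch is an assumption, since that line falls outside the excerpt:

```python
# Standalone sketch of the coordinate handling added above (helper name hypothetical).
def polygon_points(contour, page_coord, scale_x, scale_y, skip_layout_reading_order=False):
    points = []
    for value_bbox in contour:
        # Points arrive either as flat (x, y) pairs or OpenCV-style [[x, y]] entries.
        if len(value_bbox) == 2:
            x, y = value_bbox[0], value_bbox[1]
        else:
            x, y = value_bbox[0][0], value_bbox[0][1]
        if skip_layout_reading_order:
            # Whole-page mode: coordinates are already in page space, no crop offset added.
            points.append((int(x / scale_x), int(y / scale_y)))
        else:
            # Layout mode: shift from the cropped page back into full-page space first.
            # page_coord[2] is the x offset (as in the hunk); the y offset index is assumed.
            points.append((int((x + page_coord[2]) / scale_x),
                           int((y + page_coord[0]) / scale_y)))
    return " ".join(f"{px},{py}" for px, py in points)

# Example: one square contour, unit scale, assumed crop offsets (y0=10, x0=20).
square = [[0, 0], [100, 0], [100, 100], [0, 100]]
print(polygon_points(square, page_coord=(10, 0, 20, 0), scale_x=1.0, scale_y=1.0))
print(polygon_points(square, page_coord=(10, 0, 20, 0), scale_x=1.0, scale_y=1.0,
                     skip_layout_reading_order=True))
```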
@@ -85,8 +85,8 @@ def test_run_eynollah_binarization_filename(tmp_path, subtests, pytestconfig, ca
     outfile = tmp_path.joinpath('kant_aufklaerung_1784_0020.png')
     args = [
         '-m', SBBBIN_MODELS,
-        str(infile),
-        str(outfile),
+        '-i', str(infile),
+        '-o', str(outfile),
     ]
     caplog.set_level(logging.INFO)
     def only_eynollah(logrec):
@@ -117,7 +117,7 @@ def test_run_eynollah_binarization_directory(tmp_path, subtests, pytestconfig, c
     args = [
         '-m', SBBBIN_MODELS,
         '-di', str(indir),
-        '-do', str(outdir),
+        '-o', str(outdir),
     ]
     caplog.set_level(logging.INFO)
     def only_eynollah(logrec):
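To run only the two updated binarization tests, a selection by keyword should work; the exact test module layout is not shown in this excerpt, so treat the invocation as a sketch:

```python
# Sketch only: selecting the binarization tests via pytest's Python API.
import pytest

raise SystemExit(pytest.main(["-vv", "-k", "binarization"]))
```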