Mirror of https://github.com/qurator-spk/dinglehopper.git (synced 2025-10-31 09:24:15 +01:00)

Compare commits

86 commits
| SHA1 |
|---|
| 1efb382a54 |
| 6d23bbb45c |
| 3443edd6d3 |
| b1ef3af1a8 |
| d09e3969f8 |
| b5e99d96c9 |
| 774790c36f |
| addb572922 |
| 1ebb004386 |
| c3aa48ec3b |
| 628594ef98 |
| d7814db705 |
| 5639f3db7f |
| 9fc8937324 |
| 14a4bc56d8 |
| a70260c10e |
| 224aa02163 |
| 9db5b4caf5 |
| 5578ce83a3 |
| cf59b951a3 |
| 480b3cf864 |
| f1a586cff1 |
| 3b16c14c16 |
| 322faeb26c |
| c37316da09 |
| 9414a92f9f |
| 68344e48f8 |
| 73ee16fe51 |
| 6980d7a252 |
| 2bf2529c38 |
| ad8e6de36b |
| 4024e350f7 |
| 3c317cbeaf |
| d8403421fc |
| 3305043234 |
| 6bf5bd7178 |
| 817e0c95f7 |
| 3d7c7ee1e3 |
| a24623b966 |
| ea33602336 |
| 64444dd419 |
| f6dfb77f94 |
| ef817cb343 |
| b1c109baae |
| 13ab1ae150 |
| d974369e13 |
| b7bdca4ac8 |
| 831a24fc4c |
| f6a2c94520 |
| 4162836612 |
| c0aa82d188 |
| 8c1b6d65f5 |
| f287386c0e |
| 63031b30bf |
| bf6633be02 |
| d3aa9eb520 |
| 625686f204 |
| ce7886af23 |
| a09a624bde |
| badfa9c99e |
| 7f8a8dd564 |
| b72d4f5af9 |
| 058042accb |
| 071e6a8bd1 |
| 6b82293670 |
| 6ecf49a355 |
| 9c7c104dce |
| 2e6fe0c279 |
| 1753ed4d13 |
| 3233dbcc8f |
| f2e290dffe |
| 6d1daf1dfe |
| 27ad145c7e |
| 2e9e88cc1e |
| 129e6eb427 |
| cf998443c1 |
| 6048107889 |
| 2ee37ed4e3 |
| 521f034fba |
| d1a2247615 |
| 4047f8b6e5 |
| cd68a973cb |
| bc5818da9f |
| c91234daba |
| a534b5e28e |
| b336f98271 |
39 changed files with 582 additions and 123 deletions
							
								
								
									
.dockerignore (new file, 5 lines)
							|  | @ -0,0 +1,5 @@ | ||||||
|  | src/dinglehopper/tests | ||||||
|  | dist | ||||||
|  | build | ||||||
|  | *.egg-info | ||||||
|  | .git | ||||||
							
								
								
									
.github/workflows/release.yml (vendored, 8 lines changed)
							|  | @ -17,7 +17,7 @@ jobs: | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
|     steps: |     steps: | ||||||
|       - name: Checkout |       - name: Checkout | ||||||
|         uses: actions/checkout@v3 |         uses: actions/checkout@v4 | ||||||
|       - name: Upgrade pip |       - name: Upgrade pip | ||||||
|         run: python3 -m pip install --upgrade pip |         run: python3 -m pip install --upgrade pip | ||||||
|       - name: Install setuptools |       - name: Install setuptools | ||||||
|  | @ -32,7 +32,7 @@ jobs: | ||||||
|       - name: Build package |       - name: Build package | ||||||
|         run: python3 -m pip install --upgrade build && python3 -m build |         run: python3 -m pip install --upgrade build && python3 -m build | ||||||
|       - name: Upload dist |       - name: Upload dist | ||||||
|         uses: actions/upload-artifact@v3 |         uses: actions/upload-artifact@v4 | ||||||
|         with: |         with: | ||||||
|           name: dist |           name: dist | ||||||
|           path: dist/ |           path: dist/ | ||||||
|  | @ -42,7 +42,7 @@ jobs: | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
|     steps: |     steps: | ||||||
|       - name: Download dist |       - name: Download dist | ||||||
|         uses: actions/download-artifact@v3 |         uses: actions/download-artifact@v4 | ||||||
|         with: |         with: | ||||||
|           name: dist |           name: dist | ||||||
|           path: dist/ |           path: dist/ | ||||||
|  | @ -61,7 +61,7 @@ jobs: | ||||||
|       id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing |       id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing | ||||||
|     steps: |     steps: | ||||||
|       - name: Download dist |       - name: Download dist | ||||||
|         uses: actions/download-artifact@v3 |         uses: actions/download-artifact@v4 | ||||||
|         with: |         with: | ||||||
|           name: dist |           name: dist | ||||||
|           path: dist/ |           path: dist/ | ||||||
|  |  | ||||||
							
								
								
									
.github/workflows/test.yml (vendored, 11 lines changed)
							|  | @ -1,4 +1,4 @@ | ||||||
| name: Test | name: 'Test' | ||||||
| 
 | 
 | ||||||
| on: | on: | ||||||
| 
 | 
 | ||||||
|  | @ -25,18 +25,19 @@ jobs: | ||||||
|     strategy: |     strategy: | ||||||
|       fail-fast: false |       fail-fast: false | ||||||
|       matrix: |       matrix: | ||||||
|         python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] |         python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13" ] | ||||||
| 
 | 
 | ||||||
|     runs-on: "ubuntu-latest" |     runs-on: "ubuntu-latest" | ||||||
| 
 | 
 | ||||||
|     steps: |     steps: | ||||||
|       - name: Set up Python |       - name: Set up Python | ||||||
|         uses: actions/setup-python@v4 |         uses: actions/setup-python@v5 | ||||||
|         with: |         with: | ||||||
|           python-version: ${{ matrix.python-version }} |           python-version: ${{ matrix.python-version }} | ||||||
|  |           allow-prereleases: true | ||||||
| 
 | 
 | ||||||
|       - name: Checkout |       - name: Checkout | ||||||
|         uses: actions/checkout@v3 |         uses: actions/checkout@v4 | ||||||
| 
 | 
 | ||||||
|       - name: Install possible lxml build requirements (if building from source) |       - name: Install possible lxml build requirements (if building from source) | ||||||
|         run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev |         run: sudo apt-get install -y libxml2-dev libxslt-dev python3-dev | ||||||
|  | @ -56,7 +57,7 @@ jobs: | ||||||
|             cd src |             cd src | ||||||
|             python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy |             python3 -m pytest --junitxml=../${{matrix.python-version}}-junit.xml -o junit_family=legacy | ||||||
|       - name: Upload test results |       - name: Upload test results | ||||||
|         uses: actions/upload-artifact@v3 |         uses: actions/upload-artifact@v4 | ||||||
|         if: success() || failure() |         if: success() || failure() | ||||||
|         with: |         with: | ||||||
|           name: test-results-${{matrix.python-version}} |           name: test-results-${{matrix.python-version}} | ||||||
|  |  | ||||||
							
								
								
									
.github/workflows/test_report.yml (vendored, 6 lines changed)
							|  | @ -1,4 +1,4 @@ | ||||||
| name: 'Test Report' | name: 'Test - Report results' | ||||||
| on: | on: | ||||||
|   workflow_run: |   workflow_run: | ||||||
|     workflows: ['test'] |     workflows: ['test'] | ||||||
|  | @ -12,9 +12,9 @@ jobs: | ||||||
|   report: |   report: | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
|     steps: |     steps: | ||||||
|       - uses: dorny/test-reporter@v1.7.0 |       - uses: dorny/test-reporter@v1 | ||||||
|         with: |         with: | ||||||
|           artifact: /test-results-(.*)/ |           artifact: /test-results-(.*)/ | ||||||
|           name: 'Tests Results - $1' |           name: 'test - Results ($1)' | ||||||
|           path: '*junit.xml' |           path: '*junit.xml' | ||||||
|           reporter: java-junit |           reporter: java-junit | ||||||
|  |  | ||||||
							
								
								
									
.gitignore (vendored, 1 line changed)
							|  | @ -25,6 +25,7 @@ dmypy.json | ||||||
| 
 | 
 | ||||||
| # User-specific stuff | # User-specific stuff | ||||||
| .idea | .idea | ||||||
|  | .*.swp | ||||||
| 
 | 
 | ||||||
| # Build artifacts | # Build artifacts | ||||||
| /build | /build | ||||||
|  |  | ||||||
.pre-commit-config.yaml

|  | @ -1,6 +1,6 @@ | ||||||
| repos: | repos: | ||||||
| -   repo: https://github.com/pre-commit/pre-commit-hooks | -   repo: https://github.com/pre-commit/pre-commit-hooks | ||||||
|     rev: v4.6.0 |     rev: v6.0.0 | ||||||
|     hooks: |     hooks: | ||||||
|     -   id: trailing-whitespace |     -   id: trailing-whitespace | ||||||
|     -   id: end-of-file-fixer |     -   id: end-of-file-fixer | ||||||
|  | @ -11,12 +11,12 @@ repos: | ||||||
|     -   id: check-ast |     -   id: check-ast | ||||||
| 
 | 
 | ||||||
| -   repo: https://github.com/psf/black | -   repo: https://github.com/psf/black | ||||||
|     rev: 24.4.2 |     rev: 25.1.0 | ||||||
|     hooks: |     hooks: | ||||||
|     -   id: black |     -   id: black | ||||||
| 
 | 
 | ||||||
| -   repo: https://github.com/astral-sh/ruff-pre-commit | -   repo: https://github.com/astral-sh/ruff-pre-commit | ||||||
|     rev: v0.4.3 |     rev: v0.12.10 | ||||||
|     hooks: |     hooks: | ||||||
|     -   args: |     -   args: | ||||||
|         -   --fix |         -   --fix | ||||||
|  | @ -24,7 +24,7 @@ repos: | ||||||
|         id: ruff |         id: ruff | ||||||
| 
 | 
 | ||||||
| -   repo: https://github.com/pre-commit/mirrors-mypy | -   repo: https://github.com/pre-commit/mirrors-mypy | ||||||
|     rev: v1.10.0 |     rev: v1.17.1 | ||||||
|     hooks: |     hooks: | ||||||
|     -   additional_dependencies: |     -   additional_dependencies: | ||||||
|         -   types-setuptools |         -   types-setuptools | ||||||
|  | @ -36,6 +36,12 @@ repos: | ||||||
|         id: mypy |         id: mypy | ||||||
| 
 | 
 | ||||||
| -   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update | -   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update | ||||||
|     rev: v0.3.1post2 |     rev: v0.8.0 | ||||||
|     hooks: |     hooks: | ||||||
|     -   id: pre-commit-update |     -   id: pre-commit-update | ||||||
|  | 
 | ||||||
|  | -   repo: https://github.com/dhatim/python-license-check | ||||||
|  |     rev: 0.9.3 | ||||||
|  |     hooks: | ||||||
|  |     -   id: liccheck | ||||||
|  |         language: system | ||||||
|  |  | ||||||
							
								
								
									
Dockerfile (new file, 40 lines)
							|  | @ -0,0 +1,40 @@ | ||||||
|  | ARG DOCKER_BASE_IMAGE | ||||||
|  | FROM $DOCKER_BASE_IMAGE | ||||||
|  | ARG VCS_REF | ||||||
|  | ARG BUILD_DATE | ||||||
|  | LABEL \ | ||||||
|  |     maintainer="https://github.com/qurator-spk/dinglehopper/issues" \ | ||||||
|  |     org.label-schema.vcs-ref=$VCS_REF \ | ||||||
|  |     org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \ | ||||||
|  |     org.label-schema.build-date=$BUILD_DATE \ | ||||||
|  |     org.opencontainers.image.vendor="Staatsbibliothek zu Berlin — SPK" \ | ||||||
|  |     org.opencontainers.image.title="dinglehopper" \ | ||||||
|  |     org.opencontainers.image.description="An OCR evaluation tool" \ | ||||||
|  |     org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \ | ||||||
|  |     org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \ | ||||||
|  |     org.opencontainers.image.revision=$VCS_REF \ | ||||||
|  |     org.opencontainers.image.created=$BUILD_DATE \ | ||||||
|  |     org.opencontainers.image.base.name=ocrd/core | ||||||
|  | 
 | ||||||
|  | ENV LANG=C.UTF-8 | ||||||
|  | ENV LC_ALL=C.UTF-8 | ||||||
|  | 
 | ||||||
|  | # avoid HOME/.local/share (hard to predict USER here) | ||||||
|  | # so let XDG_DATA_HOME coincide with fixed system location | ||||||
|  | # (can still be overridden by derived stages) | ||||||
|  | ENV XDG_DATA_HOME /usr/local/share | ||||||
|  | # avoid the need for an extra volume for persistent resource user db | ||||||
|  | # (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) | ||||||
|  | ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources | ||||||
|  | 
 | ||||||
|  | WORKDIR /build/dinglehopper | ||||||
|  | COPY . . | ||||||
|  | COPY ocrd-tool.json . | ||||||
|  | # prepackage ocrd-tool.json as ocrd-all-tool.json | ||||||
|  | RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json | ||||||
|  | # prepackage ocrd-all-module-dir.json | ||||||
|  | RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json | ||||||
|  | RUN make install && rm -rf /build/dinglehopper | ||||||
|  | 
 | ||||||
|  | WORKDIR /data | ||||||
|  | VOLUME /data | ||||||
							
								
								
									
LICENSE (2 lines changed)
							|  | @ -186,7 +186,7 @@ | ||||||
|       same "printed page" as the copyright notice for easier |       same "printed page" as the copyright notice for easier | ||||||
|       identification within third-party archives. |       identification within third-party archives. | ||||||
| 
 | 
 | ||||||
|    Copyright 2019 qurator |    Copyright 2019-2025 Staatsbibliothek zu Berlin — SPK | ||||||
| 
 | 
 | ||||||
|    Licensed under the Apache License, Version 2.0 (the "License"); |    Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|    you may not use this file except in compliance with the License. |    you may not use this file except in compliance with the License. | ||||||
|  |  | ||||||
							
								
								
									
Makefile (new file, 34 lines)
							|  | @ -0,0 +1,34 @@ | ||||||
|  | PYTHON = python3 | ||||||
|  | PIP = pip3 | ||||||
|  | PYTHONIOENCODING=utf8 | ||||||
|  | PYTEST_ARGS = -vv | ||||||
|  | 
 | ||||||
|  | DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest | ||||||
|  | DOCKER_TAG ?= ocrd/dinglehopper | ||||||
|  | DOCKER ?= docker | ||||||
|  | 
 | ||||||
|  | help: | ||||||
|  | 	@echo | ||||||
|  | 	@echo "  Targets" | ||||||
|  | 	@echo | ||||||
|  | 	@echo "    install Install full Python package via pip" | ||||||
|  | 	@echo "    docker  Build the ocrd/dinglehopper docker image" | ||||||
|  | 
 | ||||||
|  | # Install Python package via pip
 | ||||||
|  | install: | ||||||
|  | 	$(PIP) install . | ||||||
|  | 
 | ||||||
|  | install-dev: | ||||||
|  | 	$(PIP) install -e . | ||||||
|  | 
 | ||||||
|  | test: | ||||||
|  | 	pytest $(PYTEST_ARGS) | ||||||
|  | 
 | ||||||
|  | docker: | ||||||
|  | 	$(DOCKER) build \
 | ||||||
|  | 	--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
 | ||||||
|  | 	--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
 | ||||||
|  | 	--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
 | ||||||
|  | 	-t $(DOCKER_TAG) . | ||||||
|  | 
 | ||||||
|  | .PHONY: help install install-dev test docker | ||||||
README.md

|  | @ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt. | ||||||
| with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate | with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate | ||||||
| CLI interface: | CLI interface: | ||||||
| 
 | 
 | ||||||
| ~~~ | ``` | ||||||
| dinglehopper-line-dirs gt/ ocr/ | dinglehopper-line-dirs gt/ ocr/ | ||||||
| ~~~ | ``` | ||||||
|  | 
 | ||||||
|  | The CLI `dinglehopper-line-dirs` can also work with GT text files in the same | ||||||
|  | directories as the OCR text files. You should read `dinglehopper-line-dirs --help` | ||||||
|  | in this case. | ||||||
| 
 | 
 | ||||||
| ### dinglehopper-extract | ### dinglehopper-extract | ||||||
| The tool `dinglehopper-extract` extracts the text of the given input file on | The tool `dinglehopper-extract` extracts the text of the given input file on | ||||||
|  |  | ||||||
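The same-directory mode mentioned in the README addition relies on the `--gt-suffix` and `--ocr-suffix` options introduced further down in this changeset (cli_line_dirs.py). A minimal sketch of such an invocation via click's test runner; the directory name `lines/` and the suffixes are placeholder assumptions, and the positional GT/OCR argument order is taken from the README example above:

```python
from click.testing import CliRunner

from dinglehopper.cli_line_dirs import main

# Roughly equivalent to:
#   dinglehopper-line-dirs --gt-suffix .gt.txt --ocr-suffix .some-ocr.txt \
#       --plain-encoding utf-8 lines/ lines/
# where lines/ (a placeholder) contains both *.gt.txt and *.some-ocr.txt files.
result = CliRunner().invoke(
    main,
    [
        "--gt-suffix", ".gt.txt",
        "--ocr-suffix", ".some-ocr.txt",
        "--plain-encoding", "utf-8",
        "lines/", "lines/",
    ],
)
print(result.exit_code, result.output)
```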
pyproject.toml

|  | @ -7,8 +7,9 @@ authors = [ | ||||||
|     {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"}, |     {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"}, | ||||||
|     {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"}, |     {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"}, | ||||||
| ] | ] | ||||||
| description = "The OCR evaluation tool" | description = "An OCR evaluation tool" | ||||||
| readme = "README.md" | readme = "README.md" | ||||||
|  | license.file = "LICENSE" | ||||||
| requires-python = ">=3.8" | requires-python = ">=3.8" | ||||||
| keywords = ["qurator", "ocr", "evaluation", "ocr-d"] | keywords = ["qurator", "ocr", "evaluation", "ocr-d"] | ||||||
| 
 | 
 | ||||||
|  | @ -48,7 +49,7 @@ optional-dependencies.dev = {file = ["requirements-dev.txt"]} | ||||||
| where = ["src"] | where = ["src"] | ||||||
| 
 | 
 | ||||||
| [tool.setuptools.package-data] | [tool.setuptools.package-data] | ||||||
| dinglehopper = ["templates/*"] | dinglehopper = ["templates/*", "*.json"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| [tool.pytest.ini_options] | [tool.pytest.ini_options] | ||||||
|  | @ -74,5 +75,40 @@ disallow_untyped_defs = false | ||||||
| disallow_untyped_calls = false | disallow_untyped_calls = false | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| [tool.ruff] | [tool.ruff.lint] | ||||||
| select = ["E", "F", "I"] | select = ["E", "F", "I", "B"] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | [tool.liccheck] | ||||||
|  | authorized_licenses = [ | ||||||
|  |     "bsd", | ||||||
|  |     "new bsd", | ||||||
|  |     "bsd license", | ||||||
|  |     "new bsd license", | ||||||
|  |     "simplified bsd", | ||||||
|  |     "apache", | ||||||
|  |     "apache 2.0", | ||||||
|  |     "apache software license", | ||||||
|  |     "apache software", | ||||||
|  |     "apache license 2.0", | ||||||
|  |     "gnu lgpl", | ||||||
|  |     "lgpl with exceptions or zpl", | ||||||
|  |     "GNU Library or Lesser General Public License (LGPL)", | ||||||
|  |     "GNU Lesser General Public License v3 (LGPLv3)", | ||||||
|  |     "GNU Lesser General Public License v2 or later (LGPLv2+)", | ||||||
|  |     "mit", | ||||||
|  |     "mit license", | ||||||
|  |     "mit-cmu", | ||||||
|  |     "python software foundation", | ||||||
|  |     "psf", | ||||||
|  |     "psf-2.0", | ||||||
|  |     "Historical Permission Notice and Disclaimer (HPND)", | ||||||
|  |     "public domain", | ||||||
|  |     'The Unlicense (Unlicense)', | ||||||
|  |     "isc", | ||||||
|  |     "ISC License (ISCL)", | ||||||
|  |     'Mozilla Public License 2.0 (MPL 2.0)', | ||||||
|  | ] | ||||||
|  | unauthorized_licenses = [ | ||||||
|  |     "gpl v3", | ||||||
|  | ] | ||||||
|  |  | ||||||
requirements-dev.txt

|  | @ -10,3 +10,5 @@ mypy | ||||||
| types-lxml | types-lxml | ||||||
| types-setuptools | types-setuptools | ||||||
| pytest-mypy | pytest-mypy | ||||||
|  | 
 | ||||||
|  | liccheck | ||||||
|  |  | ||||||
requirements.txt

|  | @ -5,9 +5,10 @@ uniseg >= 0.8.0 | ||||||
| numpy | numpy | ||||||
| colorama | colorama | ||||||
| MarkupSafe | MarkupSafe | ||||||
| ocrd >= 2.65.0 | ocrd >= 3.3.0 | ||||||
| attrs | attrs | ||||||
| multimethod >= 1.3 | multimethod >= 1.3 | ||||||
| tqdm | tqdm | ||||||
| rapidfuzz >= 2.7.0 | rapidfuzz >= 2.7.0 | ||||||
| chardet | chardet | ||||||
|  | importlib_resources | ||||||
|  |  | ||||||
src/dinglehopper/cli.py

|  | @ -114,6 +114,7 @@ def process( | ||||||
|     metrics: bool = True, |     metrics: bool = True, | ||||||
|     differences: bool = False, |     differences: bool = False, | ||||||
|     textequiv_level: str = "region", |     textequiv_level: str = "region", | ||||||
|  |     plain_encoding: str = "autodetect", | ||||||
| ) -> None: | ) -> None: | ||||||
|     """Check OCR result against GT. |     """Check OCR result against GT. | ||||||
| 
 | 
 | ||||||
|  | @ -121,8 +122,12 @@ def process( | ||||||
|     this undecorated version and use Click on a wrapper. |     this undecorated version and use Click on a wrapper. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     gt_text = extract(gt, textequiv_level=textequiv_level) |     gt_text = extract( | ||||||
|     ocr_text = extract(ocr, textequiv_level=textequiv_level) |         gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding | ||||||
|  |     ) | ||||||
|  |     ocr_text = extract( | ||||||
|  |         ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding | ||||||
|  |     ) | ||||||
|     gt_words: List[str] = list(words_normalized(gt_text)) |     gt_words: List[str] = list(words_normalized(gt_text)) | ||||||
|     ocr_words: List[str] = list(words_normalized(ocr_text)) |     ocr_words: List[str] = list(words_normalized(ocr_text)) | ||||||
| 
 | 
 | ||||||
|  | @ -195,6 +200,7 @@ def process_dir( | ||||||
|     metrics: bool = True, |     metrics: bool = True, | ||||||
|     differences: bool = False, |     differences: bool = False, | ||||||
|     textequiv_level: str = "region", |     textequiv_level: str = "region", | ||||||
|  |     plain_encoding: str = "autodetect", | ||||||
| ) -> None: | ) -> None: | ||||||
|     for gt_file in os.listdir(gt): |     for gt_file in os.listdir(gt): | ||||||
|         gt_file_path = os.path.join(gt, gt_file) |         gt_file_path = os.path.join(gt, gt_file) | ||||||
|  | @ -209,6 +215,7 @@ def process_dir( | ||||||
|                 metrics=metrics, |                 metrics=metrics, | ||||||
|                 differences=differences, |                 differences=differences, | ||||||
|                 textequiv_level=textequiv_level, |                 textequiv_level=textequiv_level, | ||||||
|  |                 plain_encoding=plain_encoding, | ||||||
|             ) |             ) | ||||||
|         else: |         else: | ||||||
|             print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) |             print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path)) | ||||||
|  | @ -233,7 +240,13 @@ def process_dir( | ||||||
|     help="PAGE TextEquiv level to extract text from", |     help="PAGE TextEquiv level to extract text from", | ||||||
|     metavar="LEVEL", |     metavar="LEVEL", | ||||||
| ) | ) | ||||||
|  | @click.option( | ||||||
|  |     "--plain-encoding", | ||||||
|  |     default="autodetect", | ||||||
|  |     help='Encoding (e.g. "utf-8") of plain text files', | ||||||
|  | ) | ||||||
| @click.option("--progress", default=False, is_flag=True, help="Show progress bar") | @click.option("--progress", default=False, is_flag=True, help="Show progress bar") | ||||||
|  | @click.version_option() | ||||||
| def main( | def main( | ||||||
|     gt, |     gt, | ||||||
|     ocr, |     ocr, | ||||||
|  | @ -242,6 +255,7 @@ def main( | ||||||
|     metrics, |     metrics, | ||||||
|     differences, |     differences, | ||||||
|     textequiv_level, |     textequiv_level, | ||||||
|  |     plain_encoding, | ||||||
|     progress, |     progress, | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|  | @ -279,6 +293,7 @@ def main( | ||||||
|                 metrics=metrics, |                 metrics=metrics, | ||||||
|                 differences=differences, |                 differences=differences, | ||||||
|                 textequiv_level=textequiv_level, |                 textequiv_level=textequiv_level, | ||||||
|  |                 plain_encoding=plain_encoding, | ||||||
|             ) |             ) | ||||||
|     else: |     else: | ||||||
|         process( |         process( | ||||||
|  | @ -289,6 +304,7 @@ def main( | ||||||
|             metrics=metrics, |             metrics=metrics, | ||||||
|             differences=differences, |             differences=differences, | ||||||
|             textequiv_level=textequiv_level, |             textequiv_level=textequiv_level, | ||||||
|  |             plain_encoding=plain_encoding, | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
src/dinglehopper/cli_extract.py

|  | @ -12,7 +12,12 @@ from .ocr_files import extract | ||||||
|     help="PAGE TextEquiv level to extract text from", |     help="PAGE TextEquiv level to extract text from", | ||||||
|     metavar="LEVEL", |     metavar="LEVEL", | ||||||
| ) | ) | ||||||
| def main(input_file, textequiv_level): | @click.option( | ||||||
|  |     "--plain-encoding", | ||||||
|  |     default="autodetect", | ||||||
|  |     help='Encoding (e.g. "utf-8") of plain text files', | ||||||
|  | ) | ||||||
|  | def main(input_file, textequiv_level, plain_encoding): | ||||||
|     """ |     """ | ||||||
|     Extract the text of the given INPUT_FILE. |     Extract the text of the given INPUT_FILE. | ||||||
| 
 | 
 | ||||||
|  | @ -23,7 +28,9 @@ def main(input_file, textequiv_level): | ||||||
|     use "--textequiv-level line" to extract from the level of TextLine tags. |     use "--textequiv-level line" to extract from the level of TextLine tags. | ||||||
|     """ |     """ | ||||||
|     initLogging() |     initLogging() | ||||||
|     input_text = extract(input_file, textequiv_level=textequiv_level).text |     input_text = extract( | ||||||
|  |         input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding | ||||||
|  |     ).text | ||||||
|     print(input_text) |     print(input_text) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
src/dinglehopper/cli_line_dirs.py

|  | @ -1,5 +1,6 @@ | ||||||
| import itertools | import itertools | ||||||
| import os | import os | ||||||
|  | from typing import Callable, Iterator, List, Optional, Tuple | ||||||
| 
 | 
 | ||||||
| import click | import click | ||||||
| from jinja2 import Environment, FileSystemLoader | from jinja2 import Environment, FileSystemLoader | ||||||
|  | @ -12,6 +13,41 @@ from .ocr_files import plain_extract | ||||||
| from .word_error_rate import word_error_rate_n, words_normalized | from .word_error_rate import word_error_rate_n, words_normalized | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def removesuffix(text, suffix): | ||||||
|  |     """ | ||||||
|  |     Remove suffix from text. | ||||||
|  | 
 | ||||||
|  |     Can be replaced with str.removesuffix when we only support Python >= 3.9. | ||||||
|  |     """ | ||||||
|  |     if suffix and text.endswith(suffix): | ||||||
|  |         return text[: -len(suffix)] | ||||||
|  |     return text | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def is_hidden(filepath): | ||||||
|  |     filename = os.path.basename(os.path.abspath(filepath)) | ||||||
|  |     return filename.startswith(".") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def find_all_files( | ||||||
|  |     dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False | ||||||
|  | ) -> Iterator[str]: | ||||||
|  |     """ | ||||||
|  |     Find all files in dir_, returning filenames | ||||||
|  | 
 | ||||||
|  |     If pred is given, pred(filename) must be True for the filename. | ||||||
|  | 
 | ||||||
|  |     Does not return hidden files by default. | ||||||
|  |     """ | ||||||
|  |     for root, _, filenames in os.walk(dir_): | ||||||
|  |         for fn in filenames: | ||||||
|  |             if not return_hidden and is_hidden(fn): | ||||||
|  |                 continue | ||||||
|  |             if pred and not pred(fn): | ||||||
|  |                 continue | ||||||
|  |             yield os.path.join(root, fn) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def all_equal(iterable): | def all_equal(iterable): | ||||||
|     g = itertools.groupby(iterable) |     g = itertools.groupby(iterable) | ||||||
|     return next(g, True) and not next(g, False) |     return next(g, True) and not next(g, False) | ||||||
|  | @ -25,15 +61,63 @@ def common_suffix(its): | ||||||
|     return reversed(common_prefix(reversed(it) for it in its)) |     return reversed(common_prefix(reversed(it) for it in its)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def removesuffix(text, suffix): | def find_gt_and_ocr_files( | ||||||
|     if suffix and text.endswith(suffix): |     gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str | ||||||
|         return text[: -len(suffix)] | ) -> Iterator[Tuple[str, str]]: | ||||||
|     return text |     """ | ||||||
|  |     Find GT files and matching OCR files. | ||||||
|  | 
 | ||||||
|  |     Returns pairs of GT and OCR files. | ||||||
|  |     """ | ||||||
|  |     for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)): | ||||||
|  |         ocr_fn = os.path.join( | ||||||
|  |             ocr_dir, | ||||||
|  |             removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix, | ||||||
|  |         ) | ||||||
|  |         if not os.path.exists(ocr_fn): | ||||||
|  |             raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist") | ||||||
|  | 
 | ||||||
|  |         yield gt_fn, ocr_fn | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): | def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir): | ||||||
|     gt_suffix = "".join(common_suffix(os.listdir(gt_dir))) |     """ | ||||||
|     ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir))) |     Find GT files and matching OCR files, autodetect suffixes. | ||||||
|  | 
 | ||||||
|  |     This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR) | ||||||
|  |     files with a common suffix. Currently the files must have a suffix, e.g. | ||||||
|  |     ".gt.txt" (respectively ".ocr.txt"). | ||||||
|  | 
 | ||||||
|  |     Returns pairs of GT and OCR files. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     # Autodetect suffixes | ||||||
|  |     gt_files = find_all_files(gt_dir) | ||||||
|  |     gt_suffix = "".join(common_suffix(gt_files)) | ||||||
|  |     if len(gt_suffix) == 0: | ||||||
|  |         raise RuntimeError( | ||||||
|  |             f"Files in GT directory {gt_dir} do not have a common suffix" | ||||||
|  |         ) | ||||||
|  |     ocr_files = find_all_files(ocr_dir) | ||||||
|  |     ocr_suffix = "".join(common_suffix(ocr_files)) | ||||||
|  |     if len(ocr_suffix) == 0: | ||||||
|  |         raise RuntimeError( | ||||||
|  |             f"Files in OCR directory {ocr_dir} do not have a common suffix" | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def process( | ||||||
|  |     gt_dir, | ||||||
|  |     ocr_dir, | ||||||
|  |     report_prefix, | ||||||
|  |     *, | ||||||
|  |     metrics=True, | ||||||
|  |     gt_suffix=None, | ||||||
|  |     ocr_suffix=None, | ||||||
|  |     plain_encoding="autodetect", | ||||||
|  | ): | ||||||
| 
 | 
 | ||||||
|     cer = None |     cer = None | ||||||
|     n_characters = None |     n_characters = None | ||||||
|  | @ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): | ||||||
|     n_words = None |     n_words = None | ||||||
|     word_diff_report = "" |     word_diff_report = "" | ||||||
| 
 | 
 | ||||||
|     for k, gt in enumerate(os.listdir(gt_dir)): |     if gt_suffix is not None and ocr_suffix is not None: | ||||||
|         # Find a match by replacing the suffix |         gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix) | ||||||
|         ocr = removesuffix(gt, gt_suffix) + ocr_suffix |     else: | ||||||
|  |         gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir) | ||||||
| 
 | 
 | ||||||
|         gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True) |     for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files): | ||||||
|         ocr_text = plain_extract( |         gt_text = plain_extract( | ||||||
|             os.path.join(ocr_dir, ocr), include_filename_in_id=True |             gt_fn, include_filename_in_id=True, encoding=plain_encoding | ||||||
|         ) |         ) | ||||||
|         gt_words = words_normalized(gt_text) |         ocr_text = plain_extract( | ||||||
|         ocr_words = words_normalized(ocr_text) |             ocr_fn, include_filename_in_id=True, encoding=plain_encoding | ||||||
|  |         ) | ||||||
|  |         gt_words: List[str] = list(words_normalized(gt_text)) | ||||||
|  |         ocr_words: List[str] = list(words_normalized(ocr_text)) | ||||||
| 
 | 
 | ||||||
|         # Compute CER |         # Compute CER | ||||||
|         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) |         l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text) | ||||||
|  | @ -81,7 +169,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): | ||||||
|             joiner="", |             joiner="", | ||||||
|             none="·", |             none="·", | ||||||
|             score_hint=score_hint(l_cer, l_n_characters), |             score_hint=score_hint(l_cer, l_n_characters), | ||||||
|         ) |         )[0] | ||||||
|         word_diff_report += gen_diff_report( |         word_diff_report += gen_diff_report( | ||||||
|             gt_words, |             gt_words, | ||||||
|             ocr_words, |             ocr_words, | ||||||
|  | @ -89,7 +177,7 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): | ||||||
|             joiner=" ", |             joiner=" ", | ||||||
|             none="⋯", |             none="⋯", | ||||||
|             score_hint=score_hint(l_wer, l_n_words), |             score_hint=score_hint(l_wer, l_n_words), | ||||||
|         ) |         )[0] | ||||||
| 
 | 
 | ||||||
|     env = Environment( |     env = Environment( | ||||||
|         loader=FileSystemLoader( |         loader=FileSystemLoader( | ||||||
|  | @ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True): | ||||||
| @click.option( | @click.option( | ||||||
|     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" |     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" | ||||||
| ) | ) | ||||||
| def main(gt, ocr, report_prefix, metrics): | @click.option("--gt-suffix", help="Suffix of GT line text files") | ||||||
|  | @click.option("--ocr-suffix", help="Suffix of OCR line text files") | ||||||
|  | @click.option( | ||||||
|  |     "--plain-encoding", | ||||||
|  |     default="autodetect", | ||||||
|  |     help='Encoding (e.g. "utf-8") of plain text files', | ||||||
|  | ) | ||||||
|  | def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding): | ||||||
|     """ |     """ | ||||||
|     Compare the GT line text directory against the OCR line text directory. |     Compare the GT line text directory against the OCR line text directory. | ||||||
| 
 | 
 | ||||||
|     This assumes that the GT line text directory contains textfiles with a common |     This assumes that the GT line text directory contains textfiles with a common | ||||||
|     suffix like ".gt.txt", and the OCR line text directory contains textfiles with |     suffix like ".gt.txt", and the OCR line text directory contains textfiles with | ||||||
|     a common suffix like ".some-ocr.txt". The text files also need to be paired, |     a common suffix like ".some-ocr.txt". The text files also need to be paired, | ||||||
|     i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt" |     i.e. the GT filename "line001.gt.txt" needs to match a filename | ||||||
|     in the OCT lines directory. |     "line001.some-ocr.txt" in the OCR lines directory. | ||||||
| 
 | 
 | ||||||
|     The GT and OCR directories are usually round truth line texts and the results of |     GT and OCR directories may contain line text files in matching subdirectories, | ||||||
|  |     e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt". | ||||||
|  | 
 | ||||||
|  |     GT and OCR directories can also be the same directory, but in this case you need | ||||||
|  |     to give --gt-suffix and --ocr-suffix explicitly. | ||||||
|  | 
 | ||||||
|  |     The GT and OCR directories are usually ground truth line texts and the results of | ||||||
|     an OCR software, but you may use dinglehopper to compare two OCR results. In |     an OCR software, but you may use dinglehopper to compare two OCR results. In | ||||||
|     that case, use --no-metrics to disable the then meaningless metrics and also |     that case, use --no-metrics to disable the then meaningless metrics and also | ||||||
|     change the color scheme from green/red to blue. |     change the color scheme from green/red to blue. | ||||||
|  | @ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics): | ||||||
|     $REPORT_PREFIX defaults to "report". The reports include the character error |     $REPORT_PREFIX defaults to "report". The reports include the character error | ||||||
|     rate (CER) and the word error rate (WER). |     rate (CER) and the word error rate (WER). | ||||||
| 
 | 
 | ||||||
|  |     It is recommended to specify the encoding of the text files, for example with | ||||||
|  |     --plain-encoding utf-8. If this option is not given, we try to auto-detect it. | ||||||
|     """ |     """ | ||||||
|     initLogging() |     initLogging() | ||||||
|     process(gt, ocr, report_prefix, metrics=metrics) |     process( | ||||||
|  |         gt, | ||||||
|  |         ocr, | ||||||
|  |         report_prefix, | ||||||
|  |         metrics=metrics, | ||||||
|  |         gt_suffix=gt_suffix, | ||||||
|  |         ocr_suffix=ocr_suffix, | ||||||
|  |         plain_encoding=plain_encoding, | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|  |  | ||||||
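The new helpers added above can also be used directly from Python. A rough sketch, assuming a layout with separate `gt/` and `ocr/` directories (placeholder paths): `find_gt_and_ocr_files` pairs files by suffix and raises `RuntimeError` when a GT file has no OCR counterpart, while `process()` writes the report files, as in the integration test added later in this changeset.

```python
from dinglehopper.cli_line_dirs import find_gt_and_ocr_files, process

# Explicit pairing by suffix, mirroring what the CLI does internally.
for gt_fn, ocr_fn in find_gt_and_ocr_files("gt/", ".gt.txt", "ocr/", ".some-ocr.txt"):
    print(gt_fn, "->", ocr_fn)

# Produce report.html / report.json for the whole directory pair.
process(
    "gt/",
    "ocr/",
    "report",
    gt_suffix=".gt.txt",
    ocr_suffix=".some-ocr.txt",
    plain_encoding="utf-8",
)
```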
src/dinglehopper/extracted_text.py

|  | @ -149,7 +149,7 @@ class ExtractedText: | ||||||
|                 raise ValueError("Can't have joiner without segments to join") |                 raise ValueError("Can't have joiner without segments to join") | ||||||
|         if self.segments is not None: |         if self.segments is not None: | ||||||
|             if value not in ("", " ", "\n"): |             if value not in ("", " ", "\n"): | ||||||
|                 raise ValueError(f"Unexcepted segment joiner value {repr(value)}") |                 raise ValueError(f"Unexpected segment joiner value {repr(value)}") | ||||||
| 
 | 
 | ||||||
|     @_text.validator |     @_text.validator | ||||||
|     def is_valid_text(self, _, value): |     def is_valid_text(self, _, value): | ||||||
|  |  | ||||||
src/dinglehopper/ocr_files.py

|  | @ -5,10 +5,13 @@ from typing import Dict, Iterator, Optional | ||||||
| import chardet | import chardet | ||||||
| from lxml import etree as ET | from lxml import etree as ET | ||||||
| from lxml.etree import XMLSyntaxError | from lxml.etree import XMLSyntaxError | ||||||
|  | from ocrd_utils import getLogger | ||||||
| from uniseg.graphemecluster import grapheme_clusters | from uniseg.graphemecluster import grapheme_clusters | ||||||
| 
 | 
 | ||||||
| from .extracted_text import ExtractedText, normalize_sbb | from .extracted_text import ExtractedText, normalize_sbb | ||||||
| 
 | 
 | ||||||
|  | log = getLogger("processor.OcrdDinglehopperEvaluate") | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def alto_namespace(tree: ET._ElementTree) -> Optional[str]: | def alto_namespace(tree: ET._ElementTree) -> Optional[str]: | ||||||
|     """Return the ALTO namespace used in the given ElementTree. |     """Return the ALTO namespace used in the given ElementTree. | ||||||
|  | @ -36,7 +39,7 @@ def alto_extract_lines(tree: ET._ElementTree) -> Iterator[ExtractedText]: | ||||||
|     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): |     for line in tree.iterfind(".//alto:TextLine", namespaces=nsmap): | ||||||
|         line_id = line.attrib.get("ID") |         line_id = line.attrib.get("ID") | ||||||
|         line_text = " ".join( |         line_text = " ".join( | ||||||
|             string.attrib.get("CONTENT") |             string.attrib.get("CONTENT", "") | ||||||
|             for string in line.iterfind("alto:String", namespaces=nsmap) |             for string in line.iterfind("alto:String", namespaces=nsmap) | ||||||
|         ) |         ) | ||||||
|         normalized_text = normalize_sbb(line_text) |         normalized_text = normalize_sbb(line_text) | ||||||
|  | @ -149,7 +152,7 @@ def detect_encoding(filename): | ||||||
|     return chardet.detect(open(filename, "rb").read(1024))["encoding"] |     return chardet.detect(open(filename, "rb").read(1024))["encoding"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def plain_extract(filename, include_filename_in_id=False): | def plain_extract(filename, include_filename_in_id=False, encoding="autodetect"): | ||||||
|     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" |     id_template = "{filename} - line {no}" if include_filename_in_id else "line {no}" | ||||||
| 
 | 
 | ||||||
|     def make_segment(no, line): |     def make_segment(no, line): | ||||||
|  | @ -163,11 +166,18 @@ def plain_extract(filename, include_filename_in_id=False): | ||||||
|             clusters, |             clusters, | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     fileencoding = detect_encoding(filename) |     if encoding == "autodetect": | ||||||
|  |         fileencoding = detect_encoding(filename) | ||||||
|  |         log.warning( | ||||||
|  |             f"Autodetected encoding as '{fileencoding}'" | ||||||
|  |             ", it is recommended to specify it explicitly with --plain-encoding" | ||||||
|  |         ) | ||||||
|  |     else: | ||||||
|  |         fileencoding = encoding | ||||||
|     with open(filename, "r", encoding=fileencoding) as f: |     with open(filename, "r", encoding=fileencoding) as f: | ||||||
|         return ExtractedText( |         return ExtractedText( | ||||||
|             None, |             None, | ||||||
|             [make_segment(no, line) for no, line in enumerate(f.readlines())], |             [make_segment(no, line.strip()) for no, line in enumerate(f.readlines())], | ||||||
|             "\n", |             "\n", | ||||||
|             None, |             None, | ||||||
|             None, |             None, | ||||||
|  | @ -175,11 +185,11 @@ def plain_extract(filename, include_filename_in_id=False): | ||||||
|     # XXX hardcoded SBB normalization |     # XXX hardcoded SBB normalization | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def plain_text(filename): | def plain_text(filename, encoding="autodetect"): | ||||||
|     return plain_extract(filename).text |     return plain_extract(filename, encoding=encoding).text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def extract(filename, *, textequiv_level="region"): | def extract(filename, *, textequiv_level="region", plain_encoding="autodetect"): | ||||||
|     """Extract the text from the given file. |     """Extract the text from the given file. | ||||||
| 
 | 
 | ||||||
|     Supports PAGE, ALTO and falls back to plain text. |     Supports PAGE, ALTO and falls back to plain text. | ||||||
|  | @ -187,7 +197,7 @@ def extract(filename, *, textequiv_level="region"): | ||||||
|     try: |     try: | ||||||
|         tree = ET.parse(filename) |         tree = ET.parse(filename) | ||||||
|     except (XMLSyntaxError, UnicodeDecodeError): |     except (XMLSyntaxError, UnicodeDecodeError): | ||||||
|         return plain_extract(filename) |         return plain_extract(filename, encoding=plain_encoding) | ||||||
|     try: |     try: | ||||||
|         return page_extract(tree, textequiv_level=textequiv_level) |         return page_extract(tree, textequiv_level=textequiv_level) | ||||||
|     except ValueError: |     except ValueError: | ||||||
|  |  | ||||||
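For scripting, the extended `extract()`/`plain_extract()` interface shown above can be called directly. A small sketch (the file names are placeholders) that passes the encoding explicitly, avoiding the autodetection warning introduced in this change:

```python
from dinglehopper.ocr_files import extract, plain_extract

# PAGE/ALTO XML input: plain_encoding is irrelevant, textequiv_level picks the level.
page_text = extract("ocr_page.xml", textequiv_level="line").text

# Plain text input: give the encoding explicitly instead of relying on chardet.
line_text = plain_extract("line0001.some-ocr.txt", encoding="utf-8").text

print(page_text)
print(line_text)
```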
src/dinglehopper/ocrd-tool.json

|  | @ -1,17 +1,13 @@ | ||||||
| { | { | ||||||
|   "version": "0.9.6", |   "version": "0.11.0", | ||||||
|   "git_url": "https://github.com/qurator-spk/dinglehopper", |   "git_url": "https://github.com/qurator-spk/dinglehopper", | ||||||
|  |   "dockerhub": "ocrd/dinglehopper", | ||||||
|   "tools": { |   "tools": { | ||||||
|     "ocrd-dinglehopper": { |     "ocrd-dinglehopper": { | ||||||
|       "executable": "ocrd-dinglehopper", |       "executable": "ocrd-dinglehopper", | ||||||
|  |       "input_file_grp_cardinality": 2, | ||||||
|  |       "output_file_grp_cardinality": 1, | ||||||
|       "description": "Evaluate OCR text against ground truth with dinglehopper", |       "description": "Evaluate OCR text against ground truth with dinglehopper", | ||||||
|       "input_file_grp": [ |  | ||||||
|         "OCR-D-GT-PAGE", |  | ||||||
|         "OCR-D-OCR" |  | ||||||
|       ], |  | ||||||
|       "output_file_grp": [ |  | ||||||
|         "OCR-D-OCR-EVAL" |  | ||||||
|       ], |  | ||||||
|       "categories": [ |       "categories": [ | ||||||
|         "Quality assurance" |         "Quality assurance" | ||||||
|       ], |       ], | ||||||
|  | @ -29,6 +25,11 @@ | ||||||
|           "enum": ["region", "line"], |           "enum": ["region", "line"], | ||||||
|           "default": "region", |           "default": "region", | ||||||
|           "description": "PAGE XML hierarchy level to extract the text from" |           "description": "PAGE XML hierarchy level to extract the text from" | ||||||
|  |         }, | ||||||
|  |         "plain_encoding": { | ||||||
|  |           "type": "string", | ||||||
|  |           "default": "autodetect", | ||||||
|  |           "description": "Encoding (e.g. \"utf-8\") of plain text files" | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
src/dinglehopper/ocrd_cli.py

|  | @ -1,78 +1,78 @@ | ||||||
| import json | from functools import cached_property | ||||||
| import os | import os | ||||||
|  | from typing import Optional | ||||||
| 
 | 
 | ||||||
| import click | import click | ||||||
|  | from ocrd_models import OcrdFileType | ||||||
| from ocrd import Processor | from ocrd import Processor | ||||||
| from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor | ||||||
| from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id | from ocrd_utils import make_file_id | ||||||
| from pkg_resources import resource_string |  | ||||||
| 
 | 
 | ||||||
| from .cli import process as cli_process | from .cli import process as cli_process | ||||||
| 
 | 
 | ||||||
| OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8")) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @click.command() | @click.command() | ||||||
| @ocrd_cli_options | @ocrd_cli_options | ||||||
| def ocrd_dinglehopper(*args, **kwargs): | def ocrd_dinglehopper(*args, **kwargs): | ||||||
|     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) |     return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| class OcrdDinglehopperEvaluate(Processor): | class OcrdDinglehopperEvaluate(Processor): | ||||||
|     def __init__(self, *args, **kwargs): |  | ||||||
|         kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] |  | ||||||
|         super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) |  | ||||||
| 
 | 
 | ||||||
|     def process(self): |     @cached_property | ||||||
|         assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") |     def executable(self): | ||||||
|         assert_file_grp_cardinality(self.output_file_grp, 1) |         return 'ocrd-dinglehopper' | ||||||
| 
 | 
 | ||||||
|         log = getLogger("processor.OcrdDinglehopperEvaluate") |     def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: | ||||||
| 
 | 
 | ||||||
|  |         assert self.parameter | ||||||
|         metrics = self.parameter["metrics"] |         metrics = self.parameter["metrics"] | ||||||
|         textequiv_level = self.parameter["textequiv_level"] |         textequiv_level = self.parameter["textequiv_level"] | ||||||
|         gt_grp, ocr_grp = self.input_file_grp.split(",") |         plain_encoding = self.parameter["plain_encoding"] | ||||||
| 
 | 
 | ||||||
|         input_file_tuples = self.zip_input_files(on_error="abort") |         # wrong number of inputs: let fail | ||||||
|         for n, (gt_file, ocr_file) in enumerate(input_file_tuples): |         gt_file, ocr_file = input_files | ||||||
|             if not gt_file or not ocr_file: |         # missing on either side: skip (zip_input_files already warned) | ||||||
|                 # file/page was not found in this group |         if not gt_file or not ocr_file: | ||||||
|                 continue |             return | ||||||
|             gt_file = self.workspace.download_file(gt_file) |         # missing download (i.e. OCRD_DOWNLOAD_INPUT=false): | ||||||
|             ocr_file = self.workspace.download_file(ocr_file) |         if not gt_file.local_filename: | ||||||
|             page_id = gt_file.pageId |             if config.OCRD_MISSING_INPUT == 'ABORT': | ||||||
|  |                 raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) | ||||||
|  |             return | ||||||
|  |         if not ocr_file.local_filename: | ||||||
|  |             if config.OCRD_MISSING_INPUT == 'ABORT': | ||||||
|  |                 raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) | ||||||
|  |             return | ||||||
| 
 | 
 | ||||||
|             log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) |         page_id = gt_file.pageId | ||||||
| 
 | 
 | ||||||
|             file_id = make_file_id(ocr_file, self.output_file_grp) |         file_id = make_file_id(ocr_file, self.output_file_grp) | ||||||
|             report_prefix = os.path.join(self.output_file_grp, file_id) |         cli_process( | ||||||
|  |             gt_file.local_filename, | ||||||
|  |             ocr_file.local_filename, | ||||||
|  |             file_id, | ||||||
|  |             self.output_file_grp, | ||||||
|  |             metrics=metrics, | ||||||
|  |             textequiv_level=textequiv_level, | ||||||
|  |             plain_encoding=plain_encoding, | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|             # Process the files |         # Add reports to the workspace | ||||||
|             try: |         for report_suffix, mimetype in [ | ||||||
|                 os.mkdir(self.output_file_grp) |             [".html", "text/html"], | ||||||
|             except FileExistsError: |             [".json", "application/json"], | ||||||
|                 pass |         ]: | ||||||
|             cli_process( |             output_file_id = file_id + report_suffix | ||||||
|                 gt_file.local_filename, |             output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) | ||||||
|                 ocr_file.local_filename, |             if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': | ||||||
|                 report_prefix, |                 raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set") | ||||||
|                 metrics=metrics, |             self.workspace.add_file( | ||||||
|                 textequiv_level=textequiv_level, |                file_id=output_file_id, | ||||||
|  |                 file_grp=self.output_file_grp, | ||||||
|  |                 page_id=page_id, | ||||||
|  |                 mimetype=mimetype, | ||||||
|  |                 local_filename=file_id + report_suffix, | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|             # Add reports to the workspace |  | ||||||
|             for report_suffix, mimetype in [ |  | ||||||
|                 [".html", "text/html"], |  | ||||||
|                 [".json", "application/json"], |  | ||||||
|             ]: |  | ||||||
|                 self.workspace.add_file( |  | ||||||
|                     file_id=file_id + report_suffix, |  | ||||||
|                     file_grp=self.output_file_grp, |  | ||||||
|                     page_id=page_id, |  | ||||||
|                     mimetype=mimetype, |  | ||||||
|                     local_filename=report_prefix + report_suffix, |  | ||||||
|                 ) |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     ocrd_dinglehopper() |     ocrd_dinglehopper() | ||||||
|  |  | ||||||
							
								
								
									
src/dinglehopper/tests/data/line_dirs/basic/gt/a.gt.txt (new file, 1 line)
							|  | @ -0,0 +1 @@ | ||||||
|  | This is a test. | ||||||
							
								
								
									
src/dinglehopper/tests/data/line_dirs/basic/gt/b.gt.txt (new file, 1 line)
							|  | @ -0,0 +1 @@ | ||||||
|  | Another test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | Tis is a test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | AnÖther test. | ||||||
							
								
								
									
src/dinglehopper/tests/data/line_dirs/merged/a/a.gt.txt (new file, 1 line)
							|  | @ -0,0 +1 @@ | ||||||
|  | This is a test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | Tis is a test. | ||||||
							
								
								
									
src/dinglehopper/tests/data/line_dirs/merged/b/b.gt.txt (new file, 1 line)
							|  | @ -0,0 +1 @@ | ||||||
|  | Another test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | AnÖther test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | This is a test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | Another test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | Tis is a test. | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | AnÖther test. | ||||||
							
								
								
									
src/dinglehopper/tests/test_integ_cli_line_dirs.py (new file, 61 lines)
							|  | @ -0,0 +1,61 @@ | ||||||
|  | import json | ||||||
|  | import os.path | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from ..cli_line_dirs import process | ||||||
|  | from .util import working_directory | ||||||
|  | 
 | ||||||
|  | data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.integration | ||||||
|  | def test_cli_line_dirs_basic(tmp_path): | ||||||
|  |     """Test that the cli/process() produces a good report""" | ||||||
|  | 
 | ||||||
|  |     with working_directory(tmp_path): | ||||||
|  |         gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") | ||||||
|  |         ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") | ||||||
|  |         process(gt_dir, ocr_dir, "report") | ||||||
|  |         with open("report.json", "r") as jsonf: | ||||||
|  |             print(jsonf.read()) | ||||||
|  |         with open("report.json", "r") as jsonf: | ||||||
|  |             j = json.load(jsonf) | ||||||
|  |             assert j["cer"] == pytest.approx(0.1071429) | ||||||
|  |             assert j["wer"] == pytest.approx(0.5) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.integration | ||||||
|  | def test_cli_line_dirs_basic_report_diff(tmp_path): | ||||||
|  |     """Test that the cli/process() produces a report wiff char+word diff""" | ||||||
|  | 
 | ||||||
|  |     with working_directory(tmp_path): | ||||||
|  |         gt_dir = os.path.join(data_dir, "line_dirs/basic/gt") | ||||||
|  |         ocr_dir = os.path.join(data_dir, "line_dirs/basic/ocr") | ||||||
|  |         process(gt_dir, ocr_dir, "report") | ||||||
|  | 
 | ||||||
|  |         with open("report.html", "r") as htmlf: | ||||||
|  |             html_report = htmlf.read() | ||||||
|  | 
 | ||||||
|  |     # Counting GT lines in the diff | ||||||
|  |     assert len(re.findall(r"gt.*l\d+-cdiff", html_report)) == 2 | ||||||
|  |     assert len(re.findall(r"gt.*l\d+-wdiff", html_report)) == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.integration | ||||||
|  | def test_cli_line_dirs_merged(tmp_path): | ||||||
|  |     """Test that the cli/process() produces a good report""" | ||||||
|  | 
 | ||||||
|  |     with working_directory(tmp_path): | ||||||
|  |         gt_dir = os.path.join(data_dir, "line_dirs/merged") | ||||||
|  |         ocr_dir = os.path.join(data_dir, "line_dirs/merged") | ||||||
|  |         process( | ||||||
|  |             gt_dir, ocr_dir, "report", gt_suffix=".gt.txt", ocr_suffix=".some-ocr.txt" | ||||||
|  |         ) | ||||||
|  |         with open("report.json", "r") as jsonf: | ||||||
|  |             print(jsonf.read()) | ||||||
|  |         with open("report.json", "r") as jsonf: | ||||||
|  |             j = json.load(jsonf) | ||||||
|  |             assert j["cer"] == pytest.approx(0.1071429) | ||||||
|  |             assert j["wer"] == pytest.approx(0.5) | ||||||
|  | @ -1,4 +1,5 @@ | ||||||
| import json | import json | ||||||
|  | import re | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
|  | @ -40,3 +41,25 @@ def test_cli_json_cer_is_infinity(tmp_path): | ||||||
|         with open("report.json", "r") as jsonf: |         with open("report.json", "r") as jsonf: | ||||||
|             j = json.load(jsonf) |             j = json.load(jsonf) | ||||||
|             assert j["cer"] == pytest.approx(float("inf")) |             assert j["cer"] == pytest.approx(float("inf")) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.integration | ||||||
|  | def test_cli_html(tmp_path): | ||||||
|  |     """Test that the cli/process() yields complete HTML report""" | ||||||
|  | 
 | ||||||
|  |     with working_directory(tmp_path): | ||||||
|  |         with open("gt.txt", "w") as gtf: | ||||||
|  |             gtf.write("AAAAA") | ||||||
|  |         with open("ocr.txt", "w") as ocrf: | ||||||
|  |             ocrf.write("AAAAB") | ||||||
|  | 
 | ||||||
|  |         process("gt.txt", "ocr.txt", "report") | ||||||
|  | 
 | ||||||
|  |         with open("report.html", "r") as htmlf: | ||||||
|  |             html_report = htmlf.read() | ||||||
|  |             print(html_report) | ||||||
|  | 
 | ||||||
|  |         assert re.search(r"CER: 0\.\d+", html_report) | ||||||
|  |         assert re.search(r"WER: 1\.0", html_report) | ||||||
|  |         assert len(re.findall("gt.*cdiff", html_report)) == 1 | ||||||
|  |         assert len(re.findall("gt.*wdiff", html_report)) == 1 | ||||||
src/dinglehopper/tests/test_line_dirs.py (Normal file, 71 lines)
|  | @ -0,0 +1,71 @@ | ||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | from ..cli_line_dirs import find_gt_and_ocr_files, find_gt_and_ocr_files_autodetect | ||||||
|  | 
 | ||||||
|  | data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_basic(): | ||||||
|  |     """Test the dumb method: User gives directories and suffixes.""" | ||||||
|  |     pairs = list( | ||||||
|  |         find_gt_and_ocr_files( | ||||||
|  |             os.path.join(data_dir, "line_dirs/basic/gt"), | ||||||
|  |             ".gt.txt", | ||||||
|  |             os.path.join(data_dir, "line_dirs/basic/ocr"), | ||||||
|  |             ".some-ocr.txt", | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     assert len(pairs) == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_basic_autodetect(): | ||||||
|  |     """Test autodetect: User gives directories, suffixes are autodetected if possible""" | ||||||
|  |     pairs = list( | ||||||
|  |         find_gt_and_ocr_files_autodetect( | ||||||
|  |             os.path.join(data_dir, "line_dirs/basic/gt"), | ||||||
|  |             os.path.join(data_dir, "line_dirs/basic/ocr"), | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     assert len(pairs) == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_subdirs(): | ||||||
|  |     """Test the dumb method: Should also work when subdirectories are involved.""" | ||||||
|  |     pairs = list( | ||||||
|  |         find_gt_and_ocr_files( | ||||||
|  |             os.path.join(data_dir, "line_dirs/subdirs/gt"), | ||||||
|  |             ".gt.txt", | ||||||
|  |             os.path.join(data_dir, "line_dirs/subdirs/ocr"), | ||||||
|  |             ".some-ocr.txt", | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     assert len(pairs) == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_subdirs_autodetect(): | ||||||
|  |     """Test the autodetect method: Should also work when subdirectories are involved.""" | ||||||
|  |     pairs = list( | ||||||
|  |         find_gt_and_ocr_files_autodetect( | ||||||
|  |             os.path.join(data_dir, "line_dirs/subdirs/gt"), | ||||||
|  |             os.path.join(data_dir, "line_dirs/subdirs/ocr"), | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     assert len(pairs) == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_merged(): | ||||||
|  |     """Test the dumb method: GT and OCR texts are in the same directories.""" | ||||||
|  |     pairs = list( | ||||||
|  |         find_gt_and_ocr_files( | ||||||
|  |             os.path.join(data_dir, "line_dirs/merged"), | ||||||
|  |             ".gt.txt", | ||||||
|  |             os.path.join(data_dir, "line_dirs/merged"), | ||||||
|  |             ".some-ocr.txt", | ||||||
|  |         ) | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  |     assert len(pairs) == 2 | ||||||
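A small usage sketch of the two helpers exercised above. The directory names are placeholders, and pairing GT and OCR files by shared base name is an assumption drawn from the fixtures (a.gt.txt alongside a.some-ocr.txt):

    from dinglehopper.cli_line_dirs import (
        find_gt_and_ocr_files,
        find_gt_and_ocr_files_autodetect,
    )

    # Explicit suffixes: e.g. pairs gt_dir/a.gt.txt with ocr_dir/a.some-ocr.txt
    pairs = list(find_gt_and_ocr_files("gt_dir", ".gt.txt", "ocr_dir", ".some-ocr.txt"))

    # Autodetected suffixes: only the two directories are given
    pairs = list(find_gt_and_ocr_files_autodetect("gt_dir", "ocr_dir"))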
|  | @ -177,8 +177,20 @@ def test_text(): | ||||||
| def test_plain(tmp_path): | def test_plain(tmp_path): | ||||||
|     with working_directory(tmp_path): |     with working_directory(tmp_path): | ||||||
|         with open("ocr.txt", "w") as ocrf: |         with open("ocr.txt", "w") as ocrf: | ||||||
|             ocrf.write("AAAAB") |             ocrf.write("First, a line.\nAnd a second line.\n") | ||||||
| 
 | 
 | ||||||
|         result = plain_text("ocr.txt") |         result = plain_text("ocr.txt") | ||||||
|         expected = "AAAAB" |         expected = "First, a line.\nAnd a second line." | ||||||
|  |         assert result == expected | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_plain_BOM(tmp_path): | ||||||
|  |     """Test that plain text files with BOM are read correctly.""" | ||||||
|  |     BOM = "\ufeff" | ||||||
|  |     with working_directory(tmp_path): | ||||||
|  |         with open("ocr.txt", "w") as ocrf: | ||||||
|  |             ocrf.write(BOM + "First, a line.\nAnd a second line.\n") | ||||||
|  | 
 | ||||||
|  |         result = plain_text("ocr.txt") | ||||||
|  |         expected = "First, a line.\nAnd a second line." | ||||||
|         assert result == expected |         assert result == expected | ||||||
|  |  | ||||||
|  | @ -21,12 +21,17 @@ def patch_word_break(): | ||||||
|     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt |     https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt | ||||||
|     """ |     """ | ||||||
|     old_word_break = uniseg.wordbreak.word_break |     old_word_break = uniseg.wordbreak.word_break | ||||||
|  |     if hasattr(uniseg.wordbreak, 'Word_Break'): | ||||||
|  |         aletter = uniseg.wordbreak.Word_Break.ALetter | ||||||
|  |     else: | ||||||
|  |         # uniseg<0.9 | ||||||
|  |         aletter = uniseg.wordbreak.WordBreak.ALETTER | ||||||
| 
 | 
 | ||||||
|     def new_word_break(c, index=0): |     def new_word_break(c): | ||||||
|         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area |         if 0xE000 <= ord(c) <= 0xF8FF:  # Private Use Area | ||||||
|             return uniseg.wordbreak.WordBreak.ALETTER |             return aletter | ||||||
|         else: |         else: | ||||||
|             return old_word_break(c, index) |             return old_word_break(c) | ||||||
| 
 | 
 | ||||||
|     uniseg.wordbreak.word_break = new_word_break |     uniseg.wordbreak.word_break = new_word_break | ||||||
|     global word_break_patched |     global word_break_patched | ||||||
|  |  | ||||||
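To illustrate what the patch is for (a sketch under assumptions, not part of the diff): with patch_word_break() applied, a Private Use Area character such as U+E000 is reported as a letter, so uniseg should keep it inside the surrounding word instead of splitting on it.

    # Illustration only; the exact segments depend on the uniseg version in use.
    import uniseg.wordbreak

    text = "foo\ue000bar baz"
    print(list(uniseg.wordbreak.words(text)))
    # unpatched: '\ue000' is not a letter, so 'foo' and 'bar' come out as separate segments
    # patched:   expected something like ['foo\ue000bar', ' ', 'baz']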