c4ab7c9a7c 
								
							 
						 
						
							
							
								
								🕸Do not use deprecated ID, pageId options  
							
							... 
							
							
							
							See gh-75. 
							
						 
						
							2023-03-14 13:16:28 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								833efa37da 
								
							 
						 
						
							
							
								
								🐛  Remove deprecated declare_namespace call  
							
							... 
							
							
							
							Remove deprecated declare_namespace call and use implicit namespace (PEP-0420).
Fixes gh-76. 
							
						 
						
							2023-03-14 12:44:22 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								0fd4ea1973 
								
							 
						 
						
							
							
								
								✔ Add @cneud's former 40 GB problem files to the test suite  
							
							
							
						 
						
							2023-03-02 16:24:08 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								0f0819512e 
								
							 
						 
						
							
							
								
								🎨  Reformat using Black  
							
							
							
						 
						
							2023-03-02 10:22:51 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								dcc10c5389 
								
							 
						 
						
							
							
								
								✔️  Skip test_lines_similar() for now  
							
							... 
							
							
							
							test_lines_similar() fails with rapidfuzz 2.5 and is flawed anyway:
The test was based on our own implementation that used __eq__ and not __hash__ as
rapidfuzz does. Need to review this in the future. 
							
						 
						
							2022-08-18 15:51:16 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								555f586775 
								
							 
						 
						
							
							
								
								📝  Note that old terminals might not render the Unicode characters correctly  
							
							
							
						 
						
							2022-08-17 17:59:15 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								c4e85da5ab 
								
							 
						 
						
							
							
								
								🐛  Update editops() and seq_align() due to RapidFuzz API changes  
							
							
							
						 
						
							2022-08-17 17:55:44 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								15dfbac3a7 
								
							 
						 
						
							
							
								
								Revert "Revert "Merge pull request  #67  from maxbachmann/rapidfuzz""  
							
							... 
							
							
							
							This reverts commit 76bd50f1db 
							
						 
						
							2022-08-17 11:42:19 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								76bd50f1db 
								
							 
						 
						
							
							
								
								Revert "Merge pull request  #67  from maxbachmann/rapidfuzz"  
							
							... 
							
							
							
							This reverts commit 85f751aacc1febea8c92 
							
						 
						
							2022-08-16 19:31:28 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Max Bachmann 
								
							 
						 
						
							
							
							
							
								
							
							
								e543438496 
								
							 
						 
						
							
							
								
								replace usage of deprecated rapidfuzz APIs  
							
							
							
						 
						
							2022-08-07 10:40:31 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								d726396002 
								
							 
						 
						
							
							
								
								👷🏾♂️  Remove str() on Path objects  
							
							... 
							
							
							
							As of Python 3.6 we don't need to call str() on Path objects anymore.
See also gh-20. 
							
						 
						
							2022-03-02 11:19:40 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								8a3f5e48c2 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Patch word_break only once  
							
							... 
							
							
	
		
			
	 
	
	
		
	
	
		
			
				
	continuous-integration/drone/push Build encountered an error 
				
			 
		
		
	 
 
	 
							
							Previously, we (accidentally) patched uniseg's word_break on every call
to words(). Do it only once. 
							
						 
						
							2022-01-24 18:44:30 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								f77ce857b2 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Share json_float code  
							
							
	
		
			
	 
	
	
		
	
	
		
			
				
	continuous-integration/drone/push Build encountered an error 
				
			 
		
		
	 
 
	 
							
						 
						
							2021-12-14 18:37:07 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								5b394649a7 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Compute WER in line-dirs CLI  
							
							
							
						 
						
							2021-12-14 18:33:20 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								cb2be96179 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Add word differences in line-dirs report  
							
							
							
						 
						
							2021-12-14 18:20:04 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								dbb660615a 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Compare line text directories (WIP)  
							
							
	
		
			
	 
	
	
		
	
	
		
			
				
	continuous-integration/drone/push Build encountered an error 
				
			 
		
		
	 
 
	 
							
						 
						
							2021-12-14 11:37:07 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								a018006f98 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Compare line text directories (WIP)  
							
							
							
						 
						
							2021-12-14 11:37:07 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								36b36f6986 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Compare line text directories (WIP)  
							
							
							
						 
						
							2021-12-14 11:37:07 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								06ea38449c 
								
							 
						 
						
							
							
								
								📝  dinglehopper: Update Levenshtein notebook  
							
							
							
						 
						
							2021-10-22 16:58:40 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								3ee688001a 
								
							 
						 
						
							
							
								
								🧹  dinglehopper: Directly import levenshtein() from rapidfuzz  
							
							
							
						 
						
							2021-10-22 16:30:21 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								5d496df267 
								
							 
						 
						
							
							
								
								⚡  dinglehopper: Remove tests that only test rapidfuzz's levenshtein()  
							
							
							
						 
						
							2021-10-22 16:26:55 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								091f069b3c 
								
							 
						 
						
							
							
								
								⚡  dinglehopper: Remove tests that only test rapidfuzz's levenshtein_ops()  
							
							
							
						 
						
							2021-10-22 16:21:16 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								af8da1d716 
								
							 
						 
						
							
							
								
								⚡  dinglehopper: Use rapidfuzz for editops  
							
							
							
						 
						
							2021-10-22 15:38:59 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								249787686f 
								
							 
						 
						
							
							
								
								Merge branch 'master' of github.com:qurator-spk/dinglehopper  
							
							
	
		
			
	 
	
	
		
	
	
		
			
				
	continuous-integration/drone/push Build is failing 
				
			 
		
		
	 
 
	 
							
						 
						
							2021-05-20 09:42:15 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								2a6cc5823e 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Call initLogging before logging  
							
							... 
							
							
							
							When using ocrd_utils' getLogger(), we need to call initLogging() before doing any
logging.
Fixes  #55 . 
							
						 
						
							2021-05-20 09:39:09 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Konstantin Baierer 
								
							 
						 
						
							
							
							
							
								
							
							
								7fde00d911 
								
							 
						 
						
							
							
								
								ReadingOrder may also contain UnorderedGroupIndexed  
							
							
							
						 
						
							2021-05-18 17:34:08 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								1778b36a9a 
								
							 
						 
						
							
							
								
								🚧  dinglehopper: Read PAGE UnorderedGroup in XML order  
							
							
							
						 
						
							2021-04-15 21:09:45 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Benjamin Rosemann 
								
							 
						 
						
							
							
							
							
								
							
							
								a68fc269d9 
								
							 
						 
						
							
							
								
								Fix the extraction of text from Page with TableRegion  
							
							... 
							
							
							
							Dinglehopper did not consider `OrderedGroupIndex` in the `ReadingOrder`
element when extracting text regions. As a consequence a `TableRegion`
was not considered for text extraction. 
							
						 
						
							2020-11-27 11:18:11 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Konstantin Baierer 
								
							 
						 
						
							
							
							
							
								
							
							
								74e0ac18ed 
								
							 
						 
						
							
							
								
								ocrd cli: use core-provided zip_input_files method  
							
							
							
						 
						
							2020-11-19 16:00:28 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								389e253c11 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Fix alto_extract_lines()'s type annotation  
							
							
							
						 
						
							2020-11-12 19:32:38 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								fe3923a8af 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Fix alto_extract()'s type annotation  
							
							
							
						 
						
							2020-11-12 19:19:05 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								132f91d500 
								
							 
						 
						
							
							
								
								✔️  dinglehopper: Add missing integration test markers  
							
							
							
						 
						
							2020-11-12 19:10:23 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Benjamin Rosemann 
								
							 
						 
						
							
							
							
							
								
							
							
								ce752e1912 
								
							 
						 
						
							
							
								
								Remove .idea folder and modify .gitignore  
							
							... 
							
							
							
							Sharing even parts of the .idea folder in a worldwide setting is bound to
generate more problems than solutions. Therefore it should be removed
and consequently ignored in .gitignore.
Also adds some Python specific stuff to the .gitignore file. 
							
						 
						
							2020-11-11 11:36:17 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Benjamin Rosemann 
								
							 
						 
						
							
							
							
							
								
							
							
								5270737c1f 
								
							 
						 
						
							
							
								
								Skip test on windows because it is unix specific.  
							
							
							
						 
						
							2020-11-11 11:36:17 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								32a4b95a99 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Normalize in plain_extract()  
							
							
							
						 
						
							2020-11-10 18:51:14 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								14421c8e53 
								
							 
						 
						
							
							
								
								🎨  dinglehopper: Reformat using black  
							
							
							
						 
						
							2020-11-10 12:29:55 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								31c63f9e4c 
								
							 
						 
						
							
							
								
								🎨  dinglehopper: s/LOG/log  
							
							
							
						 
						
							2020-11-09 16:55:43 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Robert Sachunsky 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a60c14351e 
								
							 
						 
						
							
							
								
								1 more update for core's getLogger context  
							
							
							
						 
						
							2020-11-03 17:46:59 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Benjamin Rosemann 
								
							 
						 
						
							
							
							
							
								
							
							
								c02569b41e 
								
							 
						 
						
							
							
								
								Fix f-strings for Python 3.5  
							
							
							
						 
						
							2020-10-29 12:33:54 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Benjamin Rosemann 
								
							 
						 
						
							
							
							
							
								
							
							
								7b27b2834e 
								
							 
						 
						
							
							
								
								More complex sorting for text extraction  
							
							... 
							
							
							
							When extracting text from TextEquiv nodes we may encounter nodes without
index or nodes that should get sorted via the conf attribute.
Therefore we added a more complex algorithm to extract a TextEquiv and
inform the user via log messages if we encounter structures that we can
handle but may produce unexpected results. 
							
						 
						
							2020-10-29 10:03:40 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
								
								
									Benjamin Rosemann 
								
							 
						 
						
							
							
							
							
								
							
							
								6ff831dfd2 
								
							 
						 
						
							
							
								
								Sort textlines with missing indices  
							
							... 
							
							
							
							Python's `sorted` method will fail with a TypeError when called with
`None` and Integers:
```python
>>> sorted([None, 1])
TypeError: '<' not supported between instances of 'int' and 'NoneType'
```
Therefore we are using `float('inf')` instead of `None` in case of
missing textline indices. 
							
						 
						
							2020-10-29 10:03:40 +01:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								5cbe148741 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Skip pages if there is no GT nor OCR (Fixes GH-34)  
							
							
							
						 
						
							2020-10-21 19:29:45 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								e4e2777cb7 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Do try to get text when no TextEquivs exist  
							
							
							
						 
						
							2020-10-21 17:59:44 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								1c88891a98 
								
							 
						 
						
							
							
								
								✔️  Add test data for LAREX's indexed TextEquivs (unused)  
							
							
							
						 
						
							2020-10-21 17:51:15 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								19d15e3ecc 
								
							 
						 
						
							
							
								
								🐛  dinglehopper: Honor TextEquiv index (Closes GH-33)  
							
							
							
						 
						
							2020-10-21 17:50:21 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								f626a2ebe6 
								
							 
						 
						
							
							
								
								🧹  dinglehopper: Remove warning when there is a non-TextRegion in the ReadingOrder  
							
							
							
						 
						
							2020-10-21 17:03:55 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								8b4ee20a40 
								
							 
						 
						
							
							
								
								✨  Add a new CLI tool dinglehopper-extract to just give the extracted text  
							
							
							
						 
						
							2020-10-21 16:30:48 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								b23b75b601 
								
							 
						 
						
							
							
								
								✨  dinglehopper: Give segment ids from the extracted textequiv_level  
							
							
							
						 
						
							2020-10-21 16:04:33 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								b23e4ce30e 
								
							 
						 
						
							
							
								
								✨  dinglehopper: Add OCR-D parameter to choose TextEquiv level  
							
							
							
						 
						
							2020-10-21 14:38:19 +02:00 
							
								 
							
							
								 
							
						 
					 
				
					
						
							
						 
						
							
							
							
							
								
							
							
								9744fa2567 
								
							 
						 
						
							
							
								
								✨  dinglehopper: Add CLI option to choose TextEquiv level  
							
							
							
						 
						
							2020-10-20 19:33:39 +02:00