mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-31 03:04:12 +01:00 
			
		
		
		
	✨ Validate PAGE XML after OCR
This commit is contained in:
		
							parent
							
								
									0d7fd21446
								
							
						
					
					
						commit
						8b67866aac
					
				
					 1 changed file with 1 addition and 16 deletions
				
			
		|  | @ -81,21 +81,6 @@ do_ocr() { | ||||||
|     -p <(echo $ocrd_tesserocr_recognize_parameters) |     -p <(echo $ocrd_tesserocr_recognize_parameters) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| page_fix_xml() { |  | ||||||
|   # Fix the PAGE XML generated by OCR-D core |  | ||||||
|   # |  | ||||||
|   # XXX core does not produce valid XML (See https://github.com/OCR-D/core/issues/242), fix it by setting the correct |  | ||||||
|   # PAGE XML version. This makes PAGE Viewer open the file. |  | ||||||
| 
 |  | ||||||
|   filegrp=$1 |  | ||||||
| 
 |  | ||||||
|   local file |  | ||||||
|   for file in `ocrd workspace find -G $filegrp`; do |  | ||||||
|     sed -i 's#pagecontent/2017-07-15#pagecontent/2019-07-15#g' $file |  | ||||||
|     sed -i 's#pagecontent/2018-07-15#pagecontent/2019-07-15#g' $file |  | ||||||
|   done |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| page_validate_xml() { | page_validate_xml() { | ||||||
|   # Validate all PAGE XML against the XML schema |   # Validate all PAGE XML against the XML schema | ||||||
| 
 | 
 | ||||||
|  | @ -133,9 +118,9 @@ page_validate_xml         OCR-D-SEG-LINE | ||||||
| do_validate | do_validate | ||||||
| 
 | 
 | ||||||
| do_ocr | do_ocr | ||||||
|  | page_validate_xml         OCR-D-OCR-TESS | ||||||
| do_validate | do_validate | ||||||
| 
 | 
 | ||||||
| page_fix_xml              OCR-D-OCR-TESS  # XXX is it necessary anymore? |  | ||||||
| page_fix_image_references OCR-D-OCR-TESS | page_fix_image_references OCR-D-OCR-TESS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue