mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-31 03:04:12 +01:00 
			
		
		
		
	Merge branch 'master' of github.com:mikegerber/my_ocrd_workflow
This commit is contained in:
		
						commit
						4e37a52899
					
				
					 5 changed files with 12 additions and 56 deletions
				
			
		|  | @ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection | |||
| 
 | ||||
| 
 | ||||
| # Install requirements | ||||
| # Using pipdeptree here to get more info than from pip3 check | ||||
| COPY requirements.txt /tmp/ | ||||
| RUN pip3 install --no-cache-dir --upgrade pip && \ | ||||
|     pip3 install --no-cache-dir -r /tmp/requirements.txt && \ | ||||
|     pip3 check | ||||
|     pip3 install --no-cache-dir pipdeptree && \ | ||||
|     pipdeptree -w fail | ||||
| 
 | ||||
| 
 | ||||
| COPY my_ocrd_workflow /usr/bin/ | ||||
| COPY xsd/*            /usr/share/xml/ | ||||
|  |  | |||
|  | @ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it; | |||
| the results are viewable as explained above. | ||||
| 
 | ||||
| ppn2ocr requires a working Docker setup and properly set up environment | ||||
| variables for the proxy configuration. At SBB, this means: | ||||
| ~~~ | ||||
| export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/ | ||||
| export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY | ||||
| export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de | ||||
| ~~~ | ||||
| variables for the proxy configuration. At SBB, this means following | ||||
| `howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md` | ||||
| (in qurator's mono-repo). | ||||
|  |  | |||
|  | @ -1,47 +0,0 @@ | |||
| """Check FULLTEXT ALTO page dimensions against BEST image dimensions""" | ||||
| 
 | ||||
| import PIL.Image | ||||
| import sys | ||||
| from ocrd.workspace import Workspace | ||||
| from ocrd.resolver import Resolver | ||||
| from lxml import etree as ET | ||||
| 
 | ||||
| 
 | ||||
| def alto_namespace(tree): | ||||
|     """ | ||||
|     Return the ALTO namespace used in the given ElementTree. | ||||
| 
 | ||||
|     This relies on the assumption that, in any given ALTO file, the root | ||||
|     element has the local name "alto". We do not check if the file uses any | ||||
|     valid ALTO namespace. | ||||
|     """ | ||||
|     root_name = ET.QName(tree.getroot().tag) | ||||
|     if root_name.localname == 'alto': | ||||
|         return root_name.namespace | ||||
|     else: | ||||
|         raise ValueError('Not an ALTO tree') | ||||
| 
 | ||||
| 
 | ||||
| exit_code = 0 | ||||
| workspace = Workspace(Resolver(), '.') | ||||
| 
 | ||||
| for n, page_id in enumerate(workspace.mets.physical_pages): | ||||
|     gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0] | ||||
|     img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0] | ||||
|     gt_file = workspace.download_file(gt_file) | ||||
|     img_file = workspace.download_file(img_file) | ||||
| 
 | ||||
|     tree = ET.parse(gt_file.local_filename) | ||||
|     nsmap = {'alto': alto_namespace(tree)} | ||||
|     alto_page = tree.find('//alto:Page', namespaces=nsmap)  # one page assumed | ||||
|     gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT']) | ||||
| 
 | ||||
|     img_size = PIL.Image.open(img_file.local_filename).size | ||||
| 
 | ||||
|     if gt_size == img_size: | ||||
|         print('OK', page_id) | ||||
|     else: | ||||
|         print('ERR', page_id, gt_size, '!=', img_size) | ||||
|         exit_code = 1 | ||||
| 
 | ||||
| sys.exit(exit_code) | ||||
|  | @ -134,6 +134,8 @@ page_upgrade_to_2019() { | |||
| 
 | ||||
| 
 | ||||
| main() { | ||||
|   do_validate | ||||
| 
 | ||||
|   do_binarization | ||||
|   do_validate | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										5
									
								
								ppn2ocr
									
										
									
									
									
								
							
							
						
						
									
										5
									
								
								ppn2ocr
									
										
									
									
									
								
							|  | @ -74,9 +74,10 @@ def make_workspace(ppn, workspace): | |||
|     mets = oai_mets(ppn) | ||||
| 
 | ||||
|     # XXX | ||||
|     # Delete PRESENTATION file group | ||||
|     # (local file:/// links, not handled well by "ocrd workspace") | ||||
|     # Delete PRESENTATION + LOCAL file groups | ||||
|     # (local file:/// or file:/ links, not handled well by "ocrd workspace") | ||||
|     remove_file_grp(mets, 'PRESENTATION') | ||||
|     remove_file_grp(mets, 'LOCAL') | ||||
| 
 | ||||
| 
 | ||||
|     # Duplicate DEFAULT file group into a new file group BEST | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue