mirror of
				https://github.com/qurator-spk/ocrd-galley.git
				synced 2025-10-31 03:04:12 +01:00 
			
		
		
		
	Merge branch 'master' of github.com:mikegerber/my_ocrd_workflow
This commit is contained in:
		
						commit
						4e37a52899
					
				
					 5 changed files with 12 additions and 56 deletions
				
			
		|  | @ -70,10 +70,13 @@ COPY data/textline_detection /var/lib/textline_detection | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Install requirements | # Install requirements | ||||||
|  | # Using pipdeptree here to get more info than from pip3 check | ||||||
| COPY requirements.txt /tmp/ | COPY requirements.txt /tmp/ | ||||||
| RUN pip3 install --no-cache-dir --upgrade pip && \ | RUN pip3 install --no-cache-dir --upgrade pip && \ | ||||||
|     pip3 install --no-cache-dir -r /tmp/requirements.txt && \ |     pip3 install --no-cache-dir -r /tmp/requirements.txt && \ | ||||||
|     pip3 check |     pip3 install --no-cache-dir pipdeptree && \ | ||||||
|  |     pipdeptree -w fail | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| COPY my_ocrd_workflow /usr/bin/ | COPY my_ocrd_workflow /usr/bin/ | ||||||
| COPY xsd/*            /usr/share/xml/ | COPY xsd/*            /usr/share/xml/ | ||||||
|  |  | ||||||
|  | @ -72,9 +72,6 @@ This produces a workspace directory `PPN77164308X` with the OCR results in it; | ||||||
| the results are viewable as explained above. | the results are viewable as explained above. | ||||||
| 
 | 
 | ||||||
| ppn2ocr requires a working Docker setup and properly set up environment | ppn2ocr requires a working Docker setup and properly set up environment | ||||||
| variables for the proxy configuration. At SBB, this means: | variables for the proxy configuration. At SBB, this following | ||||||
| ~~~ | `howto/docker-proxy.md` and `howto/proxy-settings-for-shell+python.md` | ||||||
| export HTTP_PROXY=http://http-proxy.sbb.spk-berlin.de:3128/ | (in qurator's mono-repo). | ||||||
| export HTTPS_PROXY=$HTTP_PROXY; export http_proxy=$HTTP_PROXY; export https_proxy=$HTTP_PROXY |  | ||||||
| export no_proxy=localhost,digital.staatsbibliothek-berlin.de,content.staatsbibliothek-berlin.de |  | ||||||
| ~~~ |  | ||||||
|  |  | ||||||
|  | @ -1,47 +0,0 @@ | ||||||
| """Check FULLTEXT ALTO page dimensions against BEST image dimensions""" |  | ||||||
| 
 |  | ||||||
| import PIL.Image |  | ||||||
| import sys |  | ||||||
| from ocrd.workspace import Workspace |  | ||||||
| from ocrd.resolver import Resolver |  | ||||||
| from lxml import etree as ET |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def alto_namespace(tree): |  | ||||||
|     """ |  | ||||||
|     Return the ALTO namespace used in the given ElementTree. |  | ||||||
| 
 |  | ||||||
|     This relies on the assumption that, in any given ALTO file, the root |  | ||||||
|     element has the local name "alto". We do not check if the files uses any |  | ||||||
|     valid ALTO namespace. |  | ||||||
|     """ |  | ||||||
|     root_name = ET.QName(tree.getroot().tag) |  | ||||||
|     if root_name.localname == 'alto': |  | ||||||
|         return root_name.namespace |  | ||||||
|     else: |  | ||||||
|         raise ValueError('Not an ALTO tree') |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| exit_code = 0 |  | ||||||
| workspace = Workspace(Resolver(), '.') |  | ||||||
| 
 |  | ||||||
| for n, page_id in enumerate(workspace.mets.physical_pages): |  | ||||||
|     gt_file = workspace.mets.find_files(fileGrp='FULLTEXT', pageId=page_id)[0] |  | ||||||
|     img_file = workspace.mets.find_files(fileGrp='BEST', pageId=page_id)[0] |  | ||||||
|     gt_file = workspace.download_file(gt_file) |  | ||||||
|     img_file = workspace.download_file(img_file) |  | ||||||
| 
 |  | ||||||
|     tree = ET.parse(gt_file.local_filename) |  | ||||||
|     nsmap = {'alto': alto_namespace(tree)} |  | ||||||
|     alto_page = tree.find('//alto:Page', namespaces=nsmap)  # one page assumed |  | ||||||
|     gt_size = int(alto_page.attrib['WIDTH']), int(alto_page.attrib['HEIGHT']) |  | ||||||
| 
 |  | ||||||
|     img_size = PIL.Image.open(img_file.local_filename).size |  | ||||||
| 
 |  | ||||||
|     if gt_size == img_size: |  | ||||||
|         print('OK', page_id) |  | ||||||
|     else: |  | ||||||
|         print('ERR', page_id, gt_size, '!=', img_size) |  | ||||||
|         exit_code = 1 |  | ||||||
| 
 |  | ||||||
| sys.exit(exit_code) |  | ||||||
|  | @ -134,6 +134,8 @@ page_upgrade_to_2019() { | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| main() { | main() { | ||||||
|  |   do_validate | ||||||
|  | 
 | ||||||
|   do_binarization |   do_binarization | ||||||
|   do_validate |   do_validate | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										5
									
								
								ppn2ocr
									
										
									
									
									
								
							
							
						
						
									
										5
									
								
								ppn2ocr
									
										
									
									
									
								
							|  | @ -74,9 +74,10 @@ def make_workspace(ppn, workspace): | ||||||
|     mets = oai_mets(ppn) |     mets = oai_mets(ppn) | ||||||
| 
 | 
 | ||||||
|     # XXX |     # XXX | ||||||
|     # Delete PRESENTATION file group |     # Delete PRESENTATION + LOCAL file groups | ||||||
|     # (local file:/// links, not handled well by "ocrd workspace") |     # (local file:/// or file:/ links, not handled well by "ocrd workspace") | ||||||
|     remove_file_grp(mets, 'PRESENTATION') |     remove_file_grp(mets, 'PRESENTATION') | ||||||
|  |     remove_file_grp(mets, 'LOCAL') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     # Duplicate DEFAULT file group into a new file group BEST |     # Duplicate DEFAULT file group into a new file group BEST | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue