From f197f01d3f7f0366f43791e5eb47ce3103f738cd Mon Sep 17 00:00:00 2001
From: "Gerber, Mike"
Date: Wed, 15 Sep 2021 17:26:14 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20ppn2ocr:=20Keep=20only=20wanted=20f?=
 =?UTF-8?q?ile=20groups?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ppn2ocr | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/ppn2ocr b/ppn2ocr
index a152acc..cbd0030 100755
--- a/ppn2ocr
+++ b/ppn2ocr
@@ -79,11 +79,22 @@ def mime_type_for_format(format_):
 
 
 def prune_file_grps(mets):
-    # XXX
-    # Delete PRESENTATION + LOCAL file groups
-    # (local file:/// or file:/ links, not handled well by "ocrd workspace")
-    remove_file_grp(mets, 'PRESENTATION')
-    remove_file_grp(mets, 'LOCAL')
+    """
+    Prune unwanted file groups
+
+    We only want to keep the MAX file group (we created it ourselves) and
+    possibly ABBYY full texts in FULLTEXT.
+
+    For the PRESENTATION + LOCAL file groups we definitely want to delete
+    because they contain local file:/// or file:/ links, which are not handled
+    well by "ocrd workspace". They are not explicitly mentioned, as we
+    only keep a whitelist.
+    """
+    wanted_file_grps = ["MAX", "FULLTEXT"]
+
+    for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS):
+        if u not in wanted_file_grps:
+            remove_file_grp(mets, u)
 
 
 def make_workspace(ppn, workspace):