diff --git a/qurator/modstool/__init__.py b/qurator/modstool/__init__.py
index eabaacd..e69de29 100644
--- a/qurator/modstool/__init__.py
+++ b/qurator/modstool/__init__.py
@@ -1 +0,0 @@
-from .modstool import *
diff --git a/qurator/modstool/alto4pandas.py b/qurator/modstool/alto4pandas.py
new file mode 100755
index 0000000..ceb498a
--- /dev/null
+++ b/qurator/modstool/alto4pandas.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+import csv
+import logging
+import os
+import re
+import warnings
+import sys
+from xml.dom.expatbuilder import Namespaces
+from lxml import etree as ET
+from itertools import groupby
+from operator import attrgetter
+from typing import List
+from collections.abc import MutableMapping, Sequence
+
+import click
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+
+from .lib import TagGroup, sorted_groupby, flatten, ns
+
+
+logger = logging.getLogger('alto4pandas')
+
+
+
+def alto_to_dict(alto, raise_errors=True):
+    """Convert ALTO metadata to a nested dictionary"""
+
+    value = {}
+
+    # Iterate through each group of tags
+    for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
+        group = list(group)
+
+        localname = ET.QName(tag).localname
+
+        if localname == 'Description':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
+        elif localname == 'MeasurementUnit':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'OCRProcessing':
+            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
+        elif localname == 'Processing':
+            # TODO This enumerated descent is used more than once, DRY!
+            for n, e in enumerate(group):
+                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
+        elif localname == 'ocrProcessingStep':
+            for n, e in enumerate(group):
+                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
+        elif localname == 'preProcessingStep':
+            for n, e in enumerate(group):
+                value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
+        elif localname == 'processingDateTime':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'processingSoftware':
+            value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
+        elif localname == 'processingAgency':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'processingStepDescription':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'processingStepSettings':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'softwareCreator':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'softwareName':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+        elif localname == 'softwareVersion':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+
+        elif localname == 'sourceImageInformation':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
+        elif localname == 'fileName':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
+
+        elif localname == 'Layout':
+            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
+        elif localname == 'Page':
+            alto_namespace = ET.QName(group[0]).namespace
+            namespaces={"alto": alto_namespace}
+
+            value[localname] = {}
+            value[localname].update(TagGroup(tag, group).is_singleton().attributes())
+            value[localname].update(TagGroup(tag, group).subelement_counts())
+            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
+
+        elif localname == 'Styles':
+            pass
+        elif localname == 'Tags':
+            pass
+        else:
+            if raise_errors:
+                print(value)
+                raise ValueError('Unknown tag "{}"'.format(tag))
+            else:
+                pass
+
+    return value
+
+
+
+def walk(m):
+    # XXX do this in modstool, too
+    if os.path.isdir(m):
+        tqdm.write(f'Scanning directory {m}')
+        for f in tqdm(os.scandir(m), leave=False):
+            if f.is_file() and not f.name.startswith('.'):
+                yield f.path
+            elif f.is_dir():
+                try:
+                    yield from walk(f.path)
+                except PermissionError:
+                    warnings.warn(f"Error walking {f.path}")
+    else:
+        yield m
+
+
+
+@click.command()
+@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
+@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
+              default='alto_info_df.pkl', show_default=True)
+@click.option('--output-csv', type=click.Path(), help='Output CSV file')
+@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
+def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
+    """
+    A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
+
+    INPUT is assumed to be an ALTO document. INPUT may optionally be a directory. The tool then reads
+    all files in the directory.
+
+    alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
+    """
+
+    # Extend file list if directories are given
+    alto_files_real = []
+    for m in alto_files:
+        for x in walk(m):
+            alto_files_real.append(x)
+
+    # Process ALTO files
+    with open(output_file + '.warnings.csv', 'w') as csvfile:
+        csvwriter = csv.writer(csvfile)
+        alto_info = []
+        logger.info('Processing ALTO files')
+        for alto_file in tqdm(alto_files_real, leave=False):
+            try:
+                root = ET.parse(alto_file).getroot()
+                alto = root  # XXX .find('alto:alto', ns) does not work here
+
+                with warnings.catch_warnings(record=True) as caught_warnings:
+                    warnings.simplefilter('always')  # do NOT filter double occurrences
+
+                    # ALTO
+                    d = flatten(alto_to_dict(alto, raise_errors=True))
+                    # "meta"
+                    d['alto_file'] = alto_file
+                    d['alto_xmlns'] = ET.QName(alto).namespace
+
+                    alto_info.append(d)
+
+                if caught_warnings:
+                    # PyCharm thinks caught_warnings is not Iterable:
+                    # noinspection PyTypeChecker
+                    for caught_warning in caught_warnings:
+                        csvwriter.writerow([alto_file, caught_warning.message])
+            except Exception as e:
+                logger.error('Exception in {}: {}'.format(alto_file, e))
+                import traceback; traceback.print_exc()
+
+    # Convert the alto_info List[Dict] to a pandas DataFrame
+    columns = []
+    for m in alto_info:
+        for c in m.keys():
+            if c not in columns:
+                columns.append(c)
+    data = [[m.get(c) for c in columns] for m in alto_info]
+    index = [m['alto_file'] for m in alto_info]  # TODO use ppn + page?
+    alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
+
+    # Pickle the DataFrame
+    logger.info('Writing DataFrame to {}'.format(output_file))
+    alto_info_df.to_pickle(output_file)
+    if output_csv:
+        logger.info('Writing CSV to {}'.format(output_csv))
+        alto_info_df.to_csv(output_csv)
+    if output_xlsx:
+        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
+        alto_info_df.to_excel(output_xlsx)
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    for prefix, uri in ns.items():
+        ET.register_namespace(prefix, uri)
+
+    process()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/qurator/modstool/lib.py b/qurator/modstool/lib.py
new file mode 100644
index 0000000..a6be479
--- /dev/null
+++ b/qurator/modstool/lib.py
@@ -0,0 +1,285 @@
+from itertools import groupby
+import re
+import warnings
+from typing import List, Sequence, MutableMapping
+
+import numpy as np
+from lxml import etree as ET
+
+
+__all__ = ["ns"]
+
+
+ns = {
+    'mets': 'http://www.loc.gov/METS/',
+    'mods': 'http://www.loc.gov/mods/v3',
+    "alto": "http://www.loc.gov/standards/alto/ns-v2"
+}
+
+
+
+class TagGroup:
+    """Helper class to simplify the parsing and checking of MODS metadata"""
+
+    def __init__(self, tag, group: List[ET.Element]):
+        self.tag = tag
+        self.group = group
+
+    def __str__(self):
+        return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
+
+    def is_singleton(self):
+        if len(self.group) != 1:
+            raise ValueError('More than one instance: {}'.format(self))
+        return self
+
+    def has_no_attributes(self):
+        return self.has_attributes({})
+
+    def has_attributes(self, attrib):
+        if not isinstance(attrib, Sequence):
+            attrib = [attrib]
+        if not all(e.attrib in attrib for e in self.group):
+            raise ValueError('One or more element has unexpected attributes: {}'.format(self))
+        return self
+
+    def ignore_attributes(self):
+        # This serves as documentation for now.
+        return self
+
+    def sort(self, key=None, reverse=False):
+        self.group = sorted(self.group, key=key, reverse=reverse)
+        return self
+
+    def text(self, separator='\n'):
+        t = ''
+        for e in self.group:
+            if t != '':
+                t += separator
+            if e.text:
+                t += e.text
+        return t
+
+    def text_set(self):
+        return {e.text for e in self.group}
+
+    def descend(self, raise_errors):
+        return _to_dict(self.is_singleton().group[0], raise_errors)
+
+    def filter(self, cond, warn=None):
+        new_group = []
+        for e in self.group:
+            if cond(e):
+                new_group.append(e)
+            else:
+                if warn:
+                    warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
+        return TagGroup(self.tag, new_group)
+
+    def force_singleton(self, warn=True):
+        if len(self.group) == 1:
+            return self
+        else:
+            if warn:
+                warnings.warn('Forced single instance of {}'.format(self.tag))
+            return TagGroup(self.tag, self.group[:1])
+
+    RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$'  # Note: Includes non-specific century dates like '18XX'
+    RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
+
+    def fix_date(self):
+
+        for e in self.group:
+            if e.attrib.get('encoding') == 'w3cdtf':
+                # This should be 'iso8601' according to MODS-AP 2.3.1
+                warnings.warn('Changed w3cdtf encoding to iso8601')
+                e.attrib['encoding'] = 'iso8601'
+
+        new_group = []
+        for e in self.group:
+            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
+                new_group.append(e)
+            elif re.match(self.RE_ISO8601_DATE, e.text):
+                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
+                e.attrib['encoding'] = 'iso8601'
+                new_group.append(e)
+            elif re.match(self.RE_GERMAN_DATE, e.text):
+                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
+                m = re.match(self.RE_GERMAN_DATE, e.text)
+                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
+                e.attrib['encoding'] = 'iso8601'
+                new_group.append(e)
+            else:
+                warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
+                new_group.append(e)
+        self.group = new_group
+
+        # Notes:
+        # - There are dates with the misspelled qualifier 'aproximate'
+        # - Rough periods are sometimes given either by:
+        #   - years like '19xx'
+        #   - or 'approximate' date ranges with point="start"/"end" attributes set
+        #     (this could be correct according to MODS-AP 2.3.1)
+        # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
+        # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"
+
+        return self
+
+    def fix_event_type(self):
+        # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
+        # Fix this for special cases.
+
+        for e in self.group:
+            if e.attrib.get('eventType') is None:
+                try:
+                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
+                            e.find('mods:edition', ns).text == '[Electronic ed.]':
+                        e.attrib['eventType'] = 'digitization'
+                        warnings.warn('Fixed eventType for electronic ed.')
+                        continue
+                except AttributeError:
+                    pass
+                try:
+                    if e.find('mods:dateIssued', ns) is not None:
+                        e.attrib['eventType'] = 'publication'
+                        warnings.warn('Fixed eventType for an issued origin')
+                        continue
+                except AttributeError:
+                    pass
+                try:
+                    if e.find('mods:dateCreated', ns) is not None:
+                        e.attrib['eventType'] = 'production'
+                        warnings.warn('Fixed eventType for a created origin')
+                        continue
+                except AttributeError:
+                    pass
+        return self
+
+    def fix_script_term(self):
+        for e in self.group:
+            # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
+            if e.attrib['authority'] == 'ISO15924':
+                e.attrib['authority'] = 'iso15924'
+                warnings.warn('Changed scriptTerm authority to lower case')
+        return self
+
+    def merge_sub_tags_to_set(self):
+        from .modstool import mods_to_dict
+        value = {}
+
+        sub_dicts = [mods_to_dict(e) for e in self.group]
+        sub_tags = {k for d in sub_dicts for k in d.keys()}
+        for sub_tag in sub_tags:
+            s = set()
+            for d in sub_dicts:
+                v = d.get(sub_tag)
+                if v:
+                    # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
+                    # German language document.
+                    if isinstance(v, set):
+                        s.update(v)
+                    else:
+                        s.add(v)
+            value[sub_tag] = s
+        return value
+
+    def attributes(self):
+        """
+        Return a merged dict of all attributes of the tag group.
+
+        Probably most useful if used on a singleton, for example:
+
+            value['Page'] = TagGroup(tag, group).is_singleton().attributes()
+        """
+        attrib = {}
+        for e in self.group:
+            for a, v in e.attrib.items():
+                a_localname = ET.QName(a).localname
+                attrib[a_localname] = v
+        return attrib
+
+    def subelement_counts(self):
+        counts = {}
+        for e in self.group:
+            for x in e.iter():
+                tag = ET.QName(x.tag).localname
+                key = f"{tag}-count"
+                counts[key] = counts.get(key, 0) + 1
+        return counts
+
+    def xpath_statistics(self, xpath_expr, namespaces):
+        """
+        Extract values and calculate statistics
+
+        Extract values using the given XPath expression, convert them to float and return descriptive
+        statistics on the values.
+        """
+        values = []
+        for e in self.group:
+            r = e.xpath(xpath_expr, namespaces=namespaces)
+            values += r
+        values = np.array([float(v) for v in values])
+
+        statistics = {}
+        if values.size > 0:
+            statistics[f'{xpath_expr}-mean'] = np.mean(values)
+            statistics[f'{xpath_expr}-median'] = np.median(values)
+            statistics[f'{xpath_expr}-std'] = np.std(values)
+            statistics[f'{xpath_expr}-min'] = np.min(values)
+            statistics[f'{xpath_expr}-max'] = np.max(values)
+        return statistics
+
+
+
+
+def sorted_groupby(iterable, key=None):
+    """
+    Sort iterable by key and then group by the same key.
+
+    itertools.groupby() assumes that the iterable is already sorted. This function
+    conveniently sorts the iterable first, and then groups its elements.
+    """
+    return groupby(sorted(iterable, key=key), key=key)
+
+
+def _to_dict(root, raise_errors):
+    from .modstool import mods_to_dict, mets_to_dict
+    from .alto4pandas import alto_to_dict
+
+    root_name = ET.QName(root.tag)
+    if root_name.namespace == "http://www.loc.gov/mods/v3":
+        return mods_to_dict(root, raise_errors)
+    elif root_name.namespace == "http://www.loc.gov/METS/":
+        return mets_to_dict(root, raise_errors)
+    elif root_name.namespace in [
+        "http://schema.ccs-gmbh.com/ALTO",
+        "http://www.loc.gov/standards/alto/",
+        "http://www.loc.gov/standards/alto/ns-v2#",
+        "http://www.loc.gov/standards/alto/ns-v4#",
+    ]:
+        return alto_to_dict(root, raise_errors)
+    else:
+        raise ValueError(f"Unknown namespace {root_name.namespace}")
+
+
+def flatten(d: MutableMapping, parent='', separator='_'):
+    """
+    Flatten the given nested dict.
+
+    It is assumed that d maps strings to either another dictionary (similarly structured) or some other value.
+ """ + items = [] + + for k, v in d.items(): + if parent: + new_key = parent + separator + k + else: + new_key = k + + if isinstance(v, MutableMapping): + items.extend(flatten(v, new_key, separator=separator).items()) + else: + items.append((new_key, v)) + + return dict(items) + diff --git a/qurator/modstool/modstool.py b/qurator/modstool/modstool.py index 4b035ac..a6fe164 100755 --- a/qurator/modstool/modstool.py +++ b/qurator/modstool/modstool.py @@ -14,196 +14,11 @@ import click import pandas as pd from tqdm import tqdm +from .lib import sorted_groupby, TagGroup, ns, flatten -ns = { - 'mets': 'http://www.loc.gov/METS/', - 'mods': 'http://www.loc.gov/mods/v3' -} -logger = logging.getLogger('modstool') - -class TagGroup: - """Helper class to simplify the parsing and checking of MODS metadata""" - - def __init__(self, tag, group: List[ET.Element]): - self.tag = tag - self.group = group - - def __str__(self): - return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group) - - def is_singleton(self): - if len(self.group) != 1: - raise ValueError('More than one instance: {}'.format(self)) - return self - - def has_no_attributes(self): - return self.has_attributes({}) - - def has_attributes(self, attrib): - if not isinstance(attrib, Sequence): - attrib = [attrib] - if not all(e.attrib in attrib for e in self.group): - raise ValueError('One or more element has unexpected attributes: {}'.format(self)) - return self - - def ignore_attributes(self): - # This serves as documentation for now. - return self - - def sort(self, key=None, reverse=False): - self.group = sorted(self.group, key=key, reverse=reverse) - return self - - def text(self, separator='\n'): - t = '' - for e in self.group: - if t != '': - t += separator - t += e.text - return t - - def text_set(self): - return {e.text for e in self.group} - - def descend(self, raise_errors): - return _to_dict(self.is_singleton().group[0], raise_errors) - - def filter(self, cond, warn=None): - new_group = [] - for e in self.group: - if cond(e): - new_group.append(e) - else: - if warn: - warnings.warn('Filtered {} element ({})'.format(self.tag, warn)) - return TagGroup(self.tag, new_group) - def force_singleton(self, warn=True): - if len(self.group) == 1: - return self - else: - if warn: - warnings.warn('Forced single instance of {}'.format(self.tag)) - return TagGroup(self.tag, self.group[:1]) - - RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX' - RE_GERMAN_DATE = r'^(?P
-
-    def fix_date(self):
-
-        for e in self.group:
-            if e.attrib.get('encoding') == 'w3cdtf':
-                # This should be 'iso8601' according to MODS-AP 2.3.1
-                warnings.warn('Changed w3cdtf encoding to iso8601')
-                e.attrib['encoding'] = 'iso8601'
-
-        new_group = []
-        for e in self.group:
-            if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
-                new_group.append(e)
-            elif re.match(self.RE_ISO8601_DATE, e.text):
-                warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
-                e.attrib['encoding'] = 'iso8601'
-                new_group.append(e)
-            elif re.match(self.RE_GERMAN_DATE, e.text):
-                warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
-                m = re.match(self.RE_GERMAN_DATE, e.text)
-                e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
-                e.attrib['encoding'] = 'iso8601'
-                new_group.append(e)
-            else:
-                warnings.warn('Not a iso8601 date: "{}"'.format(e.text))
-                new_group.append(e)
-        self.group = new_group
-
-        # Notes:
-        # - There are dates with the misspelled qualifier 'aproximate'
-        # - Rough periods are sometimes given either by:
-        #   - years like '19xx'
-        #   - or 'approximate' date ranges with point="start"/"end" attributes set
-        #     (this could be correct according to MODS-AP 2.3.1)
-        # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
-        # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"
-
-        return self
-
-    def fix_event_type(self):
-        # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
-        # Fix this for special cases.
-
-        for e in self.group:
-            if e.attrib.get('eventType') is None:
-                try:
-                    if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
-                            e.find('mods:edition', ns).text == '[Electronic ed.]':
-                        e.attrib['eventType'] = 'digitization'
-                        warnings.warn('Fixed eventType for electronic ed.')
-                        continue
-                except AttributeError:
-                    pass
-                try:
-                    if e.find('mods:dateIssued', ns) is not None:
-                        e.attrib['eventType'] = 'publication'
-                        warnings.warn('Fixed eventType for an issued origin')
-                        continue
-                except AttributeError:
-                    pass
-                try:
-                    if e.find('mods:dateCreated', ns) is not None:
-                        e.attrib['eventType'] = 'production'
-                        warnings.warn('Fixed eventType for a created origin')
-                        continue
-                except AttributeError:
-                    pass
-        return self
-
-    def fix_script_term(self):
-        for e in self.group:
-            # MODS-AP 2.3.1 is not clear about this, but it looks like that this should be lower case.
-            if e.attrib['authority'] == 'ISO15924':
-                e.attrib['authority'] = 'iso15924'
-                warnings.warn('Changed scriptTerm authority to lower case')
-        return self
-
-    def merge_sub_tags_to_set(self):
-        value = {}
-
-        sub_dicts = [mods_to_dict(e) for e in self.group]
-        sub_tags = {k for d in sub_dicts for k in d.keys()}
-        for sub_tag in sub_tags:
-            s = set()
-            for d in sub_dicts:
-                v = d.get(sub_tag)
-                if v:
-                    # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
-                    # German language document.
-                    if isinstance(v, set):
-                        s.update(v)
-                    else:
-                        s.add(v)
-            value[sub_tag] = s
-        return value
-
-
-def sorted_groupby(iterable, key=None):
-    """
-    Sort iterable by key and then group by the same key.
-
-    itertools.groupby() assumes that the iterable is already sorted. This function
-    conveniently sorts the iterable first, and then groups its elements.
- """ - return groupby(sorted(iterable, key=key), key=key) - -def _to_dict(root, raise_errors): - - root_name = ET.QName(root.tag) - if root_name.namespace == "http://www.loc.gov/mods/v3": - return mods_to_dict(root, raise_errors) - elif root_name.namespace == "http://www.loc.gov/METS/": - return mets_to_dict(root, raise_errors) - else: - raise ValueError(f"Unknown namespace {root_name.namespace}") +logger = logging.getLogger('modstool') def mods_to_dict(mods, raise_errors=True): """Convert MODS metadata to a nested dictionary""" @@ -427,28 +242,6 @@ def mets_to_dict(mets, raise_errors=True): return value -def flatten(d: MutableMapping, parent='', separator='_'): - """ - Flatten the given nested dict. - - It is assumed that d maps strings to either another dictionary (similarly structured) or some other value. - """ - items = [] - - for k, v in d.items(): - if parent: - new_key = parent + separator + k - else: - new_key = k - - if isinstance(v, MutableMapping): - items.extend(flatten(v, new_key, separator=separator).items()) - else: - items.append((new_key, v)) - - return dict(items) - - @click.command() @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1) @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file', diff --git a/qurator/modstool/tests/test_alto.py b/qurator/modstool/tests/test_alto.py new file mode 100644 index 0000000..154c848 --- /dev/null +++ b/qurator/modstool/tests/test_alto.py @@ -0,0 +1,39 @@ +import xml.etree.ElementTree as ET + + +from qurator.modstool.alto4pandas import alto_to_dict +from qurator.modstool.lib import flatten + + +def dict_fromstring(x): + return flatten(alto_to_dict(ET.fromstring(x))) + +def test_Page_counts(): + """ + Elements below Layout/Page should be counted + """ + d = dict_fromstring(""" + + + + + + + + + + + + + + + + + + + + + """) + assert d['Layout_Page_TextBlock-count'] == 1 + assert d['Layout_Page_TextLine-count'] == 3 + assert d['Layout_Page_String-count'] == 6 diff --git a/qurator/modstool/tests/test_mets.py b/qurator/modstool/tests/test_mets.py index 6ca22fc..315c7b6 100644 --- a/qurator/modstool/tests/test_mets.py +++ b/qurator/modstool/tests/test_mets.py @@ -1,8 +1,8 @@ -import pytest import xml.etree.ElementTree as ET -from .. import mets_to_dict, flatten +from qurator.modstool.modstool import mets_to_dict +from qurator.modstool.lib import flatten def dict_fromstring(x): diff --git a/qurator/modstool/tests/test_modstool.py b/qurator/modstool/tests/test_modstool.py index b13586a..c31e3e8 100644 --- a/qurator/modstool/tests/test_modstool.py +++ b/qurator/modstool/tests/test_modstool.py @@ -1,8 +1,10 @@ +from tkinter import W import pytest import xml.etree.ElementTree as ET -from .. 
+from qurator.modstool.modstool import mods_to_dict
+from qurator.modstool.lib import flatten
 
 
 def dict_fromstring(x):
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..e079f8a
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1 @@
+pytest
diff --git a/setup.py b/setup.py
index d03bc94..a35c92d 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,8 @@ from setuptools import find_packages, setup
 
 with open('requirements.txt') as fp:
     install_requires = fp.read()
+with open('requirements-test.txt') as fp:
+    tests_requires = fp.read()
 
 setup(
     name='modstool',
@@ -19,8 +21,9 @@
     entry_points={
         'console_scripts': [
             'modstool=qurator.modstool.modstool:main',
+            'alto4pandas=qurator.modstool.alto4pandas:main',
        ]
     },
    python_requires='>=3.0.0',
-    tests_require=['pytest'],
+    tests_require=tests_requires,
 )
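
Usage sketch (not part of the diff): the snippet below shows how the new alto_to_dict() and flatten() helpers combine on a toy ALTO fragment. It assumes the package from this branch is installed (e.g. pip install -e .) together with lxml and numpy from requirements.txt; the element content and WC values in the sample XML are made up for illustration.

# Hedged sketch, not from the diff: exercises alto_to_dict() + flatten() on a toy ALTO fragment.
# Assumes `pip install -e .` of this branch; lxml and numpy are runtime dependencies.
from lxml import etree as ET

from qurator.modstool.alto4pandas import alto_to_dict
from qurator.modstool.lib import flatten

ALTO = b"""
<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
  <Layout>
    <Page ID="P1">
      <TextBlock>
        <TextLine>
          <String CONTENT="Hello" WC="0.95"/>
          <String CONTENT="world" WC="0.87"/>
        </TextLine>
      </TextBlock>
    </Page>
  </Layout>
</alto>
"""

root = ET.fromstring(ALTO)
# Nested dict -> flat dict; keys join the element path with '_',
# e.g. 'Layout_Page_String-count' == 2, plus descriptive statistics
# such as 'Layout_Page_//alto:String/@WC-mean' from xpath_statistics().
d = flatten(alto_to_dict(root, raise_errors=True))
for key, value in sorted(d.items()):
    print(key, value)

On the command line, the same conversion is exposed through the alto4pandas entry point added in setup.py, e.g. alto4pandas -o alto_info_df.pkl --output-csv alto_info.csv /path/to/alto/ (the paths here are placeholders).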