diff --git a/check_dtypes.py b/check_dtypes.py index 502e4bb..946c5fe 100644 --- a/check_dtypes.py +++ b/check_dtypes.py @@ -47,8 +47,7 @@ EXPECTED_TYPES = { r"Layout_Page_//alto:String/@WC-.*": ("Float64", None), r"alto_xmlns": ("object", ["str", "NoneType"]), - # XXX r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), - r"Layout_Page_(WIDTH|HEIGHT)": ("object", ["str", "NoneType"]), + r"Layout_Page_(WIDTH|HEIGHT)": ("Int64", None), } def expected_types(c): for r, types in EXPECTED_TYPES.items(): diff --git a/src/mods4pandas/alto4pandas.py b/src/mods4pandas/alto4pandas.py index 668d7f3..1d7b748 100755 --- a/src/mods4pandas/alto4pandas.py +++ b/src/mods4pandas/alto4pandas.py @@ -89,6 +89,12 @@ def alto_to_dict(alto, raise_errors=True): elif localname == 'Page': value[localname] = {} value[localname].update(TagGroup(tag, group).is_singleton().attributes()) + for attr in ("WIDTH", "HEIGHT"): + if attr in value[localname]: + try: + value[localname][attr] = int(value[localname][attr]) + except ValueError: + del value[localname][attr] value[localname].update(TagGroup(tag, group).subelement_counts()) value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))