mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 03:14:14 +01:00 
			
		
		
		
	🚧 ALTO: Move xpath_statistics to TagGroup class
This commit is contained in:
		
							parent
							
								
									9246519162
								
							
						
					
					
						commit
						aa4e8e290d
					
				
					 2 changed files with 23 additions and 18 deletions
				
			
		| 
						 | 
					@ -76,27 +76,13 @@ def alto_to_dict(alto, raise_errors=True):
 | 
				
			||||||
        elif localname == 'Layout':
 | 
					        elif localname == 'Layout':
 | 
				
			||||||
            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
					            value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
 | 
				
			||||||
        elif localname == 'Page':
 | 
					        elif localname == 'Page':
 | 
				
			||||||
            value['Page'] = {}
 | 
					 | 
				
			||||||
            value['Page'].update(TagGroup(tag, group).is_singleton().attributes())
 | 
					 | 
				
			||||||
            value['Page'].update(TagGroup(tag, group).subelement_counts())
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            xpath_expr = "//alto:String/@WC"
 | 
					 | 
				
			||||||
            alto_namespace = ET.QName(group[0]).namespace
 | 
					            alto_namespace = ET.QName(group[0]).namespace
 | 
				
			||||||
            namespaces={"alto": alto_namespace}
 | 
					            namespaces={"alto": alto_namespace}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            def xpath_statistics(xpath_expr, namespaces):
 | 
					            value[localname] = {}
 | 
				
			||||||
                values = []
 | 
					            value[localname].update(TagGroup(tag, group).is_singleton().attributes())
 | 
				
			||||||
                for e in group:
 | 
					            value[localname].update(TagGroup(tag, group).subelement_counts())
 | 
				
			||||||
                    r = e.xpath(xpath_expr, namespaces=namespaces)
 | 
					            value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
 | 
				
			||||||
                    values += r
 | 
					 | 
				
			||||||
                values = np.array([float(v) for v in values])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                statistics = {}
 | 
					 | 
				
			||||||
                statistics[f'{xpath_expr}-mean'] = np.mean(values)
 | 
					 | 
				
			||||||
                return statistics
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            value['Page'].update(xpath_statistics(xpath_expr, namespaces))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        elif localname == 'Styles':
 | 
					        elif localname == 'Styles':
 | 
				
			||||||
            pass
 | 
					            pass
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,6 +3,7 @@ import re
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
from typing import List, Sequence, MutableMapping
 | 
					from typing import List, Sequence, MutableMapping
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
from lxml import etree as ET
 | 
					from lxml import etree as ET
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -206,6 +207,24 @@ class TagGroup:
 | 
				
			||||||
                counts[key] = counts.get(key, 0) + 1
 | 
					                counts[key] = counts.get(key, 0) + 1
 | 
				
			||||||
        return counts
 | 
					        return counts
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def xpath_statistics(self, xpath_expr, namespaces):
					        """
					        Extract values and calculate statistics

					        Extract values using the given XPath expression from every element in the group, convert
					        them to float and return descriptive statistics on the values.

					        :param xpath_expr: XPath expression evaluated via ``xpath()`` on each element of ``self.group``
					        :param namespaces: namespace mapping passed through to each ``xpath()`` call
					        :return: dict with the single key ``f'{xpath_expr}-mean'`` holding the mean of all extracted
					            values; the mean is NaN if the expression did not match anything
					        :raises ValueError: if an extracted value cannot be converted to float
					        """
					        values = []
					        for e in self.group:
					            values.extend(e.xpath(xpath_expr, namespaces=namespaces))
					        values = np.array([float(v) for v in values])

					        statistics = {}
					        # np.mean() of an empty array also yields NaN, but emits a RuntimeWarning ("Mean of empty
					        # slice"). Short-circuit so that groups without any matching value stay quiet while the
					        # result key is still present.
					        statistics[f'{xpath_expr}-mean'] = np.mean(values) if values.size else np.nan
					        return statistics
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def sorted_groupby(iterable, key=None):
 | 
					def sorted_groupby(iterable, key=None):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue