mirror of
https://github.com/qurator-spk/dinglehopper.git
synced 2025-06-17 07:39:59 +02:00
Fix some typos (found by `codespell` and `typos`)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
2383730a55
commit
79701e410d
4 changed files with 6 additions and 6 deletions
|
@@ -100,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
|
||||||
|
|
||||||
If you are summarizing many reports and have used the `--differences` flag while
|
If you are summarizing many reports and have used the `--differences` flag while
|
||||||
generating them, it may be useful to limit the number of differences reported by using
|
generating them, it may be useful to limit the number of differences reported by using
|
||||||
the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
|
the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
|
||||||
report, making it easier to open and navigate. Note that the JSON report will still
|
report, making it easier to open and navigate. Note that the JSON report will still
|
||||||
contain all differences. Example:
|
contain all differences. Example:
|
||||||
~~~
|
~~~
|
||||||
dinglehopper-summarize output_folder/ --occurences-threshold 10
|
dinglehopper-summarize output_folder/ --occurrences-threshold 10
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
### dinglehopper-line-dirs
|
### dinglehopper-line-dirs
|
||||||
|
|
|
@@ -329,7 +329,7 @@ def get_attr(te: Any, attr_name: str) -> float:
|
||||||
"""Extract the attribute for the given name.
|
"""Extract the attribute for the given name.
|
||||||
|
|
||||||
Note: currently only handles numeric values!
|
Note: currently only handles numeric values!
|
||||||
Other or non existend values are encoded as np.nan.
|
Other or non existent values are encoded as np.nan.
|
||||||
"""
|
"""
|
||||||
attr_value = te.attrib.get(attr_name)
|
attr_value = te.attrib.get(attr_name)
|
||||||
try:
|
try:
|
||||||
|
|
|
@@ -391,7 +391,7 @@
|
||||||
"\\text{CER} = \\frac{i + s + d}{n}\n",
|
"\\text{CER} = \\frac{i + s + d}{n}\n",
|
||||||
"$$\n",
|
"$$\n",
|
||||||
"\n",
|
"\n",
|
||||||
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)"
|
"where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@@ -680,7 +680,7 @@
|
||||||
" return cat in unwanted_categories or subcat in unwanted_subcategories\n",
|
" return cat in unwanted_categories or subcat in unwanted_subcategories\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
|
" # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
|
||||||
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
|
" # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n",
|
||||||
" for word in uniseg.wordbreak.words(s):\n",
|
" for word in uniseg.wordbreak.words(s):\n",
|
||||||
" if all(unwanted(c) for c in word):\n",
|
" if all(unwanted(c) for c in word):\n",
|
||||||
" pass\n",
|
" pass\n",
|
||||||
|
|
|
@@ -54,7 +54,7 @@ def words(s: str) -> Generator[str, None, None]:
|
||||||
|
|
||||||
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
|
# We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
|
||||||
# word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
|
# word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
|
||||||
# only whitespace, punctation "or similar characters."
|
# only whitespace, punctuation "or similar characters."
|
||||||
for word in uniseg.wordbreak.words(s):
|
for word in uniseg.wordbreak.words(s):
|
||||||
if all(unwanted(c) for c in word):
|
if all(unwanted(c) for c in word):
|
||||||
pass
|
pass
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue