From 79701e410d1c343b7ac03b9de4567cb042723939 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Mon, 29 Apr 2024 08:42:14 +0200
Subject: [PATCH] Fix some typos (found by `codespell` and `typos`)

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 README.md                                    | 4 ++--
 src/dinglehopper/extracted_text.py           | 2 +-
 src/dinglehopper/notebooks/Levenshtein.ipynb | 4 ++--
 src/dinglehopper/word_error_rate.py          | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 035133c..76fcc5a 100644
--- a/README.md
+++ b/README.md
@@ -100,11 +100,11 @@ This generates `summary.html` and `summary.json` in the same `output_folder`.
 
 If you are summarizing many reports and have used the `--differences` flag while
 generating them, it may be useful to limit the number of differences reported by using
-the `--occurences-threshold` parameter. This will reduce the size of the generated HTML
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML
 report, making it easier to open and navigate. Note that the JSON report will still
 contain all differences. Example:
 ~~~
-dinglehopper-summarize output_folder/ --occurences-threshold 10
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
 ~~~
 
 ### dinglehopper-line-dirs
diff --git a/src/dinglehopper/extracted_text.py b/src/dinglehopper/extracted_text.py
index c7bcba7..6dcf0a7 100644
--- a/src/dinglehopper/extracted_text.py
+++ b/src/dinglehopper/extracted_text.py
@@ -329,7 +329,7 @@ def get_attr(te: Any, attr_name: str) -> float:
     """Extract the attribute for the given name.
 
     Note: currently only handles numeric values!
-    Other or non existend values are encoded as np.nan.
+    Other or non-existent values are encoded as np.nan.
     """
     attr_value = te.attrib.get(attr_name)
     try:
diff --git a/src/dinglehopper/notebooks/Levenshtein.ipynb b/src/dinglehopper/notebooks/Levenshtein.ipynb
index 876bee3..b9671d7 100644
--- a/src/dinglehopper/notebooks/Levenshtein.ipynb
+++ b/src/dinglehopper/notebooks/Levenshtein.ipynb
@@ -391,7 +391,7 @@
     "\\text{CER} = \\frac{i + s + d}{n}\n",
     "$$\n",
     "\n",
-    "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropiate as they *are* clear about this when computing the word error rate.)"
+    "where $i$ is the number of inserts, $s$ the number of substitutions, $d$ the number of deletions and $n$ is the number of characters in the reference text. (The text is not super clear about $n$ being the number of characters in the reference text, but it seems appropriate as they *are* clear about this when computing the word error rate.)"
    ]
   },
   {
@@ -680,7 +680,7 @@
       "        return cat in unwanted_categories or subcat in unwanted_subcategories\n",
       "\n",
       "    # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on word boundaries using\n",
-      "    # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctation \"or similar characters.\"\n",
+      "    # uniseg.wordbreak.words() and ignore all \"words\" that contain only whitespace, punctuation \"or similar characters.\"\n",
       "    for word in uniseg.wordbreak.words(s):\n",
       "        if all(unwanted(c) for c in word):\n",
       "            pass\n",
diff --git a/src/dinglehopper/word_error_rate.py b/src/dinglehopper/word_error_rate.py
index b759a69..578850f 100644
--- a/src/dinglehopper/word_error_rate.py
+++ b/src/dinglehopper/word_error_rate.py
@@ -54,7 +54,7 @@ def words(s: str) -> Generator[str, None, None]:
 
     # We follow Unicode Standard Annex #29 on Unicode Text Segmentation here: Split on
     # word boundaries using uniseg.wordbreak.words() and ignore all "words" that contain
-    # only whitespace, punctation "or similar characters."
+    # only whitespace, punctuation "or similar characters."
     for word in uniseg.wordbreak.words(s):
         if all(unwanted(c) for c in word):
             pass