From 16d9fcd087132b510bb6a83d34f6df704fab3ed4 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 25 Oct 2018 21:14:52 +0700
Subject: [PATCH 01/16] Update Peter Norvig's spell checker to be able to
 suggest words based on probability (as suggested in issue #90) - use word
 frequencies from Thai National Corpus

---
 pythainlp/corpus/tnc.py |  2 +-
 pythainlp/corpus/ttc.py |  2 +-
 pythainlp/spell/pn.py   | 28 +++++++++++++++-------------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 5b9a15438..3d035914b 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -57,6 +57,6 @@ def get_word_frequency_all():
     listword = []
     for line in lines:
         listindata = line.split("	")
-        listword.append((listindata[0], listindata[1]))
+        listword.append((listindata[0], int(listindata[1])))
 
     return listword
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index 015b8e2ae..34c4f6c3a 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -34,6 +34,6 @@ def get_word_frequency_all():
     listword = []
     for line in lines:
         listindata = line.split("	")
-        listword.append((listindata[0], listindata[1]))
+        listword.append((listindata[0], int(listindata[1])))
 
     return listword
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index fe3ff225e..7281cf836 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -1,30 +1,32 @@
 # -*- coding: utf-8 -*-
 """
-Spell checker
+Spell checker, using Peter Norvig algorithm + word frequency from Thai National Corpus
 
-Based on Peter Norvig's Python code at http://norvig.com/spell-correct.html
+Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
 """
 from collections import Counter
-from pythainlp.corpus.thaiword import get_data
 
-WORDS = Counter(get_data())
+from pythainlp.corpus import tnc
 
+WORDS = Counter(dict(tnc.get_word_frequency_all()))
+WORDS_TOTAL = sum(WORDS.values())
 
-def prob(word, n=sum(WORDS.values())):
+
+def _prob(word, n=WORDS_TOTAL):
     "Probability of `word`."
     return WORDS[word] / n
 
 
-def correction(word):
+def _correction(word):
     "แสดงคำที่เป็นไปได้มากที่สุด"
-    return max(spell(word), key=prob)
+    return max(spell(word), key=_prob)
 
 
-def known(words):
+def _known(words):
     return list(w for w in words if w in WORDS)
 
 
-def edits1(word):
+def _edits1(word):
     letters = [
         "ก",
         "ข",
@@ -111,12 +113,12 @@ def edits1(word):
     return set(deletes + transposes + replaces + inserts)
 
 
-def edits2(word):
-    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
+def _edits2(word):
+    return (e2 for e1 in _edits1(word) for e2 in _edits1(e1))
 
 
 def spell(word):
     if not word:
         return ""
-    else:
-        return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]
+
+    return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]

From 5c957c59d30562857b0a5a140290b63cdeaccef2 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 25 Oct 2018 21:23:39 +0700
Subject: [PATCH 02/16] remove import future

---
 pythainlp/corpus/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
index 62b71194f..4968f5eb6 100644
--- a/pythainlp/corpus/__init__.py
+++ b/pythainlp/corpus/__init__.py
@@ -3,7 +3,7 @@
 import os
 
 import requests
-from future.moves.urllib.request import urlopen
+from urllib.request import urlopen
 from pythainlp.tools import get_path_data, get_path_db
 from tinydb import Query, TinyDB
 from tqdm import tqdm
@@ -12,7 +12,7 @@
     "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
 )
 
-# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
+# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
 path_db_ = get_path_db()
 
 

From 6f40f7e524be93018ea2f5a9324557407ab84f6e Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 25 Oct 2018 21:28:15 +0700
Subject: [PATCH 03/16] minor sort of imports

---
 pythainlp/corpus/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
index 4968f5eb6..8310894e7 100644
--- a/pythainlp/corpus/__init__.py
+++ b/pythainlp/corpus/__init__.py
@@ -3,10 +3,10 @@
 import os
 
 import requests
-from urllib.request import urlopen
 from pythainlp.tools import get_path_data, get_path_db
 from tinydb import Query, TinyDB
 from tqdm import tqdm
+from urllib.request import urlopen
 
 CORPUS_DB_URL = (
     "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"

From ae6e251e5d98262f873de5bef0d22ae7eda3159a Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 25 Oct 2018 23:27:09 +0700
Subject: [PATCH 04/16] More docstring for Peter Norvig's spell checker

---
 .gitignore            |  1 +
 pythainlp/spell/pn.py | 31 ++++++++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 18dff633d..d6f0b0a65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+Untitled*.ipynb
 
 # IDE files
 .idea
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 7281cf836..f59b53365 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -8,22 +8,17 @@
 
 from pythainlp.corpus import tnc
 
-WORDS = Counter(dict(tnc.get_word_frequency_all()))
-WORDS_TOTAL = sum(WORDS.values())
+_WORDS = Counter(dict(tnc.get_word_frequency_all()))
+_WORDS_TOTAL = sum(_WORDS.values())
 
 
-def _prob(word, n=WORDS_TOTAL):
+def _prob(word, n=_WORDS_TOTAL):
     "Probability of `word`."
-    return WORDS[word] / n
-
-
-def _correction(word):
-    "แสดงคำที่เป็นไปได้มากที่สุด"
-    return max(spell(word), key=_prob)
+    return _WORDS[word] / n
 
 
 def _known(words):
-    return list(w for w in words if w in WORDS)
+    return list(w for w in words if w in _WORDS)
 
 
 def _edits1(word):
@@ -110,6 +105,7 @@ def _edits1(word):
     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
     replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
     inserts = [L + c + R for L, R in splits for c in letters]
+
     return set(deletes + transposes + replaces + inserts)
 
 
@@ -118,7 +114,20 @@ def _edits2(word):
 
 
 def spell(word):
+    """
+    Return set of possible words, according to edit distance
+    """
     if not word:
         return ""
 
-    return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
+    return set(
+        _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
+    )
+
+
+def correction(word):
+    """
+    Return the most possible word, according to probability from the corpus
+    แสดงคำที่เป็นไปได้มากที่สุด
+    """
+    return max(spell(word), key=_prob)

From 2433a69c201ebe102470a56e643ac44767187c93 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 25 Oct 2018 23:43:24 +0700
Subject: [PATCH 05/16] should return list not set

---
 pythainlp/spell/pn.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index f59b53365..6fd172074 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -115,14 +115,12 @@ def _edits2(word):
 
 def spell(word):
     """
-    Return set of possible words, according to edit distance
+    Return a list of possible words, according to edit distance
     """
     if not word:
         return ""
 
-    return set(
-        _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
-    )
+    return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
 
 
 def correction(word):

From 5721e75a599da008af80a06a951f54ffa39fbbfc Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Fri, 26 Oct 2018 02:42:21 +0700
Subject: [PATCH 06/16] Filter out non-Thai words and low frequency words from
 word frequency list for spell checker

---
 pythainlp/ner/__init__.py  | 21 ++++-----------------
 pythainlp/spell/pn.py      | 21 ++++++++++++++++++++-
 pythainlp/util/__init__.py | 22 ++++++++++++++++++----
 3 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py
index 12089b927..9992d2547 100644
--- a/pythainlp/ner/__init__.py
+++ b/pythainlp/ner/__init__.py
@@ -5,6 +5,7 @@
 from pythainlp.corpus import download, get_file, stopwords
 from pythainlp.tag import pos_tag
 from pythainlp.tokenize import word_tokenize
+from pythainlp.util import is_thaiword
 
 try:
     import sklearn_crfsuite
@@ -22,20 +23,6 @@
 _STOPWORDS = stopwords.words("thai")
 
 
-def _is_thaichar(ch):  # เป็นอักษรไทยหรือไม่
-    ch_val = ord(ch)
-    if ch_val >= 3584 and ch_val <= 3711:
-        return True
-    return False
-
-
-def _is_thaiword(word):  # เป็นคำที่มีแต่อักษรไทยหรือไม่
-    for ch in word:
-        if ch != "." and not _is_thaichar(ch):
-            return False
-    return True
-
-
 def _is_stopword(word):  # เช็คว่าเป็นคำฟุ่มเฟือย
     return word in _STOPWORDS
 
@@ -47,7 +34,7 @@ def _doc2features(doc, i):
     features = {
         "word.word": word,
         "word.stopword": _is_stopword(word),
-        "word.isthai": _is_thaiword(word),
+        "word.isthai": is_thaiword(word),
         "word.isspace": word.isspace(),
         "postag": postag,
         "word.isdigit()": word.isdigit(),
@@ -61,7 +48,7 @@ def _doc2features(doc, i):
         postag1 = doc[i - 1][1]
         features["word.prevword"] = prevword
         features["word.previsspace"] = prevword.isspace()
-        features["word.previsthai"] = _is_thaiword(prevword)
+        features["word.previsthai"] = is_thaiword(prevword)
         features["word.prevstopword"] = _is_stopword(prevword)
         features["word.prepostag"] = postag1
         features["word.prevwordisdigit"] = prevword.isdigit()
@@ -75,7 +62,7 @@ def _doc2features(doc, i):
         features["word.nextword"] = nextword
         features["word.nextisspace"] = nextword.isspace()
         features["word.nextpostag"] = postag1
-        features["word.nextisthai"] = _is_thaiword(nextword)
+        features["word.nextisthai"] = is_thaiword(nextword)
         features["word.nextstopword"] = _is_stopword(nextword)
         features["word.nextwordisdigit"] = nextword.isdigit()
     else:
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 6fd172074..1820617af 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -7,8 +7,27 @@
 from collections import Counter
 
 from pythainlp.corpus import tnc
+from pythainlp.util import is_thaichar
 
-_WORDS = Counter(dict(tnc.get_word_frequency_all()))
+
+def _keep(word):
+    for ch in word:
+        if ch != "." and not is_thaichar(ch):
+            return False
+        if ch in "๐๑๒๓๔๕๖๗๘๙":
+            return False
+    return True
+
+
+# get word frequency from TNC then filter out non-Thai words and low frequency words
+word_freqs = tnc.get_word_frequency_all()
+word_freqs = [
+    word_freq
+    for word_freq in word_freqs
+    if word_freq[1] > 2 and len(word_freq[0]) <= 40 and _keep(word_freq[0])
+]
+
+_WORDS = Counter(dict(word_freqs))
 _WORDS_TOTAL = sum(_WORDS.values())
 
 
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index 7566d83a3..f129fe5ad 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -7,6 +7,20 @@
 from nltk.util import ngrams as ngramsdata
 
 
+def is_thaichar(ch):  # เป็นอักษรไทยหรือไม่
+    ch_val = ord(ch)
+    if ch_val >= 3584 and ch_val <= 3711:
+        return True
+    return False
+
+
+def is_thaiword(word):  # เป็นคำที่มีแต่อักษรไทยหรือไม่
+    for ch in word:
+        if ch != "." and not is_thaichar(ch):
+            return False
+    return True
+
+
 def ngrams(token, num):
     """
     ngrams สร้าง ngrams
@@ -34,7 +48,7 @@ def trigram(token):
     return ngrams(token, 3)
 
 
-RULE1 = [
+_NORMALIZE_RULE1 = [
     "ะ",
     "ั",
     "็",
@@ -61,7 +75,7 @@ def trigram(token):
 ]  # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา
 
 
-RULE2 = [
+_NORMALIZE_RULE2 = [
     ("เเ", "แ"),  # เ เ -> แ
     ("ํ(t)า", "\\1ำ"),
     ("ํา(t)", "\\1ำ"),
@@ -81,9 +95,9 @@ def normalize(text):
     >>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก
     True
     """
-    for data in RULE2:
+    for data in _NORMALIZE_RULE2:
         text = re.sub(data[0].replace("t", "[่้๊๋]"), data[1], text)
-    for data in list(zip(RULE1, RULE1)):
+    for data in list(zip(_NORMALIZE_RULE1, _NORMALIZE_RULE1)):
         text = re.sub(data[0].replace("t", "[่้๊๋]") + "+", data[1], text)
     return text
 

From 83c5187525df80f67950fa203c1adc8ed055432f Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Fri, 26 Oct 2018 02:46:22 +0700
Subject: [PATCH 07/16] make Thai characters list a constant outside function
 _edits1()

---
 pythainlp/spell/pn.py | 162 +++++++++++++++++++++---------------------
 1 file changed, 82 insertions(+), 80 deletions(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 1820617af..40196924e 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -10,6 +10,86 @@
 from pythainlp.util import is_thaichar
 
 
+_THAI_CHARS = [
+    "ก",
+    "ข",
+    "ฃ",
+    "ค",
+    "ฅ",
+    "ฆ",
+    "ง",
+    "จ",
+    "ฉ",
+    "ช",
+    "ซ",
+    "ฌ",
+    "ญ",
+    "ฎ",
+    "ฏ",
+    "ฐ",
+    "ฑ",
+    "ฒ",
+    "ณ",
+    "ด",
+    "ต",
+    "ถ",
+    "ท",
+    "ธ",
+    "น",
+    "บ",
+    "ป",
+    "ผ",
+    "ฝ",
+    "พ",
+    "ฟ",
+    "ภ",
+    "ม",
+    "ย",
+    "ร",
+    "ฤ",
+    "ล",
+    "ฦ",
+    "ว",
+    "ศ",
+    "ษ",
+    "ส",
+    "ห",
+    "ฬ",
+    "อ",
+    "ฮ",
+    "ฯ",
+    "ะ",
+    "ั",
+    "า",
+    "ำ",
+    "ิ",
+    "ี",
+    "ึ",
+    "ื",
+    "ุ",
+    "ู",
+    "ฺ",
+    "\u0e3b",
+    "\u0e3c",
+    "\u0e3d",
+    "\u0e3e",
+    "฿",
+    "เ",
+    "แ",
+    "โ",
+    "ใ",
+    "ไ",
+    "ๅ",
+    "ๆ",
+    "็",
+    "่",
+    "้",
+    "๊",
+    "๋",
+    "์",
+]
+
+
 def _keep(word):
     for ch in word:
         if ch != "." and not is_thaichar(ch):
@@ -41,89 +121,11 @@ def _known(words):
 
 
 def _edits1(word):
-    letters = [
-        "ก",
-        "ข",
-        "ฃ",
-        "ค",
-        "ฅ",
-        "ฆ",
-        "ง",
-        "จ",
-        "ฉ",
-        "ช",
-        "ซ",
-        "ฌ",
-        "ญ",
-        "ฎ",
-        "ฏ",
-        "ฐ",
-        "ฑ",
-        "ฒ",
-        "ณ",
-        "ด",
-        "ต",
-        "ถ",
-        "ท",
-        "ธ",
-        "น",
-        "บ",
-        "ป",
-        "ผ",
-        "ฝ",
-        "พ",
-        "ฟ",
-        "ภ",
-        "ม",
-        "ย",
-        "ร",
-        "ฤ",
-        "ล",
-        "ฦ",
-        "ว",
-        "ศ",
-        "ษ",
-        "ส",
-        "ห",
-        "ฬ",
-        "อ",
-        "ฮ",
-        "ฯ",
-        "ะ",
-        "ั",
-        "า",
-        "ำ",
-        "ิ",
-        "ี",
-        "ึ",
-        "ื",
-        "ุ",
-        "ู",
-        "ฺ",
-        "\u0e3b",
-        "\u0e3c",
-        "\u0e3d",
-        "\u0e3e",
-        "฿",
-        "เ",
-        "แ",
-        "โ",
-        "ใ",
-        "ไ",
-        "ๅ",
-        "ๆ",
-        "็",
-        "่",
-        "้",
-        "๊",
-        "๋",
-        "์",
-    ]
     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
     deletes = [L + R[1:] for L, R in splits if R]
     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
-    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
-    inserts = [L + c + R for L, R in splits for c in letters]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in _THAI_CHARS]
+    inserts = [L + c + R for L, R in splits for c in _THAI_CHARS]
 
     return set(deletes + transposes + replaces + inserts)
 

From 08278b159d50aaea5cf3b0b72011c0e1f49ea774 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Fri, 26 Oct 2018 03:03:27 +0700
Subject: [PATCH 08/16] Adjust word frequency filter

---
 pythainlp/spell/pn.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 40196924e..d162c8b64 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -91,11 +91,15 @@
 
 
 def _keep(word):
+    if word[0] == ".":
+        return False
+
     for ch in word:
         if ch != "." and not is_thaichar(ch):
             return False
         if ch in "๐๑๒๓๔๕๖๗๘๙":
             return False
+
     return True
 
 
@@ -104,7 +108,7 @@ def _keep(word):
 word_freqs = [
     word_freq
     for word_freq in word_freqs
-    if word_freq[1] > 2 and len(word_freq[0]) <= 40 and _keep(word_freq[0])
+    if word_freq[1] > 1 and len(word_freq[0]) <= 40 and _keep(word_freq[0])
 ]
 
 _WORDS = Counter(dict(word_freqs))

From 75ab30d9111eea3d10ac88e907ddb2bc18232c1d Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Fri, 26 Oct 2018 11:26:08 +0700
Subject: [PATCH 09/16] Trying to reduce cognitive complexity in functions, as
 suggested by Code Climate

---
 pythainlp/corpus/tnc.py   |  3 ++-
 pythainlp/corpus/ttc.py   |  5 +++--
 pythainlp/ner/__init__.py | 37 ++++++++++++++++++++++---------------
 pythainlp/spell/pn.py     | 21 ++++++++++++---------
 4 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
index 3d035914b..28c2c0780 100644
--- a/pythainlp/corpus/tnc.py
+++ b/pythainlp/corpus/tnc.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Word frequency from Thai National Corpus
+Thai National Corpus word frequency
+
 Credit: Korakot Chaovavanich‎
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
index 34c4f6c3a..fbf82d9a5 100644
--- a/pythainlp/corpus/ttc.py
+++ b/pythainlp/corpus/ttc.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-TTC Thai word frequency
+Thai Textbook Corpus (TTC) word frequency
+
 Credit: Korakot Chaovavanich‎
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
@@ -13,7 +14,7 @@
 
 def get_word_frequency_all():
     """
-    ดึงข้อมูลความถี่คำของ TTC มาใช้งาน
+    ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
     โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
     """
     path = os.path.join(os.path.expanduser("~"), "pythainlp-data")
diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py
index 9992d2547..b73ef4402 100644
--- a/pythainlp/ner/__init__.py
+++ b/pythainlp/ner/__init__.py
@@ -30,6 +30,7 @@ def _is_stopword(word):  # เช็คว่าเป็นคำฟุ่ม
 def _doc2features(doc, i):
     word = doc[i][0]
     postag = doc[i][1]
+
     # Features from current word
     features = {
         "word.word": word,
@@ -39,32 +40,38 @@ def _doc2features(doc, i):
         "postag": postag,
         "word.isdigit()": word.isdigit(),
     }
-
     if word.isdigit() and len(word) == 5:
         features["word.islen5"] = True
 
+    # Features from previous word
     if i > 0:
         prevword = doc[i - 1][0]
-        postag1 = doc[i - 1][1]
-        features["word.prevword"] = prevword
-        features["word.previsspace"] = prevword.isspace()
-        features["word.previsthai"] = is_thaiword(prevword)
-        features["word.prevstopword"] = _is_stopword(prevword)
-        features["word.prepostag"] = postag1
-        features["word.prevwordisdigit"] = prevword.isdigit()
+        prevpostag = doc[i - 1][1]
+        prev_features = {
+            "word.prevword": prevword,
+            "word.previsspace": prevword.isspace(),
+            "word.previsthai": is_thaiword(prevword),
+            "word.prevstopword": _is_stopword(prevword),
+            "word.prevpostag": prevpostag,
+            "word.prevwordisdigit": prevword.isdigit(),
+        }
+        features.update(prev_features)
     else:
         features["BOS"] = True  # Special "Beginning of Sequence" tag
 
     # Features from next word
     if i < len(doc) - 1:
         nextword = doc[i + 1][0]
-        postag1 = doc[i + 1][1]
-        features["word.nextword"] = nextword
-        features["word.nextisspace"] = nextword.isspace()
-        features["word.nextpostag"] = postag1
-        features["word.nextisthai"] = is_thaiword(nextword)
-        features["word.nextstopword"] = _is_stopword(nextword)
-        features["word.nextwordisdigit"] = nextword.isdigit()
+        nextpostag = doc[i + 1][1]
+        next_features = {
+            "word.nextword": nextword,
+            "word.nextisspace": nextword.isspace(),
+            "word.nextpostag": nextpostag,
+            "word.nextisthai": is_thaiword(nextword),
+            "word.nextstopword": _is_stopword(nextword),
+            "word.nextwordisdigit": nextword.isdigit(),
+        }
+        features.update(next_features)
     else:
         features["EOS"] = True  # Special "End of Sequence" tag
 
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index d162c8b64..2b4ec5f2e 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -91,16 +91,19 @@
 
 
 def _keep(word):
+    keep = True
     if word[0] == ".":
-        return False
-
-    for ch in word:
-        if ch != "." and not is_thaichar(ch):
-            return False
-        if ch in "๐๑๒๓๔๕๖๗๘๙":
-            return False
-
-    return True
+        keep = False
+    else:
+        for ch in word:
+            if ch != "." and not is_thaichar(ch):
+                keep = False
+                break
+            if ch in "๐๑๒๓๔๕๖๗๘๙":
+                keep = False
+                break
+
+    return keep
 
 
 # get word frequency from TNC then filter out non-Thai words and low frequency words

From a476471d23ef896f32b8f07abc6cd4c25b33eab0 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Fri, 26 Oct 2018 11:29:31 +0700
Subject: [PATCH 10/16] Stick with the previous _keep() code, less cognitive
 complexity

---
 pythainlp/spell/pn.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 2b4ec5f2e..d162c8b64 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -91,19 +91,16 @@
 
 
 def _keep(word):
-    keep = True
     if word[0] == ".":
-        keep = False
-    else:
-        for ch in word:
-            if ch != "." and not is_thaichar(ch):
-                keep = False
-                break
-            if ch in "๐๑๒๓๔๕๖๗๘๙":
-                keep = False
-                break
-
-    return keep
+        return False
+
+    for ch in word:
+        if ch != "." and not is_thaichar(ch):
+            return False
+        if ch in "๐๑๒๓๔๕๖๗๘๙":
+            return False
+
+    return True
 
 
 # get word frequency from TNC then filter out non-Thai words and low frequency words

From 32cc4fe9221ffd27abbd37a4659b8ee802de11de Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 27 Oct 2018 15:04:53 +0700
Subject: [PATCH 11/16] check empty string case in correction()

---
 pythainlp/spell/pn.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index d162c8b64..d949fe4db 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -103,7 +103,7 @@ def _keep(word):
     return True
 
 
-# get word frequency from TNC then filter out non-Thai words and low frequency words
+# get word frequency from corpus then filter out non-Thai words and low frequency words
 word_freqs = tnc.get_word_frequency_all()
 word_freqs = [
     word_freq
@@ -140,7 +140,7 @@ def _edits2(word):
 
 def spell(word):
     """
-    Return a list of possible words, according to edit distance
+    Return a list of possible words, according to edit distance of 1 and 2
     """
     if not word:
         return ""
@@ -153,4 +153,7 @@ def correction(word):
     Return the most possible word, according to probability from the corpus
     แสดงคำที่เป็นไปได้มากที่สุด
     """
+    if not word:
+        return ""
+
     return max(spell(word), key=_prob)

From 794ae9b7429656589f5027deeb3f006f3b7c7700 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 27 Oct 2018 16:42:44 +0700
Subject: [PATCH 12/16] Sorted spelling candidates by probability of word
 occurrence

---
 pythainlp/spell/pn.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index d949fe4db..b8a91e5b9 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -116,7 +116,9 @@ def _keep(word):
 
 
 def _prob(word, n=_WORDS_TOTAL):
-    "Probability of `word`."
+    """
+    Return probability of an input word, according to the corpus
+    """
     return _WORDS[word] / n
 
 
@@ -125,6 +127,9 @@ def _known(words):
 
 
 def _edits1(word):
+    """
+    Return a set of words with edit distance of 1 from the input word
+    """
     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
     deletes = [L + R[1:] for L, R in splits if R]
     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
@@ -135,17 +140,26 @@ def _edits1(word):
 
 
 def _edits2(word):
+    """
+    Return a set of words with edit distance of 2 from the input word
+    """
     return (e2 for e1 in _edits1(word) for e2 in _edits1(e1))
 
 
 def spell(word):
     """
-    Return a list of possible words, according to edit distance of 1 and 2
+    Return a list of possible words, according to edit distance of 1 and 2,
+    sorted by probability of word occurrance
     """
     if not word:
         return ""
 
-    return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
+    candidates = (
+        _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
+    )
+    candidates.sort(key=_prob, reverse=True)
+
+    return candidates
 
 
 def correction(word):
@@ -156,4 +170,4 @@ def correction(word):
     if not word:
         return ""
 
-    return max(spell(word), key=_prob)
+    return spell(word)[0]

From 4c8ada500ae362ac5c345272679cb910ab4b065a Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 27 Oct 2018 16:57:31 +0700
Subject: [PATCH 13/16] _edits2() should return a set, to remove duplicated
 candidates

---
 pythainlp/spell/pn.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index b8a91e5b9..ea08f254d 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -91,7 +91,10 @@
 
 
 def _keep(word):
-    if word[0] == ".":
+    """
+    Keep only Thai words with length between 2 and 40 characters
+    """
+    if not word or len(word) < 2 or len(word) > 40 or word[0] == ".":
         return False
 
     for ch in word:
@@ -103,13 +106,9 @@ def _keep(word):
     return True
 
 
-# get word frequency from corpus then filter out non-Thai words and low frequency words
+# TODO: Add spell checker class, so user can provide customized word list
 word_freqs = tnc.get_word_frequency_all()
-word_freqs = [
-    word_freq
-    for word_freq in word_freqs
-    if word_freq[1] > 1 and len(word_freq[0]) <= 40 and _keep(word_freq[0])
-]
+word_freqs = [wf for wf in word_freqs if wf[1] > 1 and _keep(wf[0])]
 
 _WORDS = Counter(dict(word_freqs))
 _WORDS_TOTAL = sum(_WORDS.values())
@@ -143,7 +142,7 @@ def _edits2(word):
     """
     Return a set of words with edit distance of 2 from the input word
     """
-    return (e2 for e1 in _edits1(word) for e2 in _edits1(e1))
+    return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1))
 
 
 def spell(word):

From 6bba43120be6b55c7060b36edd897116c2ed0bed Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 29 Oct 2018 11:27:37 +0800
Subject: [PATCH 14/16] Add ability to use custom dictionary, by creating a
 spell checker object, based on NorvigSpellChecker class

---
 examples/spell.py     |  19 ++++-
 pythainlp/spell/pn.py | 176 ++++++++++++++++++++++++++++++++----------
 2 files changed, 152 insertions(+), 43 deletions(-)

diff --git a/examples/spell.py b/examples/spell.py
index 0d39ff07f..9d82c44a7 100644
--- a/examples/spell.py
+++ b/examples/spell.py
@@ -1,8 +1,21 @@
 # -*- coding: utf-8 -*-
 
 from pythainlp.spell import spell
+from pythainlp.spell.pn import spell as pn_tnc_spell
+from pythainlp.spell.pn import correct as pn_tnc_correct
+from pythainlp.spell.pn import NorvigSpellChecker
+from pythainlp.corpus import ttc
 
-a = spell("สี่เหลียม")
-print(a)  # ['สี่เหลี่ยม']
+# checker from pythainlp.spell module (generic)
+spell("สี่เหลียม")  # ['สี่เหลี่ยม']
+# spell("สี่เหลียม", engine="hunspell")  # available in some Linux systems
 
-# a = spell("สี่เหลียม", engine="hunspell")  # available in some Linux systems
+# checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
+pn_tnc_spell("เหลืยม")
+pn_tnc_correct("เหลืยม")
+
+# checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
+ttc_word_freqs = ttc.get_word_frequency_all()
+pn_ttc_spell_checker = NorvigSpellChecker(word_freqs=ttc_word_freqs)
+pn_ttc_spell_checker.spell("เหลืยม")
+pn_ttc_spell_checker.correct("เหลืยม")
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index ea08f254d..72dbc5a5a 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 """
-Spell checker, using Peter Norvig algorithm + word frequency from Thai National Corpus
+Spell checker, using Peter Norvig algorithm.
+Spelling dictionary can be customized.
+Default spelling dictionary is based on Thai National Corpus.
 
 Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
 """
@@ -9,7 +11,6 @@
 from pythainlp.corpus import tnc
 from pythainlp.util import is_thaichar
 
-
 _THAI_CHARS = [
     "ก",
     "ข",
@@ -90,39 +91,28 @@
 ]
 
 
-def _keep(word):
-    """
-    Keep only Thai words with length between 2 and 40 characters
-    """
-    if not word or len(word) < 2 or len(word) > 40 or word[0] == ".":
-        return False
-
+def _is_thai_and_not_num(word):
     for ch in word:
         if ch != "." and not is_thaichar(ch):
             return False
-        if ch in "๐๑๒๓๔๕๖๗๘๙":
+        if ch in "๐๑๒๓๔๕๖๗๘๙0123456789":
             return False
-
     return True
 
 
-# TODO: Add spell checker class, so user can provide customized word list
-word_freqs = tnc.get_word_frequency_all()
-word_freqs = [wf for wf in word_freqs if wf[1] > 1 and _keep(wf[0])]
-
-_WORDS = Counter(dict(word_freqs))
-_WORDS_TOTAL = sum(_WORDS.values())
-
-
-def _prob(word, n=_WORDS_TOTAL):
+def _keep(wf, min_freq, min_len, max_len, condition_func):
     """
-    Return probability of an input word, according to the corpus
+    Keep only Thai words with at least min_freq frequency
+    and has length between min_len and (max_len characters
     """
-    return _WORDS[word] / n
+    if not wf or wf[1] < min_freq:
+        return False
 
+    word = wf[0]
+    if not word or len(word) < min_len or len(word) > max_len or word[0] == ".":
+        return False
 
-def _known(words):
-    return list(w for w in words if w in _WORDS)
+    return condition_func(word)
 
 
 def _edits1(word):
@@ -145,28 +135,134 @@ def _edits2(word):
     return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1))
 
 
-def spell(word):
+class NorvigSpellChecker:
+    def __init__(
+        self,
+        word_freqs=None,
+        min_freq=2,
+        min_len=2,
+        max_len=40,
+        condition_func=_is_thai_and_not_num,
+    ):
+        """
+        Initialize Peter Norvig's spell checker object
+
+        :param str word_freqs: A list of tuple (word, frequency) to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words).
+        :param int min_freq: Minimum frequency of a word to keep (default = 2)
+        :param int min_len: Minimum length (in characters) of a word to keep (default = 2)
+        :param int max_len: Maximum length (in characters) of a word to keep (default = 40)
+        """
+        if not word_freqs:  # default, use Thai National Corpus
+            word_freqs = tnc.get_word_frequency_all()
+
+        # filter word list
+        word_freqs = [
+            wf
+            for wf in word_freqs
+            if _keep(wf, min_freq, min_len, max_len, condition_func)
+        ]
+
+        self.__WORDS = Counter(dict(word_freqs))
+        self.__WORDS_TOTAL = sum(self.__WORDS.values())
+
+    def dictionary(self):
+        """
+        Return the spelling dictionary currently used by this spell checker
+        """
+        return self.__WORDS.items()
+
+    def known(self, words):
+        """
+        Return a list of given words that are found in the spelling dictionary
+
+        :param list words: A list of words to check if they are in the spelling dictionary
+        """
+        return list(w for w in words if w in self.__WORDS)
+
+    def prob(self, word):
+        """
+        Return probability of an input word, according to the spelling dictionary
+
+        :param str word: A word to check its probability of occurrence
+        """
+        return self.__WORDS[word] / self.__WORDS_TOTAL
+
+    def spell(self, word):
+        """
+        Return a list of possible words, according to edit distance of 1 and 2,
+        sorted by probability of word occurrence in the spelling dictionary
+
+        :param str word: A word to check its spelling
+        """
+        if not word:
+            return ""
+
+        candidates = (
+            self.known([word])
+            or self.known(_edits1(word))
+            or self.known(_edits2(word))
+            or [word]
+        )
+        candidates.sort(key=self.prob, reverse=True)
+
+        return candidates
+
+    def correct(self, word):
+        """
+        Return the most probable word, using the probability from the spelling dictionary
+
+        :param str word: A word to correct its spelling
+        """
+        if not word:
+            return ""
+
+        return self.spell(word)[0]
+
+
+DEFAULT_SPELL_CHECKER = NorvigSpellChecker()
+
+
+def dictionary():
     """
-    Return a list of possible words, according to edit distance of 1 and 2,
-    sorted by probability of word occurrance
+    Return the spelling dictionary currently used by this spell checker.
+    The spelling dictionary is based on words found in the Thai National Corpus.
     """
-    if not word:
-        return ""
+    return DEFAULT_SPELL_CHECKER.dictionary()
+
 
-    candidates = (
-        _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word]
-    )
-    candidates.sort(key=_prob, reverse=True)
+def known(words):
+    """
+    Return a list of given words that are found in the spelling dictionary.
+    The spelling dictionary is based on words found in the Thai National Corpus.
 
-    return candidates
+    :param list words: A list of words to check if they are in the spelling dictionary
+    """
+    return DEFAULT_SPELL_CHECKER.known(words)
 
 
-def correction(word):
+def prob(word):
     """
-    Return the most possible word, according to probability from the corpus
-    แสดงคำที่เป็นไปได้มากที่สุด
+    Return probability of an input word, according to the Thai National Corpus
+
+    :param str word: A word to check its probability of occurrence
     """
-    if not word:
-        return ""
+    return DEFAULT_SPELL_CHECKER.prob(word)
 
-    return spell(word)[0]
+
+def spell(word):
+    """
+    Return a list of possible words, according to edit distance of 1 and 2,
+    sorted by probability of word occurrence in the Thai National Corpus.
+
+    :param str word: A word to check its spelling
+    """
+    return DEFAULT_SPELL_CHECKER.spell(word)
+
+
+def correct(word):
+    """
+    Return the most probable word, according to probability from the Thai National Corpus
+
+    :param str word: A word to correct its spelling
+    """
+    return DEFAULT_SPELL_CHECKER.correct(word)

From 5e94b14c72705807d8ccbafe19059b044e0bcf1e Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 29 Oct 2018 12:05:02 +0800
Subject: [PATCH 15/16] Add None option for dict_filter, using _no_filter()
 function.

---
 examples/spell.py     |  2 +-
 pythainlp/spell/pn.py | 40 +++++++++++++++++++++++++---------------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/examples/spell.py b/examples/spell.py
index 9d82c44a7..773a122ae 100644
--- a/examples/spell.py
+++ b/examples/spell.py
@@ -16,6 +16,6 @@
 
 # checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
 ttc_word_freqs = ttc.get_word_frequency_all()
-pn_ttc_spell_checker = NorvigSpellChecker(word_freqs=ttc_word_freqs)
+pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
 pn_ttc_spell_checker.spell("เหลืยม")
 pn_ttc_spell_checker.correct("เหลืยม")
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 72dbc5a5a..4a79451bb 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -91,6 +91,10 @@
 ]
 
 
+def _no_filter(word):
+    return True
+
+
 def _is_thai_and_not_num(word):
     for ch in word:
         if ch != "." and not is_thaichar(ch):
@@ -100,19 +104,19 @@ def _is_thai_and_not_num(word):
     return True
 
 
-def _keep(wf, min_freq, min_len, max_len, condition_func):
+def _keep(word_freq, min_freq, min_len, max_len, dict_filter):
     """
     Keep only Thai words with at least min_freq frequency
-    and has length between min_len and (max_len characters
+    and with length between min_len and max_len characters
     """
-    if not wf or wf[1] < min_freq:
+    if not word_freq or word_freq[1] < min_freq:
         return False
 
-    word = wf[0]
+    word = word_freq[0]
     if not word or len(word) < min_len or len(word) > max_len or word[0] == ".":
         return False
 
-    return condition_func(word)
+    return dict_filter(word)
 
 
 def _edits1(word):
@@ -138,32 +142,38 @@ def _edits2(word):
 class NorvigSpellChecker:
     def __init__(
         self,
-        word_freqs=None,
+        custom_dict=None,
         min_freq=2,
         min_len=2,
         max_len=40,
-        condition_func=_is_thai_and_not_num,
+        dict_filter=_is_thai_and_not_num,
     ):
         """
         Initialize Peter Norvig's spell checker object
 
-        :param str word_freqs: A list of tuple (word, frequency) to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words).
+        :param list custom_dict: A list of (word, frequency) tuples to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words).
         :param int min_freq: Minimum frequency of a word to keep (default = 2)
         :param int min_len: Minimum length (in characters) of a word to keep (default = 2)
         :param int max_len: Maximum length (in characters) of a word to keep (default = 40)
+        :param func dict_filter: A function to filter the dictionary. Default filter removes any word with digits or non-Thai characters. If no filter is required, use None.
         """
-        if not word_freqs:  # default, use Thai National Corpus
-            word_freqs = tnc.get_word_frequency_all()
+        if not custom_dict:  # default, use Thai National Corpus
+            custom_dict = tnc.get_word_frequency_all()
+
+        if dict_filter is None:
+            dict_filter = _no_filter
 
         # filter word list
-        word_freqs = [
-            wf
-            for wf in word_freqs
-            if _keep(wf, min_freq, min_len, max_len, condition_func)
+        custom_dict = [
+            word_freq
+            for word_freq in custom_dict
+            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
         ]
 
-        self.__WORDS = Counter(dict(word_freqs))
+        self.__WORDS = Counter(dict(custom_dict))
         self.__WORDS_TOTAL = sum(self.__WORDS.values())
+        if self.__WORDS_TOTAL < 1:
+            self.__WORDS_TOTAL = 0
 
     def dictionary(self):
         """

From 0f315b92b45b59a319a46c7db43e9174f71e115a Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 29 Oct 2018 12:09:24 +0800
Subject: [PATCH 16/16] Update dict_filter condition

---
 pythainlp/spell/pn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
index 4a79451bb..5099ca30f 100644
--- a/pythainlp/spell/pn.py
+++ b/pythainlp/spell/pn.py
@@ -160,7 +160,7 @@ def __init__(
         if not custom_dict:  # default, use Thai National Corpus
             custom_dict = tnc.get_word_frequency_all()
 
-        if dict_filter is None:
+        if not dict_filter:
             dict_filter = _no_filter
 
         # filter word list