From 16d9fcd087132b510bb6a83d34f6df704fab3ed4 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 25 Oct 2018 21:14:52 +0700 Subject: [PATCH 01/16] Update Peter Norvig's spell checker to be able to suggest words based on probability (as suggested in issue #90 ) - use word frequencies from Thai National Corpus --- pythainlp/corpus/tnc.py | 2 +- pythainlp/corpus/ttc.py | 2 +- pythainlp/spell/pn.py | 28 +++++++++++++++------------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 5b9a15438..3d035914b 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -57,6 +57,6 @@ def get_word_frequency_all(): listword = [] for line in lines: listindata = line.split(" ") - listword.append((listindata[0], listindata[1])) + listword.append((listindata[0], int(listindata[1]))) return listword diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py index 015b8e2ae..34c4f6c3a 100644 --- a/pythainlp/corpus/ttc.py +++ b/pythainlp/corpus/ttc.py @@ -34,6 +34,6 @@ def get_word_frequency_all(): listword = [] for line in lines: listindata = line.split(" ") - listword.append((listindata[0], listindata[1])) + listword.append((listindata[0], int(listindata[1]))) return listword diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index fe3ff225e..7281cf836 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -1,30 +1,32 @@ # -*- coding: utf-8 -*- """ -Spell checker +Spell checker, using Peter Norvig algorithm + word frequency from Thai National Corpus -Based on Peter Norvig's Python code at http://norvig.com/spell-correct.html +Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html """ from collections import Counter -from pythainlp.corpus.thaiword import get_data -WORDS = Counter(get_data()) +from pythainlp.corpus import tnc +WORDS = Counter(dict(tnc.get_word_frequency_all())) +WORDS_TOTAL = sum(WORDS.values()) -def prob(word, n=sum(WORDS.values())): + +def _prob(word, n=WORDS_TOTAL): "Probability of `word`." return WORDS[word] / n -def correction(word): +def _correction(word): "แสดงคำที่เป็นไปได้มากที่สุด" - return max(spell(word), key=prob) + return max(spell(word), key=_prob) -def known(words): +def _known(words): return list(w for w in words if w in WORDS) -def edits1(word): +def _edits1(word): letters = [ "ก", "ข", @@ -111,12 +113,12 @@ def edits1(word): return set(deletes + transposes + replaces + inserts) -def edits2(word): - return (e2 for e1 in edits1(word) for e2 in edits1(e1)) +def _edits2(word): + return (e2 for e1 in _edits1(word) for e2 in _edits1(e1)) def spell(word): if not word: return "" - else: - return known([word]) or known(edits1(word)) or known(edits2(word)) or [word] + + return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] From 5c957c59d30562857b0a5a140290b63cdeaccef2 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 25 Oct 2018 21:23:39 +0700 Subject: [PATCH 02/16] remove import future --- pythainlp/corpus/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 62b71194f..4968f5eb6 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -3,7 +3,7 @@ import os import requests -from future.moves.urllib.request import urlopen +from urllib.request import urlopen from pythainlp.tools import get_path_data, get_path_db from tinydb import Query, TinyDB from tqdm import tqdm @@ -12,7 +12,7 @@ "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json" ) -# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"] +# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"] path_db_ = get_path_db() From 6f40f7e524be93018ea2f5a9324557407ab84f6e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 25 Oct 2018 21:28:15 +0700 Subject: [PATCH 03/16] minor sort of imports --- pythainlp/corpus/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 4968f5eb6..8310894e7 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -3,10 +3,10 @@ import os import requests -from urllib.request import urlopen from pythainlp.tools import get_path_data, get_path_db from tinydb import Query, TinyDB from tqdm import tqdm +from urllib.request import urlopen CORPUS_DB_URL = ( "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json" From ae6e251e5d98262f873de5bef0d22ae7eda3159a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 25 Oct 2018 23:27:09 +0700 Subject: [PATCH 04/16] More docstring for Peter Norvig's spell checker --- .gitignore | 1 + pythainlp/spell/pn.py | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 18dff633d..d6f0b0a65 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +Untitled*.ipynb # IDE files .idea diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 7281cf836..f59b53365 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -8,22 +8,17 @@ from pythainlp.corpus import tnc -WORDS = Counter(dict(tnc.get_word_frequency_all())) -WORDS_TOTAL = sum(WORDS.values()) +_WORDS = Counter(dict(tnc.get_word_frequency_all())) +_WORDS_TOTAL = sum(_WORDS.values()) -def _prob(word, n=WORDS_TOTAL): +def _prob(word, n=_WORDS_TOTAL): "Probability of `word`." - return WORDS[word] / n - - -def _correction(word): - "แสดงคำที่เป็นไปได้มากที่สุด" - return max(spell(word), key=_prob) + return _WORDS[word] / n def _known(words): - return list(w for w in words if w in WORDS) + return list(w for w in words if w in _WORDS) def _edits1(word): @@ -110,6 +105,7 @@ def _edits1(word): transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] replaces = [L + c + R[1:] for L, R in splits if R for c in letters] inserts = [L + c + R for L, R in splits for c in letters] + return set(deletes + transposes + replaces + inserts) @@ -118,7 +114,20 @@ def _edits2(word): def spell(word): + """ + Return set of possible words, according to edit distance + """ if not word: return "" - return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] + return set( + _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] + ) + + +def correction(word): + """ + Return the most possible word, according to probability from the corpus + แสดงคำที่เป็นไปได้มากที่สุด + """ + return max(spell(word), key=_prob) From 2433a69c201ebe102470a56e643ac44767187c93 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 25 Oct 2018 23:43:24 +0700 Subject: [PATCH 05/16] should return list not set --- pythainlp/spell/pn.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index f59b53365..6fd172074 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -115,14 +115,12 @@ def _edits2(word): def spell(word): """ - Return set of possible words, according to edit distance + Return a list of possible words, according to edit distance """ if not word: return "" - return set( - _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] - ) + return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] def correction(word): From 5721e75a599da008af80a06a951f54ffa39fbbfc Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 26 Oct 2018 02:42:21 +0700 Subject: [PATCH 06/16] Filter out non-Thai words and low frequency words from word frequency list for spell checker --- pythainlp/ner/__init__.py | 21 ++++----------------- pythainlp/spell/pn.py | 21 ++++++++++++++++++++- pythainlp/util/__init__.py | 22 ++++++++++++++++++---- 3 files changed, 42 insertions(+), 22 deletions(-) diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py index 12089b927..9992d2547 100644 --- a/pythainlp/ner/__init__.py +++ b/pythainlp/ner/__init__.py @@ -5,6 +5,7 @@ from pythainlp.corpus import download, get_file, stopwords from pythainlp.tag import pos_tag from pythainlp.tokenize import word_tokenize +from pythainlp.util import is_thaiword try: import sklearn_crfsuite @@ -22,20 +23,6 @@ _STOPWORDS = stopwords.words("thai") -def _is_thaichar(ch): # เป็นอักษรไทยหรือไม่ - ch_val = ord(ch) - if ch_val >= 3584 and ch_val <= 3711: - return True - return False - - -def _is_thaiword(word): # เป็นคำที่มีแต่อักษรไทยหรือไม่ - for ch in word: - if ch != "." and not _is_thaichar(ch): - return False - return True - - def _is_stopword(word): # เช็คว่าเป็นคำฟุ่มเฟือย return word in _STOPWORDS @@ -47,7 +34,7 @@ def _doc2features(doc, i): features = { "word.word": word, "word.stopword": _is_stopword(word), - "word.isthai": _is_thaiword(word), + "word.isthai": is_thaiword(word), "word.isspace": word.isspace(), "postag": postag, "word.isdigit()": word.isdigit(), @@ -61,7 +48,7 @@ def _doc2features(doc, i): postag1 = doc[i - 1][1] features["word.prevword"] = prevword features["word.previsspace"] = prevword.isspace() - features["word.previsthai"] = _is_thaiword(prevword) + features["word.previsthai"] = is_thaiword(prevword) features["word.prevstopword"] = _is_stopword(prevword) features["word.prepostag"] = postag1 features["word.prevwordisdigit"] = prevword.isdigit() @@ -75,7 +62,7 @@ def _doc2features(doc, i): features["word.nextword"] = nextword features["word.nextisspace"] = nextword.isspace() features["word.nextpostag"] = postag1 - features["word.nextisthai"] = _is_thaiword(nextword) + features["word.nextisthai"] = is_thaiword(nextword) features["word.nextstopword"] = _is_stopword(nextword) features["word.nextwordisdigit"] = nextword.isdigit() else: diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 6fd172074..1820617af 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -7,8 +7,27 @@ from collections import Counter from pythainlp.corpus import tnc +from pythainlp.util import is_thaichar -_WORDS = Counter(dict(tnc.get_word_frequency_all())) + +def _keep(word): + for ch in word: + if ch != "." and not is_thaichar(ch): + return False + if ch in "๐๑๒๓๔๕๖๗๘๙": + return False + return True + + +# get word frequency from TNC then filter out non-Thai words and low frequency words +word_freqs = tnc.get_word_frequency_all() +word_freqs = [ + word_freq + for word_freq in word_freqs + if word_freq[1] > 2 and len(word_freq[0]) <= 40 and _keep(word_freq[0]) +] + +_WORDS = Counter(dict(word_freqs)) _WORDS_TOTAL = sum(_WORDS.values()) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 7566d83a3..f129fe5ad 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -7,6 +7,20 @@ from nltk.util import ngrams as ngramsdata +def is_thaichar(ch): # เป็นอักษรไทยหรือไม่ + ch_val = ord(ch) + if ch_val >= 3584 and ch_val <= 3711: + return True + return False + + +def is_thaiword(word): # เป็นคำที่มีแต่อักษรไทยหรือไม่ + for ch in word: + if ch != "." and not is_thaichar(ch): + return False + return True + + def ngrams(token, num): """ ngrams สร้าง ngrams @@ -34,7 +48,7 @@ def trigram(token): return ngrams(token, 3) -RULE1 = [ +_NORMALIZE_RULE1 = [ "ะ", "ั", "็", @@ -61,7 +75,7 @@ def trigram(token): ] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา -RULE2 = [ +_NORMALIZE_RULE2 = [ ("เเ", "แ"), # เ เ -> แ ("ํ(t)า", "\\1ำ"), ("ํา(t)", "\\1ำ"), @@ -81,9 +95,9 @@ def normalize(text): >>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก กับ แปลก True """ - for data in RULE2: + for data in _NORMALIZE_RULE2: text = re.sub(data[0].replace("t", "[่้๊๋]"), data[1], text) - for data in list(zip(RULE1, RULE1)): + for data in list(zip(_NORMALIZE_RULE1, _NORMALIZE_RULE1)): text = re.sub(data[0].replace("t", "[่้๊๋]") + "+", data[1], text) return text From 83c5187525df80f67950fa203c1adc8ed055432f Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 26 Oct 2018 02:46:22 +0700 Subject: [PATCH 07/16] make Thai characters list a constant outside function _edits1() --- pythainlp/spell/pn.py | 162 +++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 80 deletions(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 1820617af..40196924e 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -10,6 +10,86 @@ from pythainlp.util import is_thaichar +_THAI_CHARS = [ + "ก", + "ข", + "ฃ", + "ค", + "ฅ", + "ฆ", + "ง", + "จ", + "ฉ", + "ช", + "ซ", + "ฌ", + "ญ", + "ฎ", + "ฏ", + "ฐ", + "ฑ", + "ฒ", + "ณ", + "ด", + "ต", + "ถ", + "ท", + "ธ", + "น", + "บ", + "ป", + "ผ", + "ฝ", + "พ", + "ฟ", + "ภ", + "ม", + "ย", + "ร", + "ฤ", + "ล", + "ฦ", + "ว", + "ศ", + "ษ", + "ส", + "ห", + "ฬ", + "อ", + "ฮ", + "ฯ", + "ะ", + "ั", + "า", + "ำ", + "ิ", + "ี", + "ึ", + "ื", + "ุ", + "ู", + "ฺ", + "\u0e3b", + "\u0e3c", + "\u0e3d", + "\u0e3e", + "฿", + "เ", + "แ", + "โ", + "ใ", + "ไ", + "ๅ", + "ๆ", + "็", + "่", + "้", + "๊", + "๋", + "์", +] + + def _keep(word): for ch in word: if ch != "." and not is_thaichar(ch): @@ -41,89 +121,11 @@ def _known(words): def _edits1(word): - letters = [ - "ก", - "ข", - "ฃ", - "ค", - "ฅ", - "ฆ", - "ง", - "จ", - "ฉ", - "ช", - "ซ", - "ฌ", - "ญ", - "ฎ", - "ฏ", - "ฐ", - "ฑ", - "ฒ", - "ณ", - "ด", - "ต", - "ถ", - "ท", - "ธ", - "น", - "บ", - "ป", - "ผ", - "ฝ", - "พ", - "ฟ", - "ภ", - "ม", - "ย", - "ร", - "ฤ", - "ล", - "ฦ", - "ว", - "ศ", - "ษ", - "ส", - "ห", - "ฬ", - "อ", - "ฮ", - "ฯ", - "ะ", - "ั", - "า", - "ำ", - "ิ", - "ี", - "ึ", - "ื", - "ุ", - "ู", - "ฺ", - "\u0e3b", - "\u0e3c", - "\u0e3d", - "\u0e3e", - "฿", - "เ", - "แ", - "โ", - "ใ", - "ไ", - "ๅ", - "ๆ", - "็", - "่", - "้", - "๊", - "๋", - "์", - ] splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] - replaces = [L + c + R[1:] for L, R in splits if R for c in letters] - inserts = [L + c + R for L, R in splits for c in letters] + replaces = [L + c + R[1:] for L, R in splits if R for c in _THAI_CHARS] + inserts = [L + c + R for L, R in splits for c in _THAI_CHARS] return set(deletes + transposes + replaces + inserts) From 08278b159d50aaea5cf3b0b72011c0e1f49ea774 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 26 Oct 2018 03:03:27 +0700 Subject: [PATCH 08/16] Adjust word frequency filter --- pythainlp/spell/pn.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 40196924e..d162c8b64 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -91,11 +91,15 @@ def _keep(word): + if word[0] == ".": + return False + for ch in word: if ch != "." and not is_thaichar(ch): return False if ch in "๐๑๒๓๔๕๖๗๘๙": return False + return True @@ -104,7 +108,7 @@ def _keep(word): word_freqs = [ word_freq for word_freq in word_freqs - if word_freq[1] > 2 and len(word_freq[0]) <= 40 and _keep(word_freq[0]) + if word_freq[1] > 1 and len(word_freq[0]) <= 40 and _keep(word_freq[0]) ] _WORDS = Counter(dict(word_freqs)) From 75ab30d9111eea3d10ac88e907ddb2bc18232c1d Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 26 Oct 2018 11:26:08 +0700 Subject: [PATCH 09/16] Trying to reduce cognitive complex in functions, as suggested by Code Climate --- pythainlp/corpus/tnc.py | 3 ++- pythainlp/corpus/ttc.py | 5 +++-- pythainlp/ner/__init__.py | 37 ++++++++++++++++++++++--------------- pythainlp/spell/pn.py | 21 ++++++++++++--------- 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py index 3d035914b..28c2c0780 100644 --- a/pythainlp/corpus/tnc.py +++ b/pythainlp/corpus/tnc.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """ -Word frequency from Thai National Corpus +Thai National Corpus word frequency + Credit: Korakot Chaovavanich‎ https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 """ diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py index 34c4f6c3a..fbf82d9a5 100644 --- a/pythainlp/corpus/ttc.py +++ b/pythainlp/corpus/ttc.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """ -TTC Thai word frequency +Thai Textbook Corpus (TTC) word frequency + Credit: Korakot Chaovavanich‎ https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 """ @@ -13,7 +14,7 @@ def get_word_frequency_all(): """ - ดึงข้อมูลความถี่คำของ TTC มาใช้งาน + ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...] """ path = os.path.join(os.path.expanduser("~"), "pythainlp-data") diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py index 9992d2547..b73ef4402 100644 --- a/pythainlp/ner/__init__.py +++ b/pythainlp/ner/__init__.py @@ -30,6 +30,7 @@ def _is_stopword(word): # เช็คว่าเป็นคำฟุ่ม def _doc2features(doc, i): word = doc[i][0] postag = doc[i][1] + # Features from current word features = { "word.word": word, @@ -39,32 +40,38 @@ def _doc2features(doc, i): "postag": postag, "word.isdigit()": word.isdigit(), } - if word.isdigit() and len(word) == 5: features["word.islen5"] = True + # Features from previous word if i > 0: prevword = doc[i - 1][0] - postag1 = doc[i - 1][1] - features["word.prevword"] = prevword - features["word.previsspace"] = prevword.isspace() - features["word.previsthai"] = is_thaiword(prevword) - features["word.prevstopword"] = _is_stopword(prevword) - features["word.prepostag"] = postag1 - features["word.prevwordisdigit"] = prevword.isdigit() + prevpostag = doc[i - 1][1] + prev_features = { + "word.prevword": prevword, + "word.previsspace": prevword.isspace(), + "word.previsthai": is_thaiword(prevword), + "word.prevstopword": _is_stopword(prevword), + "word.prevpostag": prevpostag, + "word.prevwordisdigit": prevword.isdigit(), + } + features.update(prev_features) else: features["BOS"] = True # Special "Beginning of Sequence" tag # Features from next word if i < len(doc) - 1: nextword = doc[i + 1][0] - postag1 = doc[i + 1][1] - features["word.nextword"] = nextword - features["word.nextisspace"] = nextword.isspace() - features["word.nextpostag"] = postag1 - features["word.nextisthai"] = is_thaiword(nextword) - features["word.nextstopword"] = _is_stopword(nextword) - features["word.nextwordisdigit"] = nextword.isdigit() + nextpostag = doc[i + 1][1] + next_features = { + "word.nextword": nextword, + "word.nextisspace": nextword.isspace(), + "word.nextpostag": nextpostag, + "word.nextisthai": is_thaiword(nextword), + "word.nextstopword": _is_stopword(nextword), + "word.nextwordisdigit": nextword.isdigit(), + } + features.update(next_features) else: features["EOS"] = True # Special "End of Sequence" tag diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index d162c8b64..2b4ec5f2e 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -91,16 +91,19 @@ def _keep(word): + keep = True if word[0] == ".": - return False - - for ch in word: - if ch != "." and not is_thaichar(ch): - return False - if ch in "๐๑๒๓๔๕๖๗๘๙": - return False - - return True + keep = False + else: + for ch in word: + if ch != "." and not is_thaichar(ch): + keep = False + break + if ch in "๐๑๒๓๔๕๖๗๘๙": + keep = False + break + + return keep # get word frequency from TNC then filter out non-Thai words and low frequency words From a476471d23ef896f32b8f07abc6cd4c25b33eab0 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 26 Oct 2018 11:29:31 +0700 Subject: [PATCH 10/16] Stick with the previous _keep() code, less cognitive complexity --- pythainlp/spell/pn.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 2b4ec5f2e..d162c8b64 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -91,19 +91,16 @@ def _keep(word): - keep = True if word[0] == ".": - keep = False - else: - for ch in word: - if ch != "." and not is_thaichar(ch): - keep = False - break - if ch in "๐๑๒๓๔๕๖๗๘๙": - keep = False - break - - return keep + return False + + for ch in word: + if ch != "." and not is_thaichar(ch): + return False + if ch in "๐๑๒๓๔๕๖๗๘๙": + return False + + return True # get word frequency from TNC then filter out non-Thai words and low frequency words From 32cc4fe9221ffd27abbd37a4659b8ee802de11de Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 27 Oct 2018 15:04:53 +0700 Subject: [PATCH 11/16] check empty string case in correction() --- pythainlp/spell/pn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index d162c8b64..d949fe4db 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -103,7 +103,7 @@ def _keep(word): return True -# get word frequency from TNC then filter out non-Thai words and low frequency words +# get word frequency from corpus then filter out non-Thai words and low frequency words word_freqs = tnc.get_word_frequency_all() word_freqs = [ word_freq @@ -140,7 +140,7 @@ def _edits2(word): def spell(word): """ - Return a list of possible words, according to edit distance + Return a list of possible words, according to edit distance of 1 and 2 """ if not word: return "" @@ -153,4 +153,7 @@ def correction(word): Return the most possible word, according to probability from the corpus แสดงคำที่เป็นไปได้มากที่สุด """ + if not word: + return "" + return max(spell(word), key=_prob) From 794ae9b7429656589f5027deeb3f006f3b7c7700 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 27 Oct 2018 16:42:44 +0700 Subject: [PATCH 12/16] Sorted spelling candidates by probability of word occurrence --- pythainlp/spell/pn.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index d949fe4db..b8a91e5b9 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -116,7 +116,9 @@ def _keep(word): def _prob(word, n=_WORDS_TOTAL): - "Probability of `word`." + """ + Return probability of an input word, according to the corpus + """ return _WORDS[word] / n @@ -125,6 +127,9 @@ def _known(words): def _edits1(word): + """ + Return a set of words with edit distance of 1 from the input word + """ splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] @@ -135,17 +140,26 @@ def _edits1(word): def _edits2(word): + """ + Return a set of words with edit distance of 2 from the input word + """ return (e2 for e1 in _edits1(word) for e2 in _edits1(e1)) def spell(word): """ - Return a list of possible words, according to edit distance of 1 and 2 + Return a list of possible words, according to edit distance of 1 and 2, + sorted by probability of word occurrance """ if not word: return "" - return _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] + candidates = ( + _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] + ) + candidates.sort(key=_prob, reverse=True) + + return candidates def correction(word): @@ -156,4 +170,4 @@ def correction(word): if not word: return "" - return max(spell(word), key=_prob) + return spell(word)[0] From 4c8ada500ae362ac5c345272679cb910ab4b065a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 27 Oct 2018 16:57:31 +0700 Subject: [PATCH 13/16] _edits2() should return a set, to remove duplicated candidates --- pythainlp/spell/pn.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index b8a91e5b9..ea08f254d 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -91,7 +91,10 @@ def _keep(word): - if word[0] == ".": + """ + Keep only Thai words with length between 2 and 40 characters + """ + if not word or len(word) < 2 or len(word) > 40 or word[0] == ".": return False for ch in word: @@ -103,13 +106,9 @@ def _keep(word): return True -# get word frequency from corpus then filter out non-Thai words and low frequency words +# TODO: Add spell checker class, so user can provide customized word list word_freqs = tnc.get_word_frequency_all() -word_freqs = [ - word_freq - for word_freq in word_freqs - if word_freq[1] > 1 and len(word_freq[0]) <= 40 and _keep(word_freq[0]) -] +word_freqs = [wf for wf in word_freqs if wf[1] > 1 and _keep(wf[0])] _WORDS = Counter(dict(word_freqs)) _WORDS_TOTAL = sum(_WORDS.values()) @@ -143,7 +142,7 @@ def _edits2(word): """ Return a set of words with edit distance of 2 from the input word """ - return (e2 for e1 in _edits1(word) for e2 in _edits1(e1)) + return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1)) def spell(word): From 6bba43120be6b55c7060b36edd897116c2ed0bed Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 29 Oct 2018 11:27:37 +0800 Subject: [PATCH 14/16] Add ability to use custom dictionary, by creating a spell checker object, based on NorvigSpellChecker class --- examples/spell.py | 19 ++++- pythainlp/spell/pn.py | 176 ++++++++++++++++++++++++++++++++---------- 2 files changed, 152 insertions(+), 43 deletions(-) diff --git a/examples/spell.py b/examples/spell.py index 0d39ff07f..9d82c44a7 100644 --- a/examples/spell.py +++ b/examples/spell.py @@ -1,8 +1,21 @@ # -*- coding: utf-8 -*- from pythainlp.spell import spell +from pythainlp.spell.pn import spell as pn_tnc_spell +from pythainlp.spell.pn import correct as pn_tnc_correct +from pythainlp.spell.pn import NorvigSpellChecker +from pythainlp.corpus import ttc -a = spell("สี่เหลียม") -print(a) # ['สี่เหลี่ยม'] +# checker from pythainlp.spell module (generic) +spell("สี่เหลียม") # ['สี่เหลี่ยม'] +# spell("สี่เหลียม", engine="hunspell") # available in some Linux systems -# a = spell("สี่เหลียม", engine="hunspell") # available in some Linux systems +# checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's) +pn_tnc_spell("เหลืยม") +pn_tnc_correct("เหลืยม") + +# checker from pythainlp.spell.pn module (specified algorithm, custom dictionary) +ttc_word_freqs = ttc.get_word_frequency_all() +pn_ttc_spell_checker = NorvigSpellChecker(word_freqs=ttc_word_freqs) +pn_ttc_spell_checker.spell("เหลืยม") +pn_ttc_spell_checker.correct("เหลืยม") diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index ea08f254d..72dbc5a5a 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- """ -Spell checker, using Peter Norvig algorithm + word frequency from Thai National Corpus +Spell checker, using Peter Norvig algorithm. +Spelling dictionary can be customized. +Default spelling dictionary is based on Thai National Corpus. Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html """ @@ -9,7 +11,6 @@ from pythainlp.corpus import tnc from pythainlp.util import is_thaichar - _THAI_CHARS = [ "ก", "ข", @@ -90,39 +91,28 @@ ] -def _keep(word): - """ - Keep only Thai words with length between 2 and 40 characters - """ - if not word or len(word) < 2 or len(word) > 40 or word[0] == ".": - return False - +def _is_thai_and_not_num(word): for ch in word: if ch != "." and not is_thaichar(ch): return False - if ch in "๐๑๒๓๔๕๖๗๘๙": + if ch in "๐๑๒๓๔๕๖๗๘๙0123456789": return False - return True -# TODO: Add spell checker class, so user can provide customized word list -word_freqs = tnc.get_word_frequency_all() -word_freqs = [wf for wf in word_freqs if wf[1] > 1 and _keep(wf[0])] - -_WORDS = Counter(dict(word_freqs)) -_WORDS_TOTAL = sum(_WORDS.values()) - - -def _prob(word, n=_WORDS_TOTAL): +def _keep(wf, min_freq, min_len, max_len, condition_func): """ - Return probability of an input word, according to the corpus + Keep only Thai words with at least min_freq frequency + and has length between min_len and (max_len characters """ - return _WORDS[word] / n + if not wf or wf[1] < min_freq: + return False + word = wf[0] + if not word or len(word) < min_len or len(word) > max_len or word[0] == ".": + return False -def _known(words): - return list(w for w in words if w in _WORDS) + return condition_func(word) def _edits1(word): @@ -145,28 +135,134 @@ def _edits2(word): return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1)) -def spell(word): +class NorvigSpellChecker: + def __init__( + self, + word_freqs=None, + min_freq=2, + min_len=2, + max_len=40, + condition_func=_is_thai_and_not_num, + ): + """ + Initialize Peter Norvig's spell checker object + + :param str word_freqs: A list of tuple (word, frequency) to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words). + :param int min_freq: Minimum frequency of a word to keep (default = 2) + :param int min_len: Minimum length (in characters) of a word to keep (default = 2) + :param int max_len: Maximum length (in characters) of a word to keep (default = 40) + """ + if not word_freqs: # default, use Thai National Corpus + word_freqs = tnc.get_word_frequency_all() + + # filter word list + word_freqs = [ + wf + for wf in word_freqs + if _keep(wf, min_freq, min_len, max_len, condition_func) + ] + + self.__WORDS = Counter(dict(word_freqs)) + self.__WORDS_TOTAL = sum(self.__WORDS.values()) + + def dictionary(self): + """ + Return the spelling dictionary currently used by this spell checker + """ + return self.__WORDS.items() + + def known(self, words): + """ + Return a list of given words that found in the spelling dictionary + + :param str words: A list of words to check if they are in the spelling dictionary + """ + return list(w for w in words if w in self.__WORDS) + + def prob(self, word): + """ + Return probability of an input word, according to the spelling dictionary + + :param str word: A word to check its probability of occurrence + """ + return self.__WORDS[word] / self.__WORDS_TOTAL + + def spell(self, word): + """ + Return a list of possible words, according to edit distance of 1 and 2, + sorted by probability of word occurrance in the spelling dictionary + + :param str word: A word to check its spelling + """ + if not word: + return "" + + candidates = ( + self.known([word]) + or self.known(_edits1(word)) + or self.known(_edits2(word)) + or [word] + ) + candidates.sort(key=self.prob, reverse=True) + + return candidates + + def correct(self, word): + """ + Return the most possible word, using the probability from the spelling dictionary + + :param str word: A word to correct its spelling + """ + if not word: + return "" + + return self.spell(word)[0] + + +DEFAULT_SPELL_CHECKER = NorvigSpellChecker() + + +def dictionary(): """ - Return a list of possible words, according to edit distance of 1 and 2, - sorted by probability of word occurrance + Return the spelling dictionary currently used by this spell checker. + The spelling dictionary is based on words found in the Thai National Corpus. """ - if not word: - return "" + return DEFAULT_SPELL_CHECKER.dictionary() + - candidates = ( - _known([word]) or _known(_edits1(word)) or _known(_edits2(word)) or [word] - ) - candidates.sort(key=_prob, reverse=True) +def known(words): + """ + Return a list of given words that found in the spelling dictionary. + The spelling dictionary is based on words found in the Thai National Corpus. - return candidates + :param str words: A list of words to check if they are in the spelling dictionary + """ + return DEFAULT_SPELL_CHECKER.known(words) -def correction(word): +def prob(word): """ - Return the most possible word, according to probability from the corpus - แสดงคำที่เป็นไปได้มากที่สุด + Return probability of an input word, according to the Thai National Corpus + + :param str word: A word to check its probability of occurrence """ - if not word: - return "" + return DEFAULT_SPELL_CHECKER.prob(word) - return spell(word)[0] + +def spell(word): + """ + Return a list of possible words, according to edit distance of 1 and 2, + sorted by probability of word occurrance in the Thai National Corpus. + + :param str word: A word to check its spelling + """ + return DEFAULT_SPELL_CHECKER.spell(word) + + +def correct(word): + """ + Return the most possible word, according to probability from the Thai National Corpus + + :param str word: A word to correct its spelling + """ + return DEFAULT_SPELL_CHECKER.correct(word) From 5e94b14c72705807d8ccbafe19059b044e0bcf1e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 29 Oct 2018 12:05:02 +0800 Subject: [PATCH 15/16] Add None option for dict_filter, using _no_filter() function. --- examples/spell.py | 2 +- pythainlp/spell/pn.py | 40 +++++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/examples/spell.py b/examples/spell.py index 9d82c44a7..773a122ae 100644 --- a/examples/spell.py +++ b/examples/spell.py @@ -16,6 +16,6 @@ # checker from pythainlp.spell.pn module (specified algorithm, custom dictionary) ttc_word_freqs = ttc.get_word_frequency_all() -pn_ttc_spell_checker = NorvigSpellChecker(word_freqs=ttc_word_freqs) +pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs) pn_ttc_spell_checker.spell("เหลืยม") pn_ttc_spell_checker.correct("เหลืยม") diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 72dbc5a5a..4a79451bb 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -91,6 +91,10 @@ ] +def _no_filter(word): + return True + + def _is_thai_and_not_num(word): for ch in word: if ch != "." and not is_thaichar(ch): @@ -100,19 +104,19 @@ def _is_thai_and_not_num(word): return True -def _keep(wf, min_freq, min_len, max_len, condition_func): +def _keep(word_freq, min_freq, min_len, max_len, dict_filter): """ Keep only Thai words with at least min_freq frequency - and has length between min_len and (max_len characters + and has length between min_len and max_len characters """ - if not wf or wf[1] < min_freq: + if not word_freq or word_freq[1] < min_freq: return False - word = wf[0] + word = word_freq[0] if not word or len(word) < min_len or len(word) > max_len or word[0] == ".": return False - return condition_func(word) + return dict_filter(word) def _edits1(word): @@ -138,32 +142,38 @@ def _edits2(word): class NorvigSpellChecker: def __init__( self, - word_freqs=None, + custom_dict=None, min_freq=2, min_len=2, max_len=40, - condition_func=_is_thai_and_not_num, + dict_filter=_is_thai_and_not_num, ): """ Initialize Peter Norvig's spell checker object - :param str word_freqs: A list of tuple (word, frequency) to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words). + :param str custom_dict: A list of tuple (word, frequency) to create a spelling dictionary. Default is from Thai National Corpus (around 40,000 words). :param int min_freq: Minimum frequency of a word to keep (default = 2) :param int min_len: Minimum length (in characters) of a word to keep (default = 2) :param int max_len: Maximum length (in characters) of a word to keep (default = 40) + :param func dict_filter: A function to filter the dictionary. Default filter removes any word with number or non-Thai characters. If no filter is required, use None. """ - if not word_freqs: # default, use Thai National Corpus - word_freqs = tnc.get_word_frequency_all() + if not custom_dict: # default, use Thai National Corpus + custom_dict = tnc.get_word_frequency_all() + + if dict_filter is None: + dict_filter = _no_filter # filter word list - word_freqs = [ - wf - for wf in word_freqs - if _keep(wf, min_freq, min_len, max_len, condition_func) + custom_dict = [ + word_freq + for word_freq in custom_dict + if _keep(word_freq, min_freq, min_len, max_len, dict_filter) ] - self.__WORDS = Counter(dict(word_freqs)) + self.__WORDS = Counter(dict(custom_dict)) self.__WORDS_TOTAL = sum(self.__WORDS.values()) + if self.__WORDS_TOTAL < 1: + self.__WORDS_TOTAL = 0 def dictionary(self): """ From 0f315b92b45b59a319a46c7db43e9174f71e115a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 29 Oct 2018 12:09:24 +0800 Subject: [PATCH 16/16] Update dict_filter condition --- pythainlp/spell/pn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 4a79451bb..5099ca30f 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -160,7 +160,7 @@ def __init__( if not custom_dict: # default, use Thai National Corpus custom_dict = tnc.get_word_frequency_all() - if dict_filter is None: + if not dict_filter: dict_filter = _no_filter # filter word list