diff --git a/.travis.yml b/.travis.yml index 8f4edb93f..f04002977 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,10 +4,16 @@ language: python python: - "3.6" + +# workaround to make boto work on travis +# from https://github.com/travis-ci/travis-ci/issues/7940 +before_install: + - sudo rm -f /etc/boto.cfg + # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors install: - pip install -r requirements.txt - - pip install .[icu,ner,pos,tokenize,transliterate] + - pip install .[icu,ipa,ner,thai2vec] - pip install coveralls os: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5ba12656d..dd52500c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,8 +23,8 @@ We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-mod - Write tests for your new features (please see "Tests" topic below); - Always remember that [commented code is dead code](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); -- Name identifiers (variables, classes, functions, module names) with readable - names (`x` is always wrong); +- Name identifiers (variables, classes, functions, module names) with meaningful + and pronounceable names (`x` is always wrong); - When manipulating strings, use [Python's new-style formatting](http://docs.python.org/library/string.html#format-string-syntax) (`'{} = {}'.format(a, b)` instead of `'%s = %s' % (a, b)`); @@ -55,7 +55,7 @@ Happy hacking! (; ## newmm (onecut), mm, TCC, and Thai Soundex Code - Korakot Chaovavanich -## Thai2Vec & ulmfit +## Thai2Vec & ULMFiT - Charin Polpanumas ## Docs diff --git a/README-pypi.md b/README-pypi.md index 70a8a53c2..8141c642e 100644 --- a/README-pypi.md +++ b/README-pypi.md @@ -10,20 +10,14 @@ PyThaiNLP is a Python library for natural language processing (NLP) of Thai language. -PyThaiNLP features include Thai word and subword segmentations, soundex, romanization, part-of-speech taggers, and spelling corrections. - -## What's new in version 1.7 ? - -- Deprecate Python 2 support. (Python 2 compatibility code will be completely dropped in PyThaiNLP 1.8) -- Refactor pythainlp.tokenize.pyicu for readability -- Add Thai NER model to pythainlp.ner -- thai2vec v0.2 - larger vocab, benchmarking results on Wongnai dataset -- Sentiment classifier based on ULMFit and various product review datasets -- Add ULMFit utility to PyThaiNLP -- Add Thai romanization model ThaiTransliterator -- Retrain POS-tagging model -- Improved word_tokenize (newmm, mm) and dict_word_tokenize -- Documentation added +PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers. + +## What's new in version 1.8? + +- New NorvigSpellChecker spell checker class, which can be initialized with a custom dictionary. +- Drop Python 2 support. Remove all Python 2 compatibility code. +- Remove old, obsolete, deprecated, and experimental code. +- See [PyThaiNLP 1.8 change log](https://github.com/PyThaiNLP/pythainlp/issues/118) for details. ## Install diff --git a/README.md b/README.md index ef71bf205..c3399a200 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ Thai Natural Language Processing in Python. PyThaiNLP is a Python package for text processing and linguistic analysis, similar to `nltk` but with focus on Thai language. -PyThaiNLP supports Python 3.4+. -Since version 1.7, PyThaiNLP deprecates its support for Python 2. The future PyThaiNLP 1.8 will completely drop all supports for Python 2. -Python 2 users can still use PyThaiNLP 1.6.
+PyThaiNLP 1.8 supports Python 3.6+. Some functions may work with older versions of Python 3, but they are not well tested and are not supported. See [PyThaiNLP 1.8 change log](https://github.com/PyThaiNLP/pythainlp/issues/118). + +Python 2 users can use PyThaiNLP 1.6, our latest release that was tested with Python 2.7. **This is a document for development branch (post 1.7.x). Things will break. For a document for stable branch, see [master](https://github.com/PyThaiNLP/pythainlp/tree/master).** @@ -34,21 +34,40 @@ Python 2 users can still use PyThaiNLP 1.6. ## Installation -**Using pip** +PyThaiNLP uses PyPI as its main distribution channel; see https://pypi.org/project/pythainlp/ + +### Stable release -Stable release +Standard installation: ```sh $ pip install pythainlp ``` -Development release +For some advanced functionalities, such as word vectors, extra packages may be needed. Install them with these options during pip install: ```sh -$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip +$ pip install pythainlp[extra1,extra2,...] ``` -Note: PyTorch is required for ulmfit sentiment analyser. ```pip install torch``` is needed for the feature. gensim and keras packages may also needed for other modules that rely on these machine learning libraries. +where ```extras``` can be: + - ```artagger``` (to support the artagger part-of-speech tagger) + - ```deepcut``` (to support the deepcut machine-learnt tokenizer) + - ```icu``` (for ICU support in transliteration and tokenization) + - ```ipa``` (for International Phonetic Alphabet support in transliteration) + - ```ml``` (to support ULMFiT models, such as the sentiment analyser) + - ```ner``` (for the named-entity recognizer) + - ```thai2rom``` (for machine-learnt romanization) + - ```thai2vec``` (for Thai word vectors) + - ```full``` (install everything) + +See ```extras``` and ```extras_require``` in [```setup.py```](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py) for details.
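A quick way to check an installation is to exercise a few core functions. The sketch below is illustrative only: the inputs and expected outputs are taken from this changeset's own test suite (`tests/__init__.py`), and the last call assumes the ```thai2vec``` extra is installed and its model data has been downloaded.

```python
# -*- coding: utf-8 -*-
# Minimal smoke test; expected values follow tests/__init__.py in this changeset.
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.transliterate import romanize

# Word segmentation with the default "newmm" engine
print(word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"))
# ['ฉัน', 'รัก', 'ภาษาไทย', 'เพราะ', 'ฉัน', 'เป็น', 'คนไทย']

# Romanization with the default "royin" engine
print(romanize("แมว"))  # 'maeo'

# Part-of-speech tagging with the unigram tagger
print(pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"))
# [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')]

# Word vectors need the "thai2vec" extra (gensim, numpy);
# the model data is fetched on first use.
from pythainlp.word_vector import thai2vec
print(thai2vec.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]))  # 'ไอติม'
```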
+ +Development release: + +```sh +$ pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip +``` ## Documentation diff --git a/appveyor.yml b/appveyor.yml index 00b4e1ae2..808598eae 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -32,7 +32,7 @@ install: # - "set ICU_VERSION=62" - "%PYTHON%/python.exe -m pip install --upgrade pip" - "%PYTHON%/python.exe -m pip install %PYICU_WHEEL%" - - "%PYTHON%/python.exe -m pip install -e .[icu,ner,pos,tokenize,transliterate]" + - "%PYTHON%/python.exe -m pip install -e .[icu,ipa,ner,thai2vec]" test_script: - "%PYTHON%/python.exe -m pip --version" diff --git a/pythainlp/number/wordtonum.py b/pythainlp/number/wordtonum.py index 7184cf61a..871d4c784 100644 --- a/pythainlp/number/wordtonum.py +++ b/pythainlp/number/wordtonum.py @@ -40,11 +40,11 @@ def _thaiword_to_num(tokens): - len_tokens = len(tokens) - - if len_tokens == 0: + if not tokens: return None + len_tokens = len(tokens) + if len_tokens == 1: return _THAI_INT_MAP[tokens[0]] diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py index 19ca3368f..19532f453 100644 --- a/pythainlp/sentiment/ulmfit_sent.py +++ b/pythainlp/sentiment/ulmfit_sent.py @@ -15,6 +15,8 @@ # from fastai.text import multiBatchRNN +__all__ = ["about", "get_sentiment"] + MODEL_NAME = "sent_model" ITOS_NAME = "itos_sent" @@ -29,24 +31,26 @@ def get_path(fname): # load model -model = torch.load(get_path(MODEL_NAME)) -model.eval() +MODEL = torch.load(get_path(MODEL_NAME)) +MODEL.eval() # load itos and stoi itos = pickle.load(open(get_path(ITOS_NAME), "rb")) stoi = defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)}) + # get sentiment; 1 for positive and 0 for negative # or score if specified return_score=True -softmax = lambda x: np.exp(x) / np.sum(np.exp(x)) +def softmax(x): + return np.exp(x) / np.sum(np.exp(x)) def get_sentiment(text, return_score=False): words = word_tokenize(text) tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu() tensor = Variable(tensor, volatile=False) - model.reset() - pred, *_ = model(tensor) + MODEL.reset() + pred, *_ = MODEL(tensor) result = pred.data.cpu().numpy().reshape(-1) if return_score: diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index d60ee950f..7b694375a 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -20,21 +20,30 @@ def pos_tag(words, engine="unigram", corpus="orchid"): * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ + if not words: + return [] + if engine == "perceptron": - from .perceptron import tag as _tag + from .perceptron import tag as tag_ elif engine == "artagger": - def _tag(text, corpus=None): + def tag_(words, corpus=None): + if not words: + return [] + from artagger import Tagger - words = Tagger().tag(" ".join(text)) + words_ = Tagger().tag(" ".join(words)) - return [(word.word, word.tag) for word in words] + return [(word.word, word.tag) for word in words_] else: # default, use "unigram" ("old") engine - from .unigram import tag as _tag + from .unigram import tag as tag_ - return _tag(words, corpus=corpus) + return tag_(words, corpus=corpus) def pos_tag_sents(sentences, engine="unigram", corpus="orchid"): + if not sentences: + return [] + return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences] diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 8d4fe1280..e5dc9e424 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ 
-7,28 +7,33 @@ import dill from pythainlp.corpus import CORPUS_PATH +_ORCHID_DATA_FILENAME = "orchid_pt_tagger.dill" +_PUD_DATA_FILENAME = "ud_thai_pud_pt_tagger.dill" -def orchid_data(): - data_filename = os.path.join(CORPUS_PATH, "orchid_pt_tagger.dill") + +def _load_tagger(filename): + data_filename = os.path.join(CORPUS_PATH, filename) with open(data_filename, "rb") as fh: model = dill.load(fh) return model -def pud_data(): - data_filename = os.path.join(CORPUS_PATH, "ud_thai_pud_pt_tagger.dill") - with open(data_filename, "rb") as fh: - model = dill.load(fh) - return model +_ORCHID_TAGGER = _load_tagger(_ORCHID_DATA_FILENAME) +_PUD_TAGGER = _load_tagger(_PUD_DATA_FILENAME) -def tag(text, corpus="pud"): +def tag(words, corpus="pud"): """ รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('คำ', 'ชนิดคำ'), ('คำ', 'ชนิดคำ'), ...] """ + if not words: + return [] + + words = [word.strip() for word in words if word.strip()] + if corpus == "orchid": - tagger = orchid_data() + tagger = _ORCHID_TAGGER else: # default, use "pud" as a corpus - tagger = pud_data() + tagger = _PUD_TAGGER - return tagger.tag(text) + return tagger.tag(words) diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index 21324bf64..e90c992f0 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -15,26 +15,29 @@ _THAI_POS_PUD_PATH = os.path.join(CORPUS_PATH, _THAI_POS_PUD_FILENAME) -def orchid_data(): +def _orchid_tagger(): with open(_THAI_POS_ORCHID_PATH, encoding="utf-8-sig") as f: model = json.load(f) return model -def pud_data(): +def _pud_tagger(): with open(_THAI_POS_PUD_PATH, "rb") as handle: model = dill.load(handle) return model -def tag(text, corpus): +def tag(words, corpus): """ รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('คำ', 'ชนิดคำ'), ('คำ', 'ชนิดคำ'), ...] 
""" + if not words: + return [] + if corpus == "orchid": - tagger = nltk.tag.UnigramTagger(model=orchid_data()) - return tagger.tag(text) + tagger = nltk.tag.UnigramTagger(model=_orchid_tagger()) + return tagger.tag(words) # default, use "pud" as a corpus - tagger = pud_data() - return tagger.tag(text) + tagger = _pud_tagger() + return tagger.tag(words) diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index e81c3214d..3c97535c0 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -34,15 +34,18 @@ def word_tokenize(text, engine="newmm", whitespaces=True): >>> word_tokenize(text, engine="icu") ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด'] """ + if not text: + return [] + if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "ulmfit": - from .newmm import mmcut + from .newmm import segment as segment_ def segment(text): - return mmcut(text, trie=FROZEN_DICT_TRIE) + return segment_(text, trie=FROZEN_DICT_TRIE) elif engine == "icu": from .pyicu import segment @@ -51,7 +54,7 @@ def segment(text): elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment else: # default, use "newmm" engine - from .newmm import mmcut as segment + from .newmm import segment if not whitespaces: return [token.strip(" ") for token in segment(text) if token.strip(" ")] @@ -73,14 +76,18 @@ def dict_word_tokenize(text, custom_dict, engine="newmm"): >>> dict_word_tokenize("แมวดีดีแมว", trie) ['แมว', 'ดี', 'ดี', 'แมว'] """ + + if not text: + return [] + if engine == "newmm" or engine == "onecut": - from .newmm import mmcut as segment + from .newmm import segment elif engine == "longest" or engine == "longest-matching": from .longest import segment elif engine == "mm" or engine == "multi_cut": from .multi_cut import segment else: # default, use "newmm" engine - from .newmm import mmcut as segment + from .newmm import segment return segment(text, custom_dict) @@ -94,12 +101,16 @@ def sent_tokenize(text, engine="whitespace+newline"): :return: a list of text, split by whitespace or new line. """ + + if not text: + return [] + sentences = [] if engine == "whitespace": sentences = nltk.tokenize.WhitespaceTokenizer().tokenize(text) else: # default, use whitespace + newline - sentences = re.sub(r"\n+|\s+", "|", text).split("|") + sentences = re.sub(r"\n+|\s+", "|", text.strip()).split("|") return sentences @@ -110,6 +121,9 @@ def subword_tokenize(text, engine="tcc"): :param str engine: choosing 'tcc' uses the Thai Character Cluster rule to segment words into the smallest unique units. :return: a list of tokenized strings. 
""" + if not text: + return "" + from .tcc import tcc return tcc(text) @@ -121,6 +135,10 @@ def syllable_tokenize(text): :return: returns list of strings of syllables """ + + if not text: + return [] + tokens = [] if text: words = word_tokenize(text) @@ -171,6 +189,6 @@ def __init__(self, custom_dict=None): self.__trie_dict = Trie(thai_words()) def word_tokenize(self, text, engine="newmm"): - from .newmm import mmcut as segment + from .newmm import segment return segment(text, self.__trie_dict) diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 395e76583..510a1b848 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -7,4 +7,7 @@ def segment(text): + if not text: + return [] + return deepcut.tokenize(text) diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index a90e0b835..5e73b4586 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -27,6 +27,10 @@ def etcc(text): รับ str ส่งออก str """ + + if not text: + return "" + if re.search(r"[เแ]" + _C + r"[" + "".join(_UV) + r"]" + r"\w", text): search = re.findall(r"[เแ]" + _C + r"[" + "".join(_UV) + r"]" + r"\w", text) for i in search: diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py index 483685da2..33ff1fa0a 100644 --- a/pythainlp/tokenize/longest.py +++ b/pythainlp/tokenize/longest.py @@ -35,7 +35,7 @@ _UNKNOWN = False -class Tokenizer(object): +class LongestMatchTokenizer(object): def __init__(self, trie): self.__trie = trie @@ -95,6 +95,9 @@ def __longest_matching(self, text, begin_pos): return "" def __segment_text(self, text): + if not text: + return [] + begin_pos = 0 len_text = len(text) tokens = [] @@ -137,4 +140,5 @@ def segment(text, trie=None): """ตัดคำภาษาไทยด้วยวิธี longest matching""" if not trie: trie = DEFAULT_DICT_TRIE - return Tokenizer(trie).tokenize(text) + + return LongestMatchTokenizer(trie).tokenize(text) diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py index 80f621c27..d161bdf4e 100644 --- a/pythainlp/tokenize/multi_cut.py +++ b/pythainlp/tokenize/multi_cut.py @@ -40,7 +40,7 @@ def __init__(self, value, multi=None, in_dict=True): _PAT_ENG = re.compile(_RE_ENG) -def multicut(text, trie=None): +def _multicut(text, trie=None): """ ส่งคืน LatticeString คืนมาเป็นก้อนๆ """ @@ -95,18 +95,18 @@ def serialize(p, p2): # helper function def mmcut(text): res = [] - for w in multicut(text): + for w in _multicut(text): mm = min(w.multi, key=lambda x: x.count("/")) res.extend(mm.split("/")) return res -def combine(ww): +def _combine(ww): if ww == []: yield "" else: w = ww[0] - for tail in combine(ww[1:]): + for tail in _combine(ww[1:]): if w.unique: yield w + "|" + tail else: @@ -118,13 +118,18 @@ def segment(text, trie=None): """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ - ww = list(multicut(text, trie=trie)) - return ww + if not text: + return [] + + return list(_multicut(text, trie=trie)) def find_all_segment(text, trie=None): """ ใช้ในการหา list ที่สามารถตัดคำได้ทั้งหมด """ - ww = list(multicut(text, trie=trie)) - return list(combine(ww)) + if not text: + return [] + + ww = list(_multicut(text, trie=trie)) + return list(_combine(ww)) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 08fda8628..17815fd9f 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -90,7 +90,11 @@ def onecut(text, trie): # ช่วยให้ไม่ต้องพิมพ์ยาวๆ -def mmcut(text, trie=None): +def segment(text, trie=None): + if not text: + return [] + if not trie: 
trie = DEFAULT_DICT_TRIE + return list(onecut(text, trie)) diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index aefcc9311..23b7b38e4 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -17,5 +17,8 @@ def _gen_words(text): def segment(text): + if not text: + return [] + text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text) return list(_gen_words(text)) diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index bfb5920e9..b50bdb24a 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -48,6 +48,9 @@ def tcc_gen(w): + if not w: + return '' + p = 0 while p < len(w): m = PAT_TCC.match(w[p:]) @@ -60,13 +63,20 @@ def tcc_gen(w): def tcc_pos(text): + if not text: + return set() + p_set = set() p = 0 for w in tcc_gen(text): p += len(w) p_set.add(p) + return p_set -def tcc(w, sep="/"): - return sep.join(tcc_gen(w)) \ No newline at end of file +def tcc(text, sep="/"): + if not text: + return "" + + return sep.join(tcc_gen(text)) diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py index 48bd5cfd2..7ede03197 100644 --- a/pythainlp/transliterate/__init__.py +++ b/pythainlp/transliterate/__init__.py @@ -10,11 +10,17 @@ def romanize(text, engine="royin"): :param str engine: 'royin' (default) or 'thai2rom'. 'royin' uses Thai Royal Institute standard. 'thai2rom' is deep learning Thai romanization (require keras). :return: English (more or less) text that spells out how the Thai text should read. """ + + if not text: + return "" + if engine == "thai2rom": from .thai2rom import romanize + return romanize(text) else: # use default engine "royin" from .royin import romanize + words = word_tokenize(text) romanized_words = [romanize(word) for word in words] return "".join(romanized_words) @@ -26,6 +32,10 @@ def transliterate(text, engine="ipa"): :param str engine: 'ipa' (default) or 'pyicu'. :return: A string of Internaitonal Phonetic Alphabets indicating how the text should read. 
""" + + if not text: + return "" + if engine == "pyicu": from .pyicu import transliterate else: diff --git a/pythainlp/transliterate/royin.py b/pythainlp/transliterate/royin.py index 69a3671d9..e868f10d0 100644 --- a/pythainlp/transliterate/royin.py +++ b/pythainlp/transliterate/royin.py @@ -145,8 +145,9 @@ def _replace_consonants(word, res): lenword = len(res) while i < lenword: if i == 0 and res[0] == "ห": - word = word.replace(res[0], _CONSONANTS[res[0]][0]) - i += 1 + word = word.replace(res[0], "") + del res[0] + lenword -= 1 elif i == 0 and res[0] != "ห": word = word.replace(res[0], _CONSONANTS[res[0]][0]) i += 1 @@ -168,6 +169,9 @@ def _replace_consonants(word, res): def romanize(word): + if not word: + return "" + word2 = _replace_vowels(_normalize(word)) res = re.findall(_RE_CONSONANT, word2) # 2-character word, all consonants diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py index e2b4b1329..0f371e31e 100644 --- a/pythainlp/word_vector/thai2vec.py +++ b/pythainlp/word_vector/thai2vec.py @@ -10,7 +10,7 @@ from pythainlp.tokenize import word_tokenize -def download(): +def _download(): path = get_file("thai2vec02") if not path: download_data("thai2vec02") @@ -20,8 +20,13 @@ def download(): def get_model(): """ - :return: Downloads the `gensim` model.""" - return KeyedVectors.load_word2vec_format(download(), binary=False) + Download model + :return: `gensim` model + """ + return KeyedVectors.load_word2vec_format(_download(), binary=False) + + +_MODEL = get_model() def most_similar_cosmul(positive, negative): @@ -29,28 +34,30 @@ def most_similar_cosmul(positive, negative): การใช้งาน input list """ - return get_model().most_similar_cosmul(positive=positive, negative=negative) + return _MODEL.most_similar_cosmul(positive=positive, negative=negative) def doesnt_match(listdata): - return get_model().doesnt_match(listdata) + return _MODEL.doesnt_match(listdata) def similarity(word1, word2): """ + Get cosine similarity between two words. + If a word is not in the vocabulary, KeyError will be raised. 
:param str word1: first word :param str word2: second word :return: the cosine similarity between the two word vectors """ - return get_model().similarity(word1, word2) + return _MODEL.similarity(word1, word2) def sentence_vectorizer(text, dim=300, use_mean=False): words = word_tokenize(text) vec = np.zeros((1, dim)) for word in words: - if word in get_model().wv.index2word: - vec += get_model().wv.word_vec(word) + if word in _MODEL.wv.index2word: + vec += _MODEL.wv.word_vec(word) else: pass if use_mean: diff --git a/setup.py b/setup.py index 3fa7c5c18..583a5d98a 100644 --- a/setup.py +++ b/setup.py @@ -9,21 +9,25 @@ requirements = f.read().splitlines() extras = { + "artagger": ["artagger"], + "deepcut": ["deepcut", "keras", "tensorflow"], "icu": ["pyicu"], + "ipa": ["epitran"], "ml": ["fastai==0.7.0", "keras", "numpy", "torch"], "ner": ["sklearn_crfsuite"], - "pos": ["artagger"], - "tokenize": ["deepcut", "pyicu"], - "transliterate": ["epitran", "pyicu"], + "thai2rom": ["keras", "numpy"], + "thai2vec": ["gensim", "numpy"], "full": [ "artagger", "deepcut", "epitran", "fastai==0.7.0", + "gensim", "keras", "numpy", "pyicu", "sklearn_crfsuite", + "tensorflow", "torch", ], } diff --git a/tests/__init__.py b/tests/__init__.py index ec4a492d6..12fc36236 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import unittest from collections import Counter +from nltk.corpus import wordnet as wn from pythainlp.collation import collate from pythainlp.corpus import ( @@ -34,11 +35,26 @@ from pythainlp.sentiment import sentiment from pythainlp.soundex import lk82, metasound, soundex, udom83 from pythainlp.spell import correct, spell +from pythainlp.spell.pn import NorvigSpellChecker, dictionary, known, prob from pythainlp.summarize import summarize -from pythainlp.tag import pos_tag, pos_tag_sents -from pythainlp.tokenize import etcc, syllable_tokenize, tcc, word_tokenize +from pythainlp.tag import perceptron, pos_tag, pos_tag_sents, unigram +from pythainlp.tokenize import ( + FROZEN_DICT_TRIE, + dict_word_tokenize, + etcc, + longest, + multi_cut, + newmm, + sent_tokenize, + subword_tokenize, + syllable_tokenize, + tcc, + word_tokenize, +) +from pythainlp.tokenize import pyicu as tokenize_pyicu from pythainlp.transliterate import romanize, transliterate from pythainlp.transliterate.ipa import trans_list, xsampa_list +from pythainlp.transliterate.royin import romanize as romanize_royin from pythainlp.util import ( deletetone, eng_to_thai, @@ -48,6 +64,7 @@ normalize, thai_to_eng, ) +from pythainlp.word_vector import thai2vec class TestUM(unittest.TestCase): @@ -86,10 +103,31 @@ def test_ttc(self): self.assertIsNotNone(ttc.word_freqs()) def test_wordnet(self): + self.assertIsNotNone(wordnet.langs()) + self.assertEqual( wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"] ) - self.assertIsNotNone(wordnet.langs()) + self.assertIsNotNone(wordnet.synsets("นก")) + self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ)) + + self.assertIsNotNone(wordnet.lemmas("นก")) + self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV)) + self.assertIsNotNone(wordnet.lemma("cat.n.01.cat")) + + self.assertEqual(wordnet.morphy("dogs"), "dog") + + bird = wordnet.synset("bird.n.01") + mouse = wordnet.synset("mouse.n.01") + self.assertEqual( + wordnet.path_similarity(bird, mouse), bird.path_similarity(mouse) + ) + self.assertEqual( + wordnet.wup_similarity(bird, mouse), bird.wup_similarity(mouse) + ) + + cat_key = wordnet.synsets("แมว")[0].lemmas()[0].key() + 
self.assertIsNotNone(wordnet.lemma_from_key(cat_key)) # ### pythainlp.date @@ -170,6 +208,7 @@ def test_number(self): ) self.assertEqual(thaiword_to_num("ยี่สิบ"), 20) self.assertEqual(thaiword_to_num("ศูนย์"), 0) + self.assertEqual(thaiword_to_num("ศูนย์อะไรนะ"), 0) self.assertEqual(thaiword_to_num(""), None) self.assertEqual(thaiword_to_num(None), None) @@ -234,13 +273,22 @@ def test_soundex(self): # ### pythainlp.spell def test_spell(self): - self.assertIsNotNone(spell("เน้ร")) - self.assertEqual(spell(""), "") self.assertEqual(spell(None), "") + self.assertEqual(spell(""), "") + self.assertIsNotNone(spell("เน้ร")) + self.assertIsNotNone(spell("เกสมร์")) - self.assertIsNotNone(correct("ทดสอง")) - self.assertEqual(correct(""), "") self.assertEqual(correct(None), "") + self.assertEqual(correct(""), "") + self.assertIsNotNone(correct("ทดสอง")) + + self.assertIsNotNone(dictionary()) + self.assertGreaterEqual(prob("มี"), 0) + self.assertIsNotNone(known(["เกิด", "abc", ""])) + + checker = NorvigSpellChecker(dict_filter="") + self.assertIsNotNone(checker.dictionary()) + self.assertGreaterEqual(checker.prob("มี"), 0) # ### pythainlp.summarize @@ -262,8 +310,19 @@ def test_summarize(self): def test_pos_tag(self): tokens = ["ผม", "รัก", "คุณ"] + + self.assertEqual(pos_tag(None), []) + self.assertEqual(pos_tag([]), []) + self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud")) + self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud")) + + self.assertEqual(unigram.tag(None, corpus="pud"), []) + self.assertEqual(unigram.tag([], corpus="pud"), []) + self.assertEqual(unigram.tag(None, corpus="orchid"), []) + self.assertEqual(unigram.tag([], corpus="orchid"), []) + self.assertEqual( pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"), [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")], @@ -271,10 +330,16 @@ def test_pos_tag(self): self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid")) self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud")) + self.assertEqual(perceptron.tag(None, corpus="pud"), []) + self.assertEqual(perceptron.tag([], corpus="pud"), []) + self.assertEqual(perceptron.tag(None, corpus="orchid"), []) + self.assertEqual(perceptron.tag([], corpus="orchid"), []) - # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="orchid")) - # self.assertIsNotNone(pos_tag(tokens, engine="arttagger", corpus="pud")) + # self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="orchid")) + # self.assertIsNotNone(pos_tag(tokens, engine="artagger", corpus="pud")) + self.assertEqual(pos_tag_sents(None), []) + self.assertEqual(pos_tag_sents([]), []) self.assertEqual( pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]), [ @@ -285,30 +350,88 @@ def test_pos_tag(self): # ### pythainlp.tokenize - def test_syllable_tokenize(self): - self.assertEqual( - syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] + def test_dict_word_tokenize(self): + self.assertEqual(dict_word_tokenize("", custom_dict=FROZEN_DICT_TRIE), []) + self.assertIsNotNone( + dict_word_tokenize("รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="newmm" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", + custom_dict=FROZEN_DICT_TRIE, + engine="longest", + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + 
"รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="mm" + ) + ) + self.assertIsNotNone( + dict_word_tokenize( + "รถไฟฟ้ากรุงเทพBTSหูว์ค์", custom_dict=FROZEN_DICT_TRIE, engine="XX" + ) + ) + + def test_etcc(self): + self.assertEqual(etcc.etcc(""), "") + self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") + self.assertIsNotNone( + etcc.etcc( + "หมูแมวเหล่านี้ด้วยเหตุผลเชื่อมโยงทางกรรมพันธุ์" + + "สัตว์มีแขนขาหน้าหัวเราะเพราะแข็งขืน" + ) ) def test_word_tokenize(self): + self.assertEqual(word_tokenize(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="ulmfit")) + self.assertIsNotNone(word_tokenize("ทดสอบ", engine="XX")) def test_word_tokenize_icu(self): + self.assertEqual(tokenize_pyicu.segment(None), []) + self.assertEqual(tokenize_pyicu.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"), ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"], ) + # def test_word_tokenize_deepcut(self): + # self.assertEqual(deepcut.segment(None), []) + # self.assertEqual(deepcut.segment(""), []) + # self.assertIsNotNone(word_tokenize("ลึกลงไปลลลล", engine="deepcut")) + + def test_word_tokenize_longest_matching(self): + self.assertEqual(longest.segment(None), []) + self.assertEqual(longest.segment(""), []) + self.assertEqual( + word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), + ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], + ) + def test_word_tokenize_mm(self): + self.assertEqual(multi_cut.segment(None), []) + self.assertEqual(multi_cut.segment(""), []) + self.assertEqual(word_tokenize("", engine="mm"), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], ) + self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS")) + def test_word_tokenize_newmm(self): + self.assertEqual(newmm.segment(None), []) + self.assertEqual(newmm.segment(""), []) self.assertEqual( word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"), ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], @@ -326,31 +449,64 @@ def test_word_tokenize_newmm(self): ["จุ๋ม", "ง่วง"], ) - def test_word_tokenize_longest_matching(self): + def test_sent_tokenize(self): + self.assertEqual(sent_tokenize(None), []) + self.assertEqual(sent_tokenize(""), []) self.assertEqual( - word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"), - ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"], + sent_tokenize("รักน้ำ รักปลา ", engine="whitespace"), ["รักน้ำ", "รักปลา"] + ) + self.assertEqual(sent_tokenize("รักน้ำ รักปลา "), ["รักน้ำ", "รักปลา"]) + + def test_subword_tokenize(self): + self.assertEqual(subword_tokenize(None), "") + self.assertEqual(subword_tokenize(""), "") + self.assertIsNotNone(subword_tokenize("สวัสดีดาวอังคาร")) + + def test_syllable_tokenize(self): + self.assertEqual(syllable_tokenize(None), []) + self.assertEqual(syllable_tokenize(""), []) + self.assertEqual( + syllable_tokenize("สวัสดีชาวโลก"), ["สวัส", "ดี", "ชาว", "โลก"] ) def test_tcc(self): + self.assertEqual(tcc.tcc(None), "") + self.assertEqual(tcc.tcc(""), "") self.assertEqual(tcc.tcc("ประเทศไทย"), "ป/ระ/เท/ศ/ไท/ย") - def test_etcc(self): - self.assertEqual(etcc.etcc("คืนความสุข"), "/คืน/ความสุข") + self.assertEqual(list(tcc.tcc_gen("")), []) + self.assertEqual(tcc.tcc_pos(""), set()) # ### 
pythainlp.transliterate def test_romanize(self): + self.assertEqual(romanize(None), "") + self.assertEqual(romanize(""), "") self.assertEqual(romanize("แมว"), "maeo") - self.assertIsNotNone(romanize("กก", engine="royin")) + + self.assertEqual(romanize_royin(None), "") + self.assertEqual(romanize_royin(""), "") + self.assertEqual(romanize_royin("หาย"), "hai") + self.assertEqual(romanize_royin("หยาก"), "yak") + self.assertEqual(romanize("แมว", engine="royin"), "maeo") self.assertEqual(romanize("เดือน", engine="royin"), "duean") self.assertEqual(romanize("ดู", engine="royin"), "du") self.assertEqual(romanize("ดำ", engine="royin"), "dam") self.assertEqual(romanize("บัว", engine="royin"), "bua") + self.assertEqual(romanize("กร", engine="royin"), "kon") + self.assertEqual(romanize("กรร", engine="royin"), "kan") + self.assertEqual(romanize("กรรม", engine="royin"), "kam") + self.assertIsNotNone(romanize("กก", engine="royin")) + self.assertIsNotNone(romanize("ฝ้าย", engine="royin")) + self.assertIsNotNone(romanize("ทีปกร", engine="royin")) + self.assertIsNotNone(romanize("กรม", engine="royin")) + self.assertIsNotNone(romanize("ธรรพ์", engine="royin")) + self.assertIsNotNone(romanize("กฏa์", engine="royin")) # self.assertIsNotNone(romanize("บัว", engine="thai2rom")) def test_transliterate(self): + self.assertEqual(transliterate(""), "") self.assertEqual(transliterate("แมว", "pyicu"), "mæw") self.assertEqual(transliterate("คน", engine="ipa"), "kʰon") self.assertIsNotNone(trans_list("คน")) @@ -384,6 +540,23 @@ def test_keyboard(self): self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ") self.assertEqual(thai_to_eng("สวัสดีครับ"), "l;ylfu8iy[") + # ### pythainlp.word_vector + + def test_thai2vec(self): + self.assertGreaterEqual(thai2vec.similarity("แบคทีเรีย", "คน"), 0) + self.assertIsNotNone(thai2vec.sentence_vectorizer("")) + self.assertIsNotNone(thai2vec.sentence_vectorizer("เสรีภาพในการชุมนุม")) + self.assertIsNotNone( + thai2vec.sentence_vectorizer("เสรีภาพในการสมาคม", use_mean=True) + ) + self.assertIsNotNone(thai2vec.sentence_vectorizer("I คิด therefore I am ผ็ฎ์")) + self.assertEqual( + thai2vec.most_similar_cosmul(["ราชา", "ผู้ชาย"], ["ผู้หญิง"])[0][0], + "ราชินี", + ) + self.assertEqual(thai2vec.doesnt_match(["ญี่ปุ่น", "พม่า", "ไอติม"]), "ไอติม") + self.assertIsNotNone(thai2vec.about()) + if __name__ == "__main__": unittest.main()