Merge pull request #124 from bact/dev

wannaphong · web-flow · commit 1bfe60a7e639 · 2018-10-13T03:18:47.000+07:00
Precompile frequently-used pattern/regexes (PAT_ENG, PAT_TCC, PAT_TWOCHARS)
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
@@ -1,132 +1,102 @@
 # -*- coding: utf-8 -*-
 
-'''ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ TCC
+"""ตัวตัดคำภาษาไทยโดยใช้หลักการ maximal matching และ Thai Character Cluster (TCC)
 พัฒนาโดยคุณ Korakot Chaovavanich
-Notebook : https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
+Notebooks:
+https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
 https://colab.research.google.com/drive/14Ibg-ngZXj15RKwjNwoZlOT32fQBOrBx#scrollTo=MYZ7NzAR7Dmw
-'''
+"""
 from __future__ import absolute_import, unicode_literals
+
 import re
 from collections import defaultdict
-from heapq import heappush, heappop  # for priority queue
+from heapq import heappop, heappush  # for priority queue
+
 from pythainlp.tokenize import DEFAULT_DICT_TRIE
 
+from .tcc import tcc_gen
+
 # ช่วยตัดพวกภาษาอังกฤษ เป็นต้น
-pat_eng = re.compile(r'''(?x)
+PAT_ENG = re.compile(
+    r"""(?x)
 [-a-zA-Z]+|   # english
 \d[\d,\.]*|   # number
 [ \t]+|       # space
 \r?\n         # newline
-''')
-# TCC
-pat_tcc = """\
-เc็c
-เcctาะ
-เccีtยะ
-เccีtย(?=[เ-ไก-ฮ]|$)
-เcc็c
-เcิc์c
-เcิtc
-เcีtยะ?
-เcืtอะ?
-เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
-เctา?ะ?
-cัtวะ
-c[ัื]tc[ุิะ]?
-c[ิุู]์
-c[ะ-ู]t
-c็
-ct[ะาำ]?
-แc็c
-แcc์
-แctะ
-แcc็c
-แccc์
-โctะ
-[เ-ไ]ct
-""".replace('c', '[ก-ฮ]').replace('t', '[่-๋]?').split()
-
-
-def tcc(w):
-    p = 0
-    pat = re.compile("|".join(pat_tcc))
-    while p < len(w):
-        m = pat.match(w[p:])
-        if m:
-            n = m.span()[1]
-        else:
-            n = 1
-        yield w[p:p + n]
-        p += n
+"""
+)
+
+PAT_TWOCHARS = re.compile("[ก-ฮ]{,2}$")
 
 
 def tcc_pos(text):
     p_set = set()
     p = 0
-    for w in tcc(text):
+    for w in tcc_gen(text):
         p += len(w)
         p_set.add(p)
     return p_set
 
+
 def bfs_paths_graph(graph, start, goal):
-  queue = [(start, [start])]
-  while queue:
-    (vertex, path) = queue.pop(0)
-    for next in graph[vertex]:
-      if next == goal:
-        yield path + [next]
-      else:
-        queue.append((next, path+[next]))
+    queue = [(start, [start])]
+    while queue:
+        (vertex, path) = queue.pop(0)
+        for next in graph[vertex]:
+            if next == goal:
+                yield path + [next]
+            else:
+                queue.append((next, path + [next]))
+
 
 def onecut(text, trie):
-  graph = defaultdict(list)  # main data structure
-  allow_pos = tcc_pos(text)     # ตำแหน่งที่ตัด ต้องตรงกับ tcc
-  
-  q = [0]       # min-heap queue
-  last_p = 0    # last position for yield
-  while q[0] < len(text):
-      p = heappop(q)
-
-      for w in trie.prefixes(text[p:]):
-          p_ = p + len(w)
-          if p_ in allow_pos:  # เลือกที่สอดคล้อง tcc
-            graph[p].append(p_)
-            if p_ not in q:
-              heappush(q, p_)   
-
-      # กรณี length 1 คือ ไม่กำกวมแล้ว ส่งผลลัพธ์ก่อนนี้คืนได้
-      if len(q)==1:
-          pp = next(bfs_paths_graph(graph, last_p, q[0]))
-          # เริ่มต้น last_p = pp[0] เอง
-          for p in pp[1:]:
-            yield text[last_p:p]
-            last_p = p
-          # สุดท้าย last_p == q[0] เอง
-
-      # กรณี length 0  คือ ไม่มีใน dict
-      if len(q)==0:
-          m = pat_eng.match(text[p:])
-          if m: # อังกฤษ, เลข, ว่าง
-              i = p + m.end()
-          else: # skip น้อยที่สุด ที่เป็นไปได้
-              for i in range(p+1, len(text)):
-                  if i in allow_pos:   # ใช้ tcc ด้วย
-                      ww = [w for w in trie.prefixes(text[i:]) if (i+len(w) in allow_pos)]
-                      ww = [w for w in ww if not re.match('[ก-ฮ]{,2}$', w)]
-                      m = pat_eng.match(text[i:])
-                      if ww or m:
-                          break
-              else:
-                  i = len(text)
-          w = text[p:i]
-          graph[p].append(i)
-          yield w
-          last_p = i
-          heappush(q, i)
+    graph = defaultdict(list)  # main data structure
+    allow_pos = tcc_pos(text)  # ตำแหน่งที่ตัด ต้องตรงกับ tcc
 
-# ช่วยให้ไม่ต้องพิมพ์ยาวๆ
+    q = [0]  # min-heap queue
+    last_p = 0  # last position for yield
+    while q[0] < len(text):
+        p = heappop(q)
+
+        for w in trie.prefixes(text[p:]):
+            p_ = p + len(w)
+            if p_ in allow_pos:  # เลือกที่สอดคล้อง tcc
+                graph[p].append(p_)
+                if p_ not in q:
+                    heappush(q, p_)
 
+        # กรณี length 1 คือ ไม่กำกวมแล้ว ส่งผลลัพธ์ก่อนนี้คืนได้
+        if len(q) == 1:
+            pp = next(bfs_paths_graph(graph, last_p, q[0]))
+            # เริ่มต้น last_p = pp[0] เอง
+            for p in pp[1:]:
+                yield text[last_p:p]
+                last_p = p
+            # สุดท้าย last_p == q[0] เอง
 
+        # กรณี length 0 คือ ไม่มีใน dict
+        if len(q) == 0:
+            m = PAT_ENG.match(text[p:])
+            if m:  # อังกฤษ, เลข, ว่าง
+                i = p + m.end()
+            else:  # skip น้อยที่สุด ที่เป็นไปได้
+                for i in range(p + 1, len(text)):
+                    if i in allow_pos:  # ใช้ tcc ด้วย
+                        ww = [w for w in trie.prefixes(text[i:]) if (i + len(w) in allow_pos)]
+                        ww = [w for w in ww if not PAT_TWOCHARS.match(w)]
+                        m = PAT_ENG.match(text[i:])
+                        if ww or m:
+                            break
+                else:
+                    i = len(text)
+            w = text[p:i]
+            graph[p].append(i)
+            yield w
+            last_p = i
+            heappush(q, i)
+
+
+# ช่วยให้ไม่ต้องพิมพ์ยาวๆ
 def mmcut(text, trie=None):
     if not trie:
         trie = DEFAULT_DICT_TRIE
diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py
@@ -1,14 +1,21 @@
 ﻿# -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals,print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+
 """
-โปรแกรม TCC ภาษาไทย
-เดติด
-TCC : Mr.Jakkrit TeCho
-grammar : คุณ Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
-โค้ด : คุณ Korakot Chaovavanich
+Separate Thai text into Thai Character Cluster (TCC).
+Based on "Character cluster based Thai information retrieval" (Theeramunkong et al. 2002)
+http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548
+
+Credits:
+- TCC: Jakkrit TeCho
+- Grammar: Wittawat Jitkrittum https://github.com/wittawatj/jtcc/blob/master/TCC.g
+- Python code: Korakot Chaovavanich
 """
-import re
-pat_list = """\
+
+RE_TCC = (
+    """\
 เc็c
 เcctาะ
 เccีtยะ
@@ -33,25 +40,34 @@
 แccc์
 โctะ
 [เ-ไ]ct
-ๆ
-ฯลฯ
-ฯ
-""".replace('c','[ก-ฮ]').replace('t', '[่-๋]?').split()
-def tcc1(w):
+""".replace(
+        "c", "[ก-ฮ]"
+    )
+    .replace("t", "[่-๋]?")
+    .split()
+)
+
+PAT_TCC = re.compile("|".join(RE_TCC))
+
+
+def tcc_gen(w):
     p = 0
-    pat = re.compile("|".join(pat_list))
-    while p<len(w):
-        m = pat.match(w[p:])
+    while p < len(w):
+        m = PAT_TCC.match(w[p:])
         if m:
             n = m.span()[1]
         else:
             n = 1
-        yield w[p:p+n]
+        yield w[p : p + n]
         p += n
-def tcc(w, sep='/'):
-    return sep.join(tcc1(w))
-if __name__ == '__main__':
-    print(tcc('แมวกิน'))
-    print(tcc('ประชาชน'))
-    print(tcc('ขุดหลุม'))
-    print(tcc('ยินดี'))
+
+
+def tcc(w, sep="/"):
+    return sep.join(tcc_gen(w))
+
+
+if __name__ == "__main__":
+    print(tcc("แมวกิน"))
+    print(tcc("ประชาชน"))
+    print(tcc("ขุดหลุม"))
+    print(tcc("ยินดี"))