Skip to content

Commit 5728ba9

Browse files
authored
Merge pull request #5 from LlmKira/dev
(feat): Add small model `lid.176.ftz` to library resources, for offline use
2 parents 37aca03 + 1bbf7cd commit 5728ba9

File tree

8 files changed

+187
-95
lines changed

8 files changed

+187
-95
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Facebook. This package is 80x faster than traditional methods and offers 95% acc
1111

1212
It supports Python versions 3.9 to 3.12.
1313

14+
Supports offline usage.
15+
1416
This project builds upon [zafercavdar/fasttext-langdetect](https://github.com/zafercavdar/fasttext-langdetect#benchmark)
1517
with enhancements in packaging.
1618

@@ -51,18 +53,24 @@ model.
5153
> will be predicted as Japanese).
5254
5355
```python
56+
5457
from fast_langdetect import detect, detect_multilingual
5558

5659
# Single language detection
5760
print(detect("Hello, world!"))
5861
# Output: {'lang': 'en', 'score': 0.12450417876243591}
5962

63+
# `use_strict_mode` determines whether the model loading process should enforce strict conditions before using fallback options.
64+
# If `use_strict_mode` is set to True, we will load only the selected model, not the fallback model.
65+
print(detect("Hello, world!", low_memory=False, use_strict_mode=True))
66+
67+
# How to deal with multiline text
6068
multiline_text = """
6169
Hello, world!
6270
This is a multiline text.
6371
But we need to remove `\n` characters or it will raise a ValueError.
6472
"""
65-
multiline_text = multiline_text.replace("\n", "")
73+
multiline_text = multiline_text.replace("\n", "") # NOTE: it's important to remove \n characters
6674
print(detect(multiline_text))
6775
# Output: {'lang': 'en', 'score': 0.8509423136711121}
6876

feature_test/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# -*- coding: utf-8 -*-
22
# @Time : 2024/1/18 上午11:41
33
# @Author : sudoskys
4-
5-
64
from fast_langdetect import detect, detect_multilingual, detect_language
75

86
# 测试繁体,简体,日文,英文,韩文,法文,德文,西班牙文
9-
print(detect_multilingual("Hello, world!你好世界!Привет, мир!",low_memory=False))
10-
print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
7+
print(detect_multilingual("Hello, world!你好世界!Привет, мир!", low_memory=False))
8+
print(
9+
detect_multilingual("Hello, world!你好世界!Привет, мир!", low_memory=True, use_strict_mode=True)
10+
)
1111
# [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
1212
print(detect("hello world"))
1313
print(detect("你好世界"))

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "fast-langdetect"
3-
version = "0.2.1"
3+
version = "0.2.2"
44
description = "Quickly detect text language and segment language"
55
authors = [
66
{ name = "sudoskys", email = "[email protected]" },

src/fast_langdetect/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
# -*- coding: utf-8 -*-
22

3-
from .ft_detect import detect, detect_language, detect_langs, detect_multilingual # noqa: F401
3+
4+
from .ft_detect import detect, detect_language, detect_langs, detect_multilingual # noqa: F401

src/fast_langdetect/ft_detect/infer.py

Lines changed: 145 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -5,130 +5,194 @@
55
# @Software: PyCharm
66
import logging
77
import os
8+
from enum import Enum
89
from pathlib import Path
9-
from typing import Dict, Union, List
10+
from typing import Dict, Union, List, Optional, Any
1011

1112
import fasttext
1213
from robust_downloader import download
1314

1415
logger = logging.getLogger(__name__)
15-
MODELS = {"low_mem": None, "high_mem": None}
16-
FTLANG_CACHE = os.getenv("FTLANG_CACHE", "/tmp/fasttext-langdetect")
16+
CACHE_DIRECTORY = os.getenv("FTLANG_CACHE", "/tmp/fasttext-langdetect")
17+
LOCAL_SMALL_MODEL_PATH = Path(__file__).parent / "resources" / "lid.176.ftz"
1718

19+
# Suppress FastText output if possible
1820
try:
19-
# silences warnings as the package does not properly use the python 'warnings' package
20-
# see https://github.com/facebookresearch/fastText/issues/1056
2121
fasttext.FastText.eprint = lambda *args, **kwargs: None
2222
except Exception:
2323
pass
2424

2525

26+
class ModelType(Enum):
27+
LOW_MEMORY = "low_mem"
28+
HIGH_MEMORY = "high_mem"
29+
30+
31+
class ModelCache:
32+
def __init__(self):
33+
self._models = {}
34+
35+
def get_model(self, model_type: ModelType) -> Optional["fasttext.FastText._FastText"]:
36+
return self._models.get(model_type)
37+
38+
def set_model(self, model_type: ModelType, model: "fasttext.FastText._FastText"):
39+
self._models[model_type] = model
40+
41+
42+
_model_cache = ModelCache()
43+
44+
2645
class DetectError(Exception):
46+
"""Custom exception for language detection errors."""
2747
pass
2848

2949

30-
def get_model_map(low_memory=False):
50+
def load_model(low_memory: bool = False,
51+
download_proxy: Optional[str] = None,
52+
use_strict_mode: bool = False) -> "fasttext.FastText._FastText":
3153
"""
32-
Getting model map
33-
:param low_memory:
34-
:return:
54+
Load the FastText model based on memory preference.
55+
56+
:param low_memory: Indicates whether to load a smaller, memory-efficient model
57+
:param download_proxy: Proxy to use for downloading the large model if necessary
58+
:param use_strict_mode: If enabled, strictly loads large model or raises error if it fails
59+
:return: Loaded FastText model
60+
:raises DetectError: If the model cannot be loaded
3561
"""
36-
if low_memory:
37-
return "low_mem", FTLANG_CACHE, "lid.176.ftz", "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
38-
else:
39-
return "high_mem", FTLANG_CACHE, "lid.176.bin", "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
62+
model_type = ModelType.LOW_MEMORY if low_memory else ModelType.HIGH_MEMORY
4063

64+
# If the model is already loaded, return it
65+
cached_model = _model_cache.get_model(model_type)
66+
if cached_model:
67+
return cached_model
4168

42-
def get_model_loaded(
43-
low_memory: bool = False,
44-
download_proxy: str = None
45-
):
46-
"""
47-
Getting model loaded
48-
:param low_memory:
49-
:param download_proxy:
50-
:return:
51-
"""
52-
mode, cache, name, url = get_model_map(low_memory)
53-
loaded = MODELS.get(mode, None)
54-
if loaded:
55-
return loaded
56-
model_path = os.path.join(cache, name)
57-
if Path(model_path).exists():
58-
if Path(model_path).is_dir():
59-
raise Exception(f"{model_path} is a directory")
69+
def load_local_small_model():
70+
"""Try to load the local small model."""
71+
try:
72+
_loaded_model = fasttext.load_model(str(LOCAL_SMALL_MODEL_PATH))
73+
_model_cache.set_model(ModelType.LOW_MEMORY, _loaded_model)
74+
return _loaded_model
75+
except Exception as e:
76+
logger.error(f"Failed to load the local small model '{LOCAL_SMALL_MODEL_PATH}': {e}")
77+
raise DetectError("Unable to load low-memory model from local resources.")
78+
79+
def load_large_model():
80+
"""Try to load the large model."""
6081
try:
61-
loaded_model = fasttext.load_model(model_path)
62-
MODELS[mode] = loaded_model
82+
loaded_model = fasttext.load_model(str(model_path))
83+
_model_cache.set_model(ModelType.HIGH_MEMORY, loaded_model)
84+
return loaded_model
6385
except Exception as e:
64-
logger.error(f"Error loading model {model_path}: {e}")
65-
download(url=url, folder=cache, filename=name, proxy=download_proxy)
66-
raise e
67-
else:
86+
logger.error(f"Failed to load the large model '{model_path}': {e}")
87+
return None
88+
89+
if low_memory:
90+
# Attempt to load the local small model
91+
return load_local_small_model()
92+
93+
# Path for the large model
94+
large_model_name = "lid.176.bin"
95+
model_path = Path(CACHE_DIRECTORY) / large_model_name
96+
97+
# If the large model is already present, load it
98+
if model_path.exists():
99+
# Model cant be dir
100+
if model_path.is_dir():
101+
try:
102+
model_path.rmdir()
103+
except Exception as e:
104+
logger.error(f"Failed to remove the directory '{model_path}': {e}")
105+
raise DetectError(f"Unexpected directory found in large model file path '{model_path}': {e}")
106+
# Attempt to load large model
107+
loaded_model = load_large_model()
108+
if loaded_model:
109+
return loaded_model
110+
111+
# If the large model is not present, attempt to download (only if necessary)
112+
model_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
113+
try:
114+
logger.info(f"Downloading large model from {model_url} to {model_path}")
115+
download(
116+
url=model_url,
117+
folder=CACHE_DIRECTORY,
118+
filename=large_model_name,
119+
proxy=download_proxy,
120+
retry_max=3,
121+
timeout=20
122+
)
123+
# Try loading the model again after download
124+
loaded_model = load_large_model()
125+
if loaded_model:
68126
return loaded_model
127+
except Exception as e:
128+
logger.error(f"Failed to download the large model: {e}")
69129

70-
download(url=url, folder=cache, filename=name, proxy=download_proxy, retry_max=3, timeout=20)
71-
loaded_model = fasttext.load_model(model_path)
72-
MODELS[mode] = loaded_model
73-
return loaded_model
130+
# Handle fallback logic for strict and non-strict modes
131+
if use_strict_mode:
132+
raise DetectError("Strict mode enabled: Unable to download or load the large model.")
133+
else:
134+
logger.info("Attempting to fall back to local small model.")
135+
return load_local_small_model()
74136

75137

76138
def detect(text: str, *,
77139
low_memory: bool = True,
78-
model_download_proxy: str = None
140+
model_download_proxy: Optional[str] = None,
141+
use_strict_mode: bool = False
79142
) -> Dict[str, Union[str, float]]:
80143
"""
81-
Detect language of text
82-
144+
Detect the language of a text using FastText.
83145
This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
84-
85-
:param text: Text for language detection
86-
:param low_memory: Whether to use low memory mode
87-
:param model_download_proxy: model download proxy
88-
:return: {"lang": "en", "score": 0.99}
89-
:raise ValueError: predict processes one line at a time (remove \'\\n\')
146+
If the model is not supervised, this function will throw a ValueError.
147+
:param text: The text for language detection
148+
:param low_memory: Whether to use a memory-efficient model
149+
:param model_download_proxy: Download proxy for the model if needed
150+
:param use_strict_mode: If enabled, strictly loads large model or raises error if it fails
151+
:return: A dictionary with detected language and confidence score
152+
:raises DetectError: If detection fails
90153
"""
91-
model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
154+
model = load_model(low_memory=low_memory, download_proxy=model_download_proxy, use_strict_mode=use_strict_mode)
92155
labels, scores = model.predict(text)
93-
label = labels[0].replace("__label__", '')
94-
score = min(float(scores[0]), 1.0)
156+
language_label = labels[0].replace("__label__", '')
157+
confidence_score = min(float(scores[0]), 1.0)
95158
return {
96-
"lang": label,
97-
"score": score,
159+
"lang": language_label,
160+
"score": confidence_score,
98161
}
99162

100163

101164
def detect_multilingual(text: str, *,
102165
low_memory: bool = True,
103-
model_download_proxy: str = None,
166+
model_download_proxy: Optional[str] = None,
104167
k: int = 5,
105168
threshold: float = 0.0,
106-
on_unicode_error: str = "strict"
107-
) -> List[dict]:
169+
on_unicode_error: str = "strict",
170+
use_strict_mode: bool = False
171+
) -> List[Dict[str, Any]]:
108172
"""
109-
Given a string, get a list of labels and a list of corresponding probabilities.
110-
k controls the number of returned labels. A choice of 5, will return the 5 most probable labels.
111-
By default this returns only the most likely label and probability. threshold filters the returned labels by a threshold on probability. A choice of 0.5 will return labels with at least 0.5 probability.
112-
k and threshold will be applied together to determine the returned labels.
113-
114-
NOTE:This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
115-
116-
:param text: Text for language detection
117-
:param low_memory: Whether to use low memory mode
118-
:param model_download_proxy: model download proxy
119-
:param k: Predict top k languages
120-
:param threshold: Threshold for prediction
121-
:param on_unicode_error: Error handling
122-
:return:
173+
Detect multiple potential languages and their probabilities in a given text.
174+
k controls the number of returned labels. A choice of 5, will return the 5 most probable labels. By default, this returns only the most likely label and probability. threshold filters the returned labels by a threshold on probability. A choice of 0.5 will return labels with at least 0.5 probability. k and threshold will be applied together to determine the returned labels.
175+
This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed, and the null character.
176+
If the model is not supervised, this function will throw a ValueError.
177+
178+
:param text: The text for language detection
179+
:param low_memory: Whether to use a memory-efficient model
180+
:param model_download_proxy: Proxy for downloading the model
181+
:param k: Number of top language predictions to return
182+
:param threshold: Minimum score threshold for predictions
183+
:param on_unicode_error: Error handling for Unicode errors
184+
:param use_strict_mode: If enabled, strictly loads large model or raises error if it fails
185+
:return: A list of dictionaries, each containing a language and its confidence score
186+
:raises DetectError: If detection fails
123187
"""
124-
model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
125-
labels, scores = model.predict(text=text, k=k, threshold=threshold, on_unicode_error=on_unicode_error)
126-
detect_result = []
188+
model = load_model(low_memory=low_memory, download_proxy=model_download_proxy, use_strict_mode=use_strict_mode)
189+
labels, scores = model.predict(text, k=k, threshold=threshold, on_unicode_error=on_unicode_error)
190+
results = []
127191
for label, score in zip(labels, scores):
128-
label = label.replace("__label__", '')
129-
score = min(float(score), 1.0)
130-
detect_result.append({
131-
"lang": label,
132-
"score": score,
192+
language_label = label.replace("__label__", '')
193+
confidence_score = min(float(score), 1.0)
194+
results.append({
195+
"lang": language_label,
196+
"score": confidence_score,
133197
})
134-
return sorted(detect_result, key=lambda i: i['score'], reverse=True)
198+
return sorted(results, key=lambda x: x['score'], reverse=True)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# License Notice
2+
3+
## File `fast_langdetect/ft_detect/resources/lid.176.ftz`
4+
5+
The models are distributed under
6+
the [Creative Commons Attribution-Share-Alike License 3.0](https://creativecommons.org/licenses/by-sa/3.0/).
7+
8+
## References
9+
10+
https://fasttext.cc/docs/en/language-identification.html
11+
https://creativecommons.org/licenses/by-sa/3.0/
916 KB
Binary file not shown.

tests/test_detect.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,26 @@
66

77

88
def test_muti_detect():
9-
from fast_langdetect.ft_detect import detect_multilingual
10-
result = detect_multilingual("hello world", low_memory=True)
9+
from fast_langdetect import detect_multilingual
10+
result = detect_multilingual("hello world", low_memory=True, use_strict_mode=True)
1111
assert result[0].get("lang") == "en", "ft_detect error"
1212

1313

14+
def test_large():
15+
from fast_langdetect import detect_multilingual
16+
result = detect_multilingual("hello world", low_memory=True, use_strict_mode=True)
17+
assert result[0].get("lang") == "en", "ft_detect error"
18+
result = detect_multilingual("你好世界", low_memory=False, use_strict_mode=True)
19+
assert result[0].get("lang") == "zh", "ft_detect error"
20+
21+
1422
def test_detect():
1523
from fast_langdetect import detect
16-
assert detect("hello world")["lang"] == "en", "ft_detect error"
17-
assert detect("你好世界")["lang"] == "zh", "ft_detect error"
18-
assert detect("こんにちは世界")["lang"] == "ja", "ft_detect error"
19-
assert detect("안녕하세요 세계")["lang"] == "ko", "ft_detect error"
20-
assert detect("Bonjour le monde")["lang"] == "fr", "ft_detect error"
24+
assert detect("hello world", low_memory=False, use_strict_mode=True)["lang"] == "en", "ft_detect error"
25+
assert detect("你好世界", low_memory=True, use_strict_mode=True)["lang"] == "zh", "ft_detect error"
26+
assert detect("こんにちは世界", low_memory=False, use_strict_mode=True)["lang"] == "ja", "ft_detect error"
27+
assert detect("안녕하세요 세계", low_memory=True, use_strict_mode=True)["lang"] == "ko", "ft_detect error"
28+
assert detect("Bonjour le monde", low_memory=False, use_strict_mode=True)["lang"] == "fr", "ft_detect error"
2129

2230

2331
def test_detect_totally():

0 commit comments

Comments
 (0)