66import hashlib
77import logging
88import os
9- import tempfile
109import platform
1110import re
1211import shutil
12+ import tempfile
1313from pathlib import Path
1414from typing import Dict , List , Optional , Union , Any
1515
@@ -143,29 +143,29 @@ def _load_windows_compatible(self, model_path: Path) -> Any:
143143 :raises DetectError: If all loading strategies fail
144144 """
145145 model_path_str = str (model_path .resolve ())
146-
146+
147147 # Try to load model directly
148148 try :
149149 return fasttext .load_model (model_path_str )
150150 except Exception as e :
151151 logger .debug (f"fast-langdetect: Load model failed: { e } " )
152-
152+
153153 # Try to load model using relative path
154154 try :
155155 cwd = Path .cwd ()
156156 rel_path = os .path .relpath (model_path , cwd )
157157 return fasttext .load_model (rel_path )
158158 except Exception as e :
159159 logger .debug (f"fast-langdetect: Failed to load model using relative path: { e } " )
160-
160+
161161 # Use temporary file as last resort
162162 logger .debug (f"fast-langdetect: Using temporary file to load model: { model_path } " )
163163 tmp_path = None
164164 try :
165165 # Use NamedTemporaryFile to create a temporary file
166166 tmp_fd , tmp_path = tempfile .mkstemp (suffix = '.bin' )
167167 os .close (tmp_fd ) # Close file descriptor
168-
168+
169169 # Copy model file to temporary location
170170 shutil .copy2 (model_path , tmp_path )
171171 return fasttext .load_model (tmp_path )
@@ -203,16 +203,18 @@ class LangDetectConfig:
203203 :param proxy: HTTP proxy for downloads
204204 :param allow_fallback: Whether to fallback to small model
205205 :param disable_verify: Whether to disable MD5 verification
206+ :param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
206207 """
207208
208209 def __init__ (
209- self ,
210- cache_dir : Optional [str ] = None ,
211- custom_model_path : Optional [str ] = None ,
212- proxy : Optional [str ] = None ,
213- allow_fallback : bool = True ,
214- disable_verify : bool = False ,
215- verify_hash : Optional [str ] = None ,
210+ self ,
211+ cache_dir : Optional [str ] = None ,
212+ custom_model_path : Optional [str ] = None ,
213+ proxy : Optional [str ] = None ,
214+ allow_fallback : bool = True ,
215+ disable_verify : bool = False ,
216+ verify_hash : Optional [str ] = None ,
217+ normalize_input : bool = True ,
216218 ):
217219 self .cache_dir = cache_dir or CACHE_DIRECTORY
218220 self .custom_model_path = custom_model_path
@@ -221,9 +223,11 @@ def __init__(
221223 # Only verify large model
222224 self .disable_verify = disable_verify
223225 self .verify_hash = verify_hash
226+ self .normalize_input = normalize_input
224227 if self .custom_model_path and not Path (self .custom_model_path ).exists ():
225228 raise FileNotFoundError (f"fast-langdetect: Target model file not found: { self .custom_model_path } " )
226229
230+
227231class LangDetector :
228232 """Language detector using FastText models."""
229233 VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
@@ -238,6 +242,54 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
238242 self .config = config or LangDetectConfig ()
239243 self ._model_loader = ModelLoader ()
240244
@staticmethod
def _preprocess_text(text: str) -> str:
    """
    Warn about problematic input and replace newlines before prediction.

    Two independent checks are made, each logging a warning through the
    module-level ``logger`` when it triggers:

    - the text is longer than 100 characters (the text is left untouched);
    - the text contains a newline, in which case every newline character
      is replaced with a single space.

    :param text: Input text
    :return: ``text`` with each newline replaced by a space, or the
        original text when it contains no newline
    """
    if len(text) > 100:
        logger.warning(
            "fast-langdetect: Text may be too long. "
            "Consider passing only a single sentence for accurate prediction."
        )
    if "\n" in text:
        # As the message below states, FastText raises an error on input
        # that contains newlines, so they are swapped for spaces here.
        logger.warning(
            "fast-langdetect: Newline characters will be removed. "
            "Input should not contain newline characters. or FastText will raise an error."
        )
        text = text.replace("\n", " ")
    return text
265+
266+ @staticmethod
267+ def _normalize_text (text : str , should_normalize : bool = False ) -> str :
268+ """
269+ Normalize text based on configuration.
270+
271+ Currently, handles:
272+ - Removing newline characters for better prediction
273+ - Lowercasing uppercase text to prevent misdetection as Japanese
274+
275+ :param text: Input text
276+ :param should_normalize: Whether normalization should be applied
277+ :return: Normalized text
278+ """
279+ # If not normalization is needed, return the processed text
280+ if not should_normalize :
281+ return text
282+
283+ # Check if text is all uppercase or mostly uppercase
284+ # https://github.com/LlmKira/fast-langdetect/issues/14
285+ if text .isupper () or (
286+ len (re .findall (r'[A-Z]' , text )) > 0.8 * len (re .findall (r'[A-Za-z]' , text ))
287+ and len (text ) > 5
288+ ):
289+ return text .lower ()
290+
291+ return text
292+
241293 def _get_model (self , low_memory : bool = True ) -> Any :
242294 """Get or load appropriate model."""
243295 cache_key = "low_memory" if low_memory else "high_memory"
@@ -272,7 +324,7 @@ def _get_model(self, low_memory: bool = True) -> Any:
272324 raise DetectError ("Failed to load model" ) from e
273325
274326 def detect (
275- self , text : str , low_memory : bool = True
327+ self , text : str , low_memory : bool = True
276328 ) -> Dict [str , Union [str , float ]]:
277329 """
278330 Detect primary language of text.
@@ -286,8 +338,10 @@ def detect(
286338 DetectError: If detection fails
287339 """
288340 model = self ._get_model (low_memory )
341+ text = self ._preprocess_text (text )
342+ normalized_text = self ._normalize_text (text , self .config .normalize_input )
289343 try :
290- labels , scores = model .predict (text )
344+ labels , scores = model .predict (normalized_text )
291345 return {
292346 "lang" : labels [0 ].replace ("__label__" , "" ),
293347 "score" : min (float (scores [0 ]), 1.0 ),
@@ -297,11 +351,11 @@ def detect(
297351 raise DetectError ("Language detection failed" ) from e
298352
299353 def detect_multilingual (
300- self ,
301- text : str ,
302- low_memory : bool = False ,
303- k : int = 5 ,
304- threshold : float = 0.0 ,
354+ self ,
355+ text : str ,
356+ low_memory : bool = False ,
357+ k : int = 5 ,
358+ threshold : float = 0.0 ,
305359 ) -> List [Dict [str , Any ]]:
306360 """
307361 Detect multiple possible languages in text.
@@ -317,8 +371,10 @@ def detect_multilingual(
317371 DetectError: If detection fails
318372 """
319373 model = self ._get_model (low_memory )
374+ text = self ._preprocess_text (text )
375+ normalized_text = self ._normalize_text (text , self .config .normalize_input )
320376 try :
321- labels , scores = model .predict (text , k = k , threshold = threshold )
377+ labels , scores = model .predict (normalized_text , k = k , threshold = threshold )
322378 results = [
323379 {
324380 "lang" : label .replace ("__label__" , "" ),
@@ -337,78 +393,108 @@ def detect_multilingual(
337393
338394
def detect(
    text: str,
    *,
    low_memory: bool = True,
    model_download_proxy: Optional[str] = None,
    use_strict_mode: bool = False,
    config: Optional[LangDetectConfig] = None,
) -> Dict[str, Union[str, float]]:
    """
    Simple interface for language detection.

    Too long or too short text will affect the accuracy of the prediction.

    The detector is chosen in this order:

    1. If ``config`` is given, a new ``LangDetector`` is built from it and
       the deprecated arguments are ignored.
    2. Otherwise, if ``model_download_proxy`` is not ``None`` or
       ``use_strict_mode`` is true, a deprecation warning is logged and a
       new ``LangDetector`` is built from those arguments.
    3. Otherwise the shared ``_default_detector`` is used.

    :param text: Input text
    :param low_memory: Whether to use memory-efficient model
    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
    :param config: Optional LangDetectConfig object for advanced configuration

    :return: Dictionary with language and confidence score
    """
    # An explicit config takes precedence over everything else.
    if config is not None:
        detector = LangDetector(config)
        return detector.detect(text, low_memory=low_memory)

    # Legacy keyword arguments: still honoured, but flagged as deprecated.
    has_custom_params = model_download_proxy is not None or use_strict_mode
    if has_custom_params:
        logger.warning(
            "fast-langdetect: Using individual parameters is deprecated. "
            "Consider using LangDetectConfig for better configuration management. "
            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
        )
        custom_config = LangDetectConfig(
            proxy=model_download_proxy,
            allow_fallback=not use_strict_mode,
        )
        detector = LangDetector(custom_config)
        return detector.detect(text, low_memory=low_memory)

    # No customisation requested: use the default detector.
    return _default_detector.detect(text, low_memory=low_memory)
372442
373443
def detect_multilingual(
    text: str,
    *,
    low_memory: bool = False,
    model_download_proxy: Optional[str] = None,
    k: int = 5,
    threshold: float = 0.0,
    use_strict_mode: bool = False,
    config: Optional[LangDetectConfig] = None,
) -> List[Dict[str, Any]]:
    """
    Simple interface for multi-language detection.

    Too long or too short text will affect the accuracy of the prediction.

    The detector is chosen in this order:

    1. If ``config`` is given, a new ``LangDetector`` is built from it and
       the deprecated arguments are ignored.
    2. Otherwise, if ``model_download_proxy`` is not ``None`` or
       ``use_strict_mode`` is true, a deprecation warning is logged and a
       new ``LangDetector`` is built from those arguments.
    3. Otherwise the shared ``_default_detector`` is used.

    :param text: Input text
    :param low_memory: Whether to use memory-efficient model
    :param k: Number of top languages to return
    :param threshold: Minimum confidence threshold
    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
    :param config: Optional LangDetectConfig object for advanced configuration

    :return: List of dictionaries with languages and scores
    """
    # An explicit config takes precedence over everything else.
    if config is not None:
        detector = LangDetector(config)
        return detector.detect_multilingual(
            text, low_memory=low_memory, k=k, threshold=threshold
        )

    # Legacy keyword arguments: still honoured, but flagged as deprecated.
    has_custom_params = model_download_proxy is not None or use_strict_mode
    if has_custom_params:
        logger.warning(
            "fast-langdetect: Using individual parameters is deprecated. "
            "Consider using LangDetectConfig for better configuration management. "
            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
        )
        custom_config = LangDetectConfig(
            proxy=model_download_proxy,
            allow_fallback=not use_strict_mode,
        )
        detector = LangDetector(custom_config)
        return detector.detect_multilingual(
            text, low_memory=low_memory, k=k, threshold=threshold
        )

    # No customisation requested: use the default detector.
    return _default_detector.detect_multilingual(
        text, low_memory=low_memory, k=k, threshold=threshold
    )
0 commit comments