@@ -242,6 +242,27 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242242 self .config = config or LangDetectConfig ()
243243 self ._model_loader = ModelLoader ()
244244
@staticmethod
def _preprocess_text(text: str) -> str:
    """
    Warn about problematic input and replace newline characters.

    Two independent checks are made on the raw input:

    * If the text is longer than 100 characters a warning is logged;
      the text itself is left unchanged by this check.
    * If the text contains a newline character a warning is logged and
      every newline is replaced with a single space.

    :param text: Input text
    :return: The input text with each newline replaced by a space
    """
    if len(text) > 100:
        logger.warning(
            "fast-langdetect: Text may be too long. "
            "Consider passing only a single sentence for accurate prediction."
        )
    # Match the bare newline character: matching newline-plus-space would
    # let a newline followed by any other character slip through unchanged.
    if "\n" in text:
        logger.warning(
            "fast-langdetect: Newline characters will be removed. "
            "Input should not contain newline characters. or FastText will raise an error."
        )
        text = text.replace("\n", " ")
    return text
265+
245266 @staticmethod
246267 def _normalize_text (text : str , should_normalize : bool = False ) -> str :
247268 """
@@ -258,7 +279,7 @@ def _normalize_text(text: str, should_normalize: bool = False) -> str:
258279 # If not normalization is needed, return the processed text
259280 if not should_normalize :
260281 return text
261-
282+
262283 # Check if text is all uppercase or mostly uppercase
263284 # https://github.com/LlmKira/fast-langdetect/issues/14
264285 if text .isupper () or (
@@ -317,18 +338,8 @@ def detect(
317338 DetectError: If detection fails
318339 """
319340 model = self ._get_model (low_memory )
341+ text = self ._preprocess_text (text )
320342 normalized_text = self ._normalize_text (text , self .config .normalize_input )
321- if len (normalized_text ) > 100 :
322- logger .warning (
323- "fast-langdetect: Text may be too long. "
324- "Consider passing only a single sentence for accurate prediction."
325- )
326- if "\n " in normalized_text :
327- logger .warning (
328- "fast-langdetect: Input should not contain newline characters. "
329- "Removing them or FastText will raise an error."
330- )
331- normalized_text = normalized_text .replace ("\n " , " " )
332343 try :
333344 labels , scores = model .predict (normalized_text )
334345 return {
@@ -360,6 +371,7 @@ def detect_multilingual(
360371 DetectError: If detection fails
361372 """
362373 model = self ._get_model (low_memory )
374+ text = self ._preprocess_text (text )
363375 normalized_text = self ._normalize_text (text , self .config .normalize_input )
364376 try :
365377 labels , scores = model .predict (normalized_text , k = k , threshold = threshold )
0 commit comments