@@ -4435,14 +4435,20 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                    tokenizer_pre == "phi2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PHI2;
             } else if (
                     tokenizer_pre == "olmo") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
             } else if (
                     tokenizer_pre == "dbrx") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
             } else {
-                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+                LLAMA_LOG_WARN("%s: Encountered unknown pre-tokenizer type. Falling back to default. \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY MIGHT BE DEGRADED! \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+                // throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
         } else {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -12278,7 +12284,7 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                 word_collection = unicode_regex_split(text, {
                     "[\r\n]",
-                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                     "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
                     "\\s+$",
                     "[一-龥ࠀ-一가-퟿]+",
@@ -12333,6 +12339,7 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_PHI2:
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
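
Net effect of the patch: a model whose pre-tokenizer string is not recognized no longer aborts loading with a `std::runtime_error`; it logs two warnings and falls back to `LLAMA_VOCAB_PRE_TYPE_DEFAULT`. The new `phi2` name gets its own enum value, but the `case LLAMA_VOCAB_PRE_TYPE_PHI2:` label added in the second hunk has no body and falls through to `default`, so phi2 models use the default GPT-2 style BPE pre-tokenization regex. A minimal self-contained sketch of the resulting dispatch follows; the enum layout, the `resolve_pre_type` helper, and the use of `fprintf` in place of `LLAMA_LOG_WARN` are stand-ins for illustration, not the actual llama.cpp declarations:

    #include <cstdio>
    #include <string>

    // Stand-ins for the llama.cpp enum values touched by the patch.
    enum llama_vocab_pre_type {
        LLAMA_VOCAB_PRE_TYPE_DEFAULT,
        LLAMA_VOCAB_PRE_TYPE_QWEN2,
        LLAMA_VOCAB_PRE_TYPE_PHI2,
        LLAMA_VOCAB_PRE_TYPE_OLMO,
        LLAMA_VOCAB_PRE_TYPE_DBRX,
    };

    // Mirrors the patched control flow: known names map to their enum value;
    // unknown names warn and fall back to DEFAULT instead of throwing.
    static llama_vocab_pre_type resolve_pre_type(const std::string & pre) {
        if (pre == "qwen2") { return LLAMA_VOCAB_PRE_TYPE_QWEN2; }
        if (pre == "phi2")  { return LLAMA_VOCAB_PRE_TYPE_PHI2;  }
        if (pre == "olmo")  { return LLAMA_VOCAB_PRE_TYPE_OLMO;  }
        if (pre == "dbrx")  { return LLAMA_VOCAB_PRE_TYPE_DBRX;  }
        std::fprintf(stderr, "%s: Encountered unknown pre-tokenizer type. Falling back to default.\n", __func__);
        std::fprintf(stderr, "%s: GENERATION QUALITY MIGHT BE DEGRADED!\n", __func__);
        return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }

    int main() {
        // An unrecognized name now degrades gracefully instead of aborting the load.
        return resolve_pre_type("some-future-tokenizer") == LLAMA_VOCAB_PRE_TYPE_DEFAULT ? 0 : 1;
    }

The trade-off is availability over strict correctness: models with genuinely incompatible pre-tokenizers still load and run, but may tokenize text incorrectly, which is exactly what the "GENERATION QUALITY MIGHT BE DEGRADED" warning flags.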