Skip to content

Commit b1dc90e

Browse files
authored
+Phi2 pre tokenizer template
1 parent e53a9fb commit b1dc90e

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

llama.cpp

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4435,14 +4435,20 @@ static void llm_load_vocab(
44354435
} else if (
44364436
tokenizer_pre == "qwen2") {
44374437
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
4438+
} else if (
4439+
tokenizer_pre == "phi2") {
4440+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PHI2;
44384441
} else if (
44394442
tokenizer_pre == "olmo") {
44404443
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
44414444
} else if (
44424445
tokenizer_pre == "dbrx") {
44434446
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
44444447
} else {
4445-
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4448+
LLAMA_LOG_WARN("%s: Encountered unknown pre-tokenizer type. Falling back to default. \n", __func__);
4449+
LLAMA_LOG_WARN("%s: GENERATION QUALITY MIGHT BE DEGRADED! \n", __func__);
4450+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4451+
// throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
44464452
}
44474453
} else {
44484454
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -12278,7 +12284,7 @@ struct llm_tokenizer_bpe {
1227812284
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
1227912285
word_collection = unicode_regex_split(text, {
1228012286
"[\r\n]",
12281-
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12287+
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
1228212288
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
1228312289
"\\s+$",
1228412290
"[一-龥ࠀ-一가-퟿]+",
@@ -12333,6 +12339,7 @@ struct llm_tokenizer_bpe {
1233312339
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
1233412340
});
1233512341
break;
12342+
case LLAMA_VOCAB_PRE_TYPE_PHI2:
1233612343
default:
1233712344
// default regex for BPE tokenization pre-processing
1233812345
word_collection = unicode_regex_split(text, {

0 commit comments

Comments
 (0)