Commit f20e0a2

Enable optimized Phi4 and upgrade transformers to v4.48.0 (#3549)
1 parent 2e87152 commit f20e0a2

38 files changed: +11496 -66 lines

docs/tutorials/features/fast_bert.md

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc
 
 ### Prerequisite
 
-- Transformers 4.6.0 ~ 4.46.2
+- Transformers 4.6.0 ~ 4.48.0
 
 ### Usage Example
 

examples/cpu/features/fast_bert/README.md

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 Currently `ipex.fast_bert` API is only well optimized for training. For inference, it ensures functionality, while to get peak perf, please use `ipex.optimize` API + torchscript.
 
 # Prerequisite:
-Transformers 4.6.0 ~ 4.46.2
+Transformers 4.6.0 ~ 4.48.0
 
 # Usage Example:
 Training:
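For orientation, the API this prerequisite applies to is used roughly as in the minimal sketch below; the BERT checkpoint and AdamW optimizer are illustrative assumptions, not part of this commit.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import BertModel

# Illustrative model/optimizer; ipex.fast_bert targets BERT-style training.
model = BertModel.from_pretrained("bert-base-uncased")
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Returns the optimized model and, when one is passed in, the optimizer.
model, optimizer = ipex.fast_bert(model, optimizer=optimizer, dtype=torch.bfloat16)
```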

examples/cpu/llm/fine-tuning/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -6,6 +6,6 @@ black[jupyter]
 datasets
 fire
 peft
-transformers==4.46.2
+transformers==4.48.0
 gradio
 sentencepiece

examples/cpu/llm/inference/README.md

Lines changed: 4 additions & 0 deletions

@@ -43,6 +43,8 @@
 |Phi| microsoft/Phi-3-mini-128k-instruct ||||||
 |Phi| microsoft/Phi-3-medium-4k-instruct ||||||
 |Phi| microsoft/Phi-3-medium-128k-instruct ||||||
+|Phi| microsoft/Phi-4-mini-instruct ||| |||
+|Phi| microsoft/Phi-4-multimodal-instruct ||| |||
 |Whisper| openai/whisper-large-v2 ||||||
 |Maira| microsoft/maira-2 ||| |||
 |Jamba| ai21labs/Jamba-v0.1 ||| |||
@@ -116,6 +118,8 @@ python run.py --help # for more detailed usages
 | generation iterations | use "--num-iter" and "--num-warmup" to control the repeated iterations of generation, default: 100-iter/10-warmup |
 | streaming mode output | greedy search only (work with "--greedy"), use "--streaming" to enable the streaming generation output |
 | KV Cache dtype | default: auto, use "--kv-cache-dtype=fp8_e5m2" to enable e5m2 KV Cache. More information refer to [vLLM FP8 E5M2 KV Cache](https://docs.vllm.ai/en/v0.6.6/quantization/fp8_e5m2_kvcache.html) |
+| input mode | default: 0, use "--input-mode" to choose input mode for multimodal models. 0: language; 1: vision; 2: speech; 3: vision and speech |
+| input audios | default: None, use "--audio" to choose the audio link address for speech tasks |
 
 *Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login).
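For reference, the prompt convention behind the new "--input-mode" option is illustrated below; this is condensed from the placeholder checks added later in this commit, and the batch size of 3 is an arbitrary assumption.

```python
# For vision input (--input-mode 1) with --batch-size 3, the prompt must contain
# one <|image_N|> placeholder per image in the batch.
batch_size = 3
placeholders = "".join(f"<|image_{i}|>" for i in range(1, batch_size + 1))
prompt = f"<|user|>{placeholders}Summarize the content of the images.<|end|><|assistant|>"
# -> <|user|><|image_1|><|image_2|><|image_3|>Summarize the content of the images.<|end|><|assistant|>
```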

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 82 additions & 3 deletions

@@ -221,7 +221,13 @@
     " INT4 weights, scales, zero points, etc. For better accuracy of weight only"
     " quantization with INT4 weight.",
 )
-
+parser.add_argument(
+    "--input-mode",
+    default="0",
+    choices=["0", "1", "2", "3"],
+    type=str,
+    help="Input mode for multimodal models. 0: language; 1: vision; 2: speech; 3: vision_speech",
+)
 
 args = parser.parse_args()
 
@@ -397,6 +403,47 @@ def get_checkpoint_files(model_name_or_path):
 num_beams = 1 if args.greedy else 4
 if model_type in ["git", "llava", "jamba"]:
     config.batch_size = int(args.batch_size) * num_beams
+if re.search("phi4mm", config.architectures[0], re.IGNORECASE):
+    model_type = "phi4mm"
+    model_class = MODEL_CLASSES[model_type]
+    tokenizer = model_class[1].from_pretrained(model_name, trust_remote_code=True)
+    prompt = args.prompt
+    _COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN = r"<\|image_\d+\|>"
+    _COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN = r"<\|audio_\d+\|>"
+    image_in_prompt = len(re.findall(_COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN, prompt))
+    audio_in_prompt = len(re.findall(_COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN, prompt))
+    is_vision = image_in_prompt > 0
+    is_speech = audio_in_prompt > 0
+    audio_batch_size = args.batch_size
+    if is_vision:
+        assert (
+            image_in_prompt == args.batch_size
+        ), "Prompt is invalid. For multiple images, the user needs to insert \
+            multiple image placeholders in the prompt as below: \
+            <|user|><|image_1|><|image_2|><|image_3|>Summarize the content of the images.<|end|><|assistant|>"
+    if is_speech:
+        if not is_vision:
+            assert (
+                audio_in_prompt == args.batch_size
+            ), "Prompt is invalid. For multiple audios, the user needs to insert \
+                multiple audio placeholders in the prompt as below: \
+                <|user|><|audio_1|><|audio_2|><|audio_3|>Transcribe the audio clip into text.<|end|><|assistant|>"
+        else:
+            audio_batch_size = audio_in_prompt
+    if not is_vision and not is_speech:
+        config.input_mode = 0
+    elif is_vision and not is_speech:
+        config.input_mode = 1
+    elif not is_vision and is_speech:
+        config.input_mode = 2
+    else:
+        config.input_mode = 3
+
+    assert config.input_mode == int(
+        args.input_mode
+    ), "Input mode in prompt is not consistent with the input mode in the command line."
+    config.batch_size = int(args.batch_size) * num_beams
+    config.audio_batch_size = audio_batch_size
 # XXX: can't automatically derive dtype via config's `from_pretrained`
 # dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16
 
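The placeholder-to-mode mapping added above can be read as the following standalone sketch; it restates the commit's logic for readability and is not code from the diff.

```python
import re

_IMAGE_TOKEN = r"<\|image_\d+\|>"
_AUDIO_TOKEN = r"<\|audio_\d+\|>"

def detect_input_mode(prompt: str) -> int:
    """Map prompt placeholders to --input-mode: 0 language, 1 vision, 2 speech, 3 vision_speech."""
    has_image = bool(re.search(_IMAGE_TOKEN, prompt))
    has_audio = bool(re.search(_AUDIO_TOKEN, prompt))
    if has_image and has_audio:
        return 3
    if has_audio:
        return 2
    if has_image:
        return 1
    return 0

# Example: a speech-only prompt resolves to mode 2.
assert detect_input_mode("<|user|><|audio_1|>Transcribe the audio clip into text.<|end|><|assistant|>") == 2
```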

@@ -430,13 +477,15 @@ def get_checkpoint_files(model_name_or_path):
     "yuan",
     "whisper",
     "jamba",
+    "phi4mm",
 ]:
     model = model_class[0].from_pretrained(
         model_name,
         config=config,
         low_cpu_mem_usage=True if model_type != "maira2" else False,
         torch_dtype=load_dtype,
         trust_remote_code=True,
+        attn_implementation="eager",
     )
 elif model_type == "maira2":
     model = model_class[0].from_pretrained(
@@ -451,7 +500,9 @@ def get_checkpoint_files(model_name_or_path):
 else:
     model = (
         model_class[0]
-        .from_config(config, trust_remote_code=True)
+        .from_config(
+            config, trust_remote_code=True, attn_implementation="eager"
+        )
         .to(load_dtype)
     )
 
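In isolation, the eager-attention loading pattern introduced here looks roughly like the sketch below. The `AutoModelForCausalLM` class and the bfloat16 dtype are assumptions for illustration; the script itself resolves the class through MODEL_CLASSES and the dtype from its own arguments. `attn_implementation="eager"` is a standard `from_pretrained` argument in recent transformers releases.

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "microsoft/Phi-4-multimodal-instruct"  # listed in the README table above

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,   # assumed dtype for illustration
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    attn_implementation="eager",  # force the eager attention path, as in the diff
)
```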

@@ -769,6 +820,23 @@ def load_image(image_file):
     input_size = inputs["input_ids"].size(dim=1)
     print("---- Prompt size:", input_size)
     inputs = [prompt] * args.batch_size
+elif model_type == "phi4mm":
+    from PIL import Image
+
+    def load_image(image_file):
+        if image_file.startswith("http://") or image_file.startswith("https://"):
+            import requests
+
+            raw_image = Image.open(requests.get(args.image_url, stream=True).raw)
+        else:
+            raw_image = Image.open(image_file)
+        return raw_image
+
+    import soundfile
+
+    sample = soundfile.read(args.audio) if config.input_mode in [2, 3] else None
+    prompt = args.prompt
+    inputs = [prompt] * args.batch_size
 elif model_type == "maira2":
     from PIL import Image
     import requests
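One detail worth noting in the audio path above: `soundfile.read` returns a `(samples, sample_rate)` tuple, and the script keeps that tuple as-is before replicating it to match the audio batch size. A minimal illustration, with a hypothetical file path:

```python
import soundfile

sample = soundfile.read("clip.wav")  # -> (numpy samples array, sample rate)
audio_batch = [sample] * 2           # replicated per audio placeholder in the prompt
```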
@@ -864,6 +932,17 @@ def generate():
             get_grounding=False,
         )
         input_ids = input_tokens["input_ids"]
+    elif model_type == "phi4mm":
+        raw_image = load_image(args.image_url) if is_vision else None
+        raw_image = [raw_image] * args.batch_size
+        samples = [sample] * audio_batch_size
+        input_tokens = tokenizer(
+            text=inputs[0],
+            images=raw_image if is_vision else None,
+            audios=samples if is_speech else None,
+            return_tensors="pt",
+        )
+        input_ids = input_tokens["input_ids"]
     else:
         input_tokens = tokenizer.batch_encode_plus(
             inputs, return_token_type_ids=False, return_tensors="pt"
@@ -890,7 +969,7 @@ def generate():
     gen_text = tokenizer.batch_decode(
         (
             gen_ids[:, input_ids.shape[1] :]
-            if model_type in ["llava", "maira2"]
+            if model_type in ["llava", "maira2", "phi4mm"]
             else gen_ids
         ),
         skip_special_tokens=True,
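The decode change above slices off the prompt tokens before decoding, because for these model types `generate` returns the prompt ids followed by the newly generated ids. A toy illustration with made-up token ids:

```python
import torch

input_ids = torch.tensor([[101, 102, 103]])            # 3 prompt tokens (made-up ids)
gen_ids = torch.tensor([[101, 102, 103, 9001, 9002]])  # prompt + newly generated tokens
new_tokens = gen_ids[:, input_ids.shape[1]:]           # tensor([[9001, 9002]])
```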
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-transformers==4.46.2
+transformers==4.48.0

examples/cpu/llm/inference/run.py

Lines changed: 13 additions & 0 deletions

@@ -218,6 +218,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         " INT4 weights, scales, zero points, etc. For better accuracy of weight only"
         " quantization with INT4 weight.",
     )
+    parser.add_argument(
+        "--input-mode",
+        default="0",
+        choices=["0", "1", "2", "3"],
+        type=str,
+        help="Input mode for multimodal models. 0: language; 1: vision; 2: speech; 3: vision_speech",
+    )
     parser.add_argument(
         "--gptq",
         action="store_true",
@@ -347,6 +354,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
         infer_cmd.extend(["--batch-size", str(args.batch_size)])
         infer_cmd.extend(["--kv-cache-dtype", args.kv_cache_dtype])
+        infer_cmd.extend(["--input-mode", str(args.input_mode)])
         if args.vision_text_model:
             infer_cmd.extend(["--vision-text-model"])
         if args.greedy:
@@ -392,6 +400,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
         quant_cmd.extend(["--output-dir", str(args.output_dir)])
         quant_cmd.extend(["--input-tokens", str(args.input_tokens)])
         quant_cmd.extend(["--max-new-tokens", str(args.max_new_tokens)])
+        quant_cmd.extend(["--input-mode", str(args.input_mode)])
+        quant_cmd.extend(["--batch-size", str(args.batch_size)])
         if args.vision_text_model:
             quant_cmd.extend(["--vision-text-model"])
         if args.config_file is not None:
@@ -529,6 +539,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         infer_cmd.extend(["--num-iter", str(args.num_iter)])
         infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
         infer_cmd.extend(["--batch-size", str(args.batch_size)])
+        infer_cmd.extend(["--input-mode", str(args.input_mode)])
         if args.vision_text_model:
             infer_cmd.extend(["--vision-text-model"])
         if args.quant_with_amp:
@@ -589,6 +600,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         "git": ("/git_local_shard"),
         "yuan": ("/yuan_local_shard"),
         "phi-3": ("/phi-3_local_shard"),
+        "phi4mm": ("/phi4mm_local_shard"),
         "phi": ("/phi_local_shard"),
         "whisper": ("/whisper_local_shard"),
         "maira": ("/maira2_local_shard"),
@@ -646,6 +658,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
         infer_cmd.extend(["--batch-size", str(args.batch_size)])
         infer_cmd.extend(["--kv-cache-dtype", args.kv_cache_dtype])
+        infer_cmd.extend(["--input-mode", str(args.input_mode)])
         if args.local_rank is not None:
             infer_cmd.extend(["--local_rank", str(args.local_rank)])
         if args.greedy:
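These run.py changes follow the launcher's existing pattern: the new flag is appended to the worker command list and forwarded unchanged. A simplified sketch of that pattern (the worker script path and flag values here are assumptions, not the launcher's actual code):

```python
import subprocess
import sys

# Build the worker command the same way the launcher extends infer_cmd/quant_cmd.
infer_cmd = [sys.executable, "single_instance/run_generation.py"]  # assumed worker path
infer_cmd.extend(["--batch-size", "1"])
infer_cmd.extend(["--input-mode", "2"])  # forwarded verbatim from the launcher's --input-mode
subprocess.run(infer_cmd, check=True)
```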

examples/cpu/llm/inference/single_instance/run_generation.py

Lines changed: 85 additions & 3 deletions

@@ -116,7 +116,13 @@
     help='Data type for kv cache storage. If "auto", will use model '
     "data type. fp8 type now supports e5m2.",
 )
-
+parser.add_argument(
+    "--input-mode",
+    default="0",
+    choices=["0", "1", "2", "3"],
+    type=str,
+    help="Input mode for multimodal models. 0: language; 1: vision; 2: speech; 3: vision_speech",
+)
 args = parser.parse_args()
 print(args)
 
@@ -185,14 +191,53 @@
     config.lm_head_generation = True
 if model_type == "maira2" and not hasattr(config.text_config, "lm_head_generation"):
     config.text_config.lm_head_generation = True
+if re.search("phi4mm", config.architectures[0], re.IGNORECASE):
+    model_type = "phi4mm"
+    model_class = MODEL_CLASSES[model_type]
+    prompt = args.prompt
+    _COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN = r"<\|image_\d+\|>"
+    _COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN = r"<\|audio_\d+\|>"
+    image_in_prompt = len(re.findall(_COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN, prompt))
+    audio_in_prompt = len(re.findall(_COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN, prompt))
+    is_vision = image_in_prompt > 0
+    is_speech = audio_in_prompt > 0
+    audio_batch_size = args.batch_size
+    if is_vision:
+        assert (
+            image_in_prompt == args.batch_size
+        ), "Prompt is invalid. For multiple images, the user needs to \
+            insert multiple image placeholders in the prompt as below: \
+            <|user|><|image_1|><|image_2|><|image_3|>Summarize the content of the images.<|end|><|assistant|>"
+    if is_speech:
+        if not is_vision:
+            assert (
+                audio_in_prompt == args.batch_size
+            ), "Prompt is invalid. For multiple audios, the user needs to \
+                insert multiple audio placeholders in the prompt as below: \
+                <|user|><|audio_1|><|audio_2|><|audio_3|>Transcribe the audio clip into text.<|end|><|assistant|>"
+        else:
+            audio_batch_size = audio_in_prompt
+    if not is_vision and not is_speech:
+        config.input_mode = 0
+    elif is_vision and not is_speech:
+        config.input_mode = 1
+    elif not is_vision and is_speech:
+        config.input_mode = 2
+    else:
+        config.input_mode = 3
 
+    assert config.input_mode == int(
+        args.input_mode
+    ), "Input mode in prompt is not consistent with the input mode in the command line."
 if model_type != "llava":
+    config._attn_implementation = "eager"
     model = model_class[0].from_pretrained(
         args.model_id,
         torch_dtype=amp_dtype,
         config=config,
         low_cpu_mem_usage=True if model_type != "maira2" else False,
         trust_remote_code=True,
+        attn_implementation="eager",
     )
     tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True)
 else:
@@ -240,7 +285,9 @@ def load_image(image_file):
         image = Image.open(image_file).convert("RGB")
         return image
 
-elif re.search("mllama", model.config.architectures[0], re.IGNORECASE):
+elif re.search("mllama", model.config.architectures[0], re.IGNORECASE) or re.search(
+    "phi4mm", model.config.architectures[0], re.IGNORECASE
+):
     from PIL import Image
 
     def load_image(image_file):
@@ -280,10 +327,20 @@ def download_and_open(url: str) -> Image.Image:
     "jamba", model.config.architectures[0], re.IGNORECASE
 ):
     model.config.batch_size = int(args.batch_size) * num_beams
+if re.search("phi4mm", model.config.architectures[0], re.IGNORECASE):
+    model.config.batch_size = int(args.batch_size) * num_beams
+    model.config.audio_batch_size = audio_batch_size * num_beams
 if re.search("whisper", model.config.architectures[0], re.IGNORECASE):
     import librosa
 
     sample = librosa.load(args.audio, sr=16000)
+if re.search("phi4mm", model.config.architectures[0], re.IGNORECASE):
+    if config.input_mode in [2, 3]:
+        import soundfile
+
+        sample = soundfile.read(args.audio)
+    else:
+        sample = None
 
 
 def trace_handler(prof):
@@ -347,6 +404,8 @@ def trace_handler(prof):
         if hasattr(tokenizer, "process_reporting_input")
         else tokenizer.format_and_preprocess_reporting_input
     )
+elif model_type == "phi4mm":
+    prompt = args.prompt
 else:
     # input prompt
     current_path = pathlib.Path(__file__).parent.resolve()
@@ -431,14 +490,26 @@ def trace_handler(prof):
         )
         input_ids = processed_inputs["input_ids"]
         output = model.generate(**processed_inputs, **generate_kwargs)
+    elif model_type == "phi4mm":
+        raw_image = load_image(args.image_url) if is_vision else None
+        raw_image = [raw_image] * args.batch_size
+        samples = [sample] * audio_batch_size
+        inputs = tokenizer(
+            text=prompt[0],
+            images=raw_image if is_vision else None,
+            audios=samples if is_speech else None,
+            return_tensors="pt",
+        )
+        input_ids = inputs["input_ids"]
+        output = model.generate(**inputs, **generate_kwargs)
     else:
         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
         output = model.generate(input_ids, **generate_kwargs)
     gen_ids = output[0] if args.token_latency else output
     gen_text = tokenizer.batch_decode(
         (
             gen_ids[:, input_ids.shape[1] :]
-            if model_type in ["llava", "maira2"]
+            if model_type in ["llava", "maira2", "phi4mm"]
             else gen_ids
         ),
         skip_special_tokens=True,
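Putting this hunk's pieces together, the phi4mm vision path boils down to the end-to-end sketch below. The `AutoProcessor`/`AutoModelForCausalLM` classes, the image path, and `max_new_tokens` are assumptions for illustration; in the script the processor and model come from MODEL_CLASSES and the generation settings from the benchmark arguments.

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "microsoft/Phi-4-multimodal-instruct"  # as listed in the README table
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="eager",
)

# Vision-only prompt (--input-mode 1): one <|image_1|> placeholder for one image.
prompt = "<|user|><|image_1|>Summarize the content of the image.<|end|><|assistant|>"
image = Image.open("example.jpg")  # hypothetical local image

inputs = processor(text=prompt, images=[image], return_tensors="pt")
gen_ids = model.generate(**inputs, max_new_tokens=32)
# As in the decode change in the hunk above, drop the prompt tokens before decoding.
text = processor.batch_decode(
    gen_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(text[0])
```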
@@ -514,6 +585,17 @@ def trace_handler(prof):
             get_grounding=False,
         )
         output = model.generate(**processed_inputs, **generate_kwargs)
+    elif model_type == "phi4mm":
+        raw_image = load_image(args.image_url) if is_vision else None
+        raw_image = [raw_image] * args.batch_size
+        samples = [sample] * audio_batch_size
+        inputs = tokenizer(
+            text=prompt[0],
+            images=raw_image if is_vision else None,
+            audios=samples if is_speech else None,
+            return_tensors="pt",
+        )
+        output = model.generate(**inputs, **generate_kwargs)
     else:
         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
         output = model.generate(input_ids, **generate_kwargs)
