|
116 | 116 | help='Data type for kv cache storage. If "auto", will use model '
|
117 | 117 | "data type. fp8 type now supports e5m2.",
|
118 | 118 | )
|
119 | | - |
| 119 | +parser.add_argument( |
| 120 | + "--input-mode", |
| 121 | + default="0", |
| 122 | + choices=["0", "1", "2", "3"], |
| 123 | + type=str, |
| 124 | + help="Input mode for multimodal models. 0: language; 1: vision; 2: speech; 3: vision_speech", |
| 125 | +) |
120 | 126 | args = parser.parse_args()
|
121 | 127 | print(args)
|
122 | 128 |
|
|
185 | 191 | config.lm_head_generation = True
|
186 | 192 | if model_type == "maira2" and not hasattr(config.text_config, "lm_head_generation"):
|
187 | 193 | config.text_config.lm_head_generation = True
|
| 194 | +if re.search("phi4mm", config.architectures[0], re.IGNORECASE): |
| 195 | + model_type = "phi4mm" |
| 196 | + model_class = MODEL_CLASSES[model_type] |
| 197 | + prompt = args.prompt |
| 198 | + _COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN = r"<\|image_\d+\|>" |
| 199 | + _COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN = r"<\|audio_\d+\|>" |
| 200 | + image_in_prompt = len(re.findall(_COMPATIBLE_IMAGE_SPECIAL_TOKEN_PATTERN, prompt)) |
| 201 | + audio_in_prompt = len(re.findall(_COMPATIBLE_AUDIO_SPECIAL_TOKEN_PATTERN, prompt)) |
| 202 | + is_vision = image_in_prompt > 0 |
| 203 | + is_speech = audio_in_prompt > 0 |
| 204 | + audio_batch_size = args.batch_size |
| 205 | + if is_vision: |
| 206 | + assert ( |
| 207 | + image_in_prompt == args.batch_size |
| 208 | + ), "Prompt is invalid. For multiple images, the user needs to \ |
| 209 | + insert multiple image placeholders in the prompt as below: \ |
| 210 | + <|user|><|image_1|><|image_2|><|image_3|>Summarize the content of the images.<|end|><|assistant|>" |
| 211 | + if is_speech: |
| 212 | + if not is_vision: |
| 213 | + assert ( |
| 214 | + audio_in_prompt == args.batch_size |
| 215 | + ), "Prompt is invalid. For multiple audios, the user needs to \ |
| 216 | + insert multiple audio placeholders in the prompt as below: \ |
| 217 | + <|user|><|audio_1|><|audio_2|><|audio_3|>Transcribe the audio clip into text.<|end|><|assistant|>" |
| 218 | + else: |
| 219 | + audio_batch_size = audio_in_prompt |
| 220 | + if not is_vision and not is_speech: |
| 221 | + config.input_mode = 0 |
| 222 | + elif is_vision and not is_speech: |
| 223 | + config.input_mode = 1 |
| 224 | + elif not is_vision and is_speech: |
| 225 | + config.input_mode = 2 |
| 226 | + else: |
| 227 | + config.input_mode = 3 |
188 | 228 |
|
| 229 | + assert config.input_mode == int( |
| 230 | + args.input_mode |
| 231 | + ), "Input mode in prompt is not consistent with the input mode in the command line." |
189 | 232 | if model_type != "llava":
|
| 233 | + config._attn_implementation = "eager" |
190 | 234 | model = model_class[0].from_pretrained(
|
191 | 235 | args.model_id,
|
192 | 236 | torch_dtype=amp_dtype,
|
193 | 237 | config=config,
|
194 | 238 | low_cpu_mem_usage=True if model_type != "maira2" else False,
|
195 | 239 | trust_remote_code=True,
|
| 240 | + attn_implementation="eager", |
196 | 241 | )
|
197 | 242 | tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True)
|
198 | 243 | else:
|
@@ -240,7 +285,9 @@ def load_image(image_file):
|
240 | 285 | image = Image.open(image_file).convert("RGB")
|
241 | 286 | return image
|
242 | 287 |
|
243 | | -elif re.search("mllama", model.config.architectures[0], re.IGNORECASE): |
| 288 | +elif re.search("mllama", model.config.architectures[0], re.IGNORECASE) or re.search( |
| 289 | + "phi4mm", model.config.architectures[0], re.IGNORECASE |
| 290 | +): |
244 | 291 | from PIL import Image
|
245 | 292 |
|
246 | 293 | def load_image(image_file):
|
@@ -280,10 +327,20 @@ def download_and_open(url: str) -> Image.Image:
|
280 | 327 | "jamba", model.config.architectures[0], re.IGNORECASE
|
281 | 328 | ):
|
282 | 329 | model.config.batch_size = int(args.batch_size) * num_beams
|
| 330 | +if re.search("phi4mm", model.config.architectures[0], re.IGNORECASE): |
| 331 | + model.config.batch_size = int(args.batch_size) * num_beams |
| 332 | + model.config.audio_batch_size = audio_batch_size * num_beams |
283 | 333 | if re.search("whisper", model.config.architectures[0], re.IGNORECASE):
|
284 | 334 | import librosa
|
285 | 335 |
|
286 | 336 | sample = librosa.load(args.audio, sr=16000)
|
| 337 | +if re.search("phi4mm", model.config.architectures[0], re.IGNORECASE): |
| 338 | + if config.input_mode in [2, 3]: |
| 339 | + import soundfile |
| 340 | + |
| 341 | + sample = soundfile.read(args.audio) |
| 342 | + else: |
| 343 | + sample = None |
287 | 344 |
|
288 | 345 |
|
289 | 346 | def trace_handler(prof):
|
@@ -347,6 +404,8 @@ def trace_handler(prof):
|
347 | 404 | if hasattr(tokenizer, "process_reporting_input")
|
348 | 405 | else tokenizer.format_and_preprocess_reporting_input
|
349 | 406 | )
|
| 407 | + elif model_type == "phi4mm": |
| 408 | + prompt = args.prompt |
350 | 409 | else:
|
351 | 410 | # input prompt
|
352 | 411 | current_path = pathlib.Path(__file__).parent.resolve()
|
@@ -431,14 +490,26 @@ def trace_handler(prof):
|
431 | 490 | )
|
432 | 491 | input_ids = processed_inputs["input_ids"]
|
433 | 492 | output = model.generate(**processed_inputs, **generate_kwargs)
|
| 493 | + elif model_type == "phi4mm": |
| 494 | + raw_image = load_image(args.image_url) if is_vision else None |
| 495 | + raw_image = [raw_image] * args.batch_size |
| 496 | + samples = [sample] * audio_batch_size |
| 497 | + inputs = tokenizer( |
| 498 | + text=prompt[0], |
| 499 | + images=raw_image if is_vision else None, |
| 500 | + audios=samples if is_speech else None, |
| 501 | + return_tensors="pt", |
| 502 | + ) |
| 503 | + input_ids = inputs["input_ids"] |
| 504 | + output = model.generate(**inputs, **generate_kwargs) |
434 | 505 | else:
|
435 | 506 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids
|
436 | 507 | output = model.generate(input_ids, **generate_kwargs)
|
437 | 508 | gen_ids = output[0] if args.token_latency else output
|
438 | 509 | gen_text = tokenizer.batch_decode(
|
439 | 510 | (
|
440 | 511 | gen_ids[:, input_ids.shape[1] :]
|
441 | | - if model_type in ["llava", "maira2"] |
| 512 | + if model_type in ["llava", "maira2", "phi4mm"] |
442 | 513 | else gen_ids
|
443 | 514 | ),
|
444 | 515 | skip_special_tokens=True,
|
@@ -514,6 +585,17 @@ def trace_handler(prof):
|
514 | 585 | get_grounding=False,
|
515 | 586 | )
|
516 | 587 | output = model.generate(**processed_inputs, **generate_kwargs)
|
| 588 | + elif model_type == "phi4mm": |
| 589 | + raw_image = load_image(args.image_url) if is_vision else None |
| 590 | + raw_image = [raw_image] * args.batch_size |
| 591 | + samples = [sample] * audio_batch_size |
| 592 | + inputs = tokenizer( |
| 593 | + text=prompt[0], |
| 594 | + images=raw_image if is_vision else None, |
| 595 | + audios=samples if is_speech else None, |
| 596 | + return_tensors="pt", |
| 597 | + ) |
| 598 | + output = model.generate(**inputs, **generate_kwargs) |
517 | 599 | else:
|
518 | 600 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids
|
519 | 601 | output = model.generate(input_ids, **generate_kwargs)
|
|