42 changes: 24 additions & 18 deletions python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -26,11 +26,11 @@ Right click and select **Update Driver** -> **Browse my computer for drivers**.
## 1. Install
### 1.1 Installation on Windows
We suggest using conda to manage the environment:
```bash
```cmd
conda create -n llm python=3.10
conda activate llm

# install ipex-llm with 'npu' option
:: install ipex-llm with 'npu' option
pip install --pre --upgrade ipex-llm[npu]
```

@@ -98,26 +98,26 @@ Supported models: Llama2-7B, MiniCPM-1B, Baichuan2-7B
Supported models: Llama3-8B, MiniCPM-2B, Qwen2-7B, Qwen2-1.5B

### Run
```bash
# to run Llama-2-7b-chat-hf
```cmd
:: to run Llama-2-7b-chat-hf
python llama.py

# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct

# to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py

# to run Qwen2-7B-Instruct (LNL driver version: 32.0.101.2715)
:: to run Qwen2-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct

# to run MiniCPM-1B-sft-bf16
:: to run MiniCPM-1B-sft-bf16
python minicpm.py

# to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
:: to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16

# to run Baichuan2-7B-Chat
:: to run Baichuan2-7B-Chat
python baichuan2.py
```

@@ -137,29 +137,35 @@ If you encounter `TypeError: can't convert meta device type tensor to numpy. Use

#### Output Problem
If you encounter output problems, please try disabling the optimization of transposing the value cache with the following commands:
```bash
# to run Llama-2-7b-chat-hf
```cmd
:: to run Llama-2-7b-chat-hf
python llama.py --disable-transpose-value-cache

# to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715)
python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --disable-transpose-value-cache

# to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py --disable-transpose-value-cache

# to run Qwen2-7B-Instruct (LNL driver version: 32.0.101.2715)
:: to run Qwen2-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct --disable-transpose-value-cache

# to run MiniCPM-1B-sft-bf16
:: to run MiniCPM-1B-sft-bf16
python minicpm.py --disable-transpose-value-cache

# to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
:: to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715)
python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache

# to run Baichuan2-7B-Chat
:: to run Baichuan2-7B-Chat
python baichuan2.py --disable-transpose-value-cache
```

For [Qwen2-7B](./qwen2.py), you could also try to enable mixed precision optimization for better outputs:

```cmd
python qwen2.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct --mixed-precision
```
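
Under the hood, `--mixed-precision` maps to the new `mixed_precision` argument of `AutoModelForCausalLM.from_pretrained` (see the `npu_model.py` changes below). The following is a minimal, hedged sketch of using it from Python directly; the argument values other than `mixed_precision` are illustrative defaults rather than a verbatim copy of the example script:

```python
# Sketch: load Qwen2-7B-Instruct on the NPU with mixed precision quantization.
# With mixed_precision=True, lm_head is quantized to sym_int8 instead of being
# split, while the rest of the model stays in sym_int4.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "Qwen/Qwen2-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    optimize_model=True,
    load_in_low_bit="sym_int4",
    max_output_len=1024,
    max_prompt_len=512,
    transpose_value_cache=True,
    mixed_precision=True,  # the flag introduced in this PR
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```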

#### Better Performance with High CPU Utilization
You could enable an additional optimization by setting the environment variable `IPEX_LLM_CPU_LM_HEAD` with `set IPEX_LLM_CPU_LM_HEAD=1` for better performance, but note that this will cause high CPU utilization.
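
As an illustrative alternative (an assumption not shown in this PR: the flag is read from the environment while the model is being loaded and converted), the variable can also be set from Python before the model is created:

```python
import os

# Sketch: enable the CPU lm_head optimization; set this before loading the
# model, and expect higher CPU utilization as noted above.
os.environ["IPEX_LLM_CPU_LM_HEAD"] = "1"
```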

@@ -43,14 +43,15 @@
If the path does not exist, the low-bit model will be saved there. \
Otherwise, the low-bit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
parser.add_argument('--prompt', type=str, default="AI是什么?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-output-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)
parser.add_argument("--inter-pp", type=int, default=None)
parser.add_argument("--mixed-precision", action='store_true')

args = parser.parse_args()
model_path = args.repo_id_or_model_path
@@ -68,6 +69,7 @@
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
transpose_value_cache=not args.disable_transpose_value_cache,
mixed_precision=args.mixed_precision
)
else:
model = AutoModelForCausalLM.load_low_bit(
12 changes: 9 additions & 3 deletions python/llm/src/ipex_llm/transformers/npu_model.py
@@ -78,6 +78,9 @@ def from_pretrained(cls, *args, **kwargs):
Relevant low bit optimizations will be applied to the model.
:param optimize_model: boolean value, Whether to further optimize the low_bit llm model.
Default to be ``False``.
:param mixed_precision: boolean value, Whether to use mixed precision quantization.
Default to be ``False``. If set to ``True``, we will use ``'sym_int8'`` for lm_head when
``load_in_low_bit`` is ``'sym_int4'`` for certain models.
:return: a model instance
"""
if kwargs.get("device_map", None) not in [None, "cpu", "auto"]:
@@ -108,7 +111,6 @@ def from_pretrained(cls, *args, **kwargs):
ignore_argument(kwargs, "load_in_4bit")
ignore_argument(kwargs, "load_in_8bit")
ignore_argument(kwargs, "imatrix")
ignore_argument(kwargs, "mixed_precision")
ignore_argument(kwargs, "cpu_embedding")
ignore_argument(kwargs, "embedding_qtype")
ignore_argument(kwargs, "enable_mp")
@@ -123,6 +125,7 @@
intra_pp = kwargs.pop("intra_pp", None)
transpose_value_cache = kwargs.pop("transpose_value_cache", True)
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
mixed_precision = kwargs.pop('mixed_precision', False)

_args = copy.deepcopy(args)
_kwargs = copy.deepcopy(kwargs)
@@ -158,7 +161,8 @@ def from_pretrained(cls, *args, **kwargs):
llm = model

with torch.no_grad():
optimize_llm_pre(model, qtype)
model.config.update({"mixed_precision": mixed_precision})
optimize_llm_pre(model, qtype, mixed_precision)
cls.load_convert(qtype, model, "cpu", modules_to_not_convert, *args, **kwargs)
create_npu_kernels(llm)
model = model.eval()
@@ -209,6 +213,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
ignore_argument(kwargs, "embedding_qtype")
ignore_argument(kwargs, "speculative")
ignore_argument(kwargs, "pipeline_parallel_stages")
ignore_argument(kwargs, "mixed_precision")
optimize_model = kwargs.pop("optimize_model", False)
max_output_len = kwargs.pop("max_output_len", 1024)
max_prompt_len = kwargs.pop("max_prompt_len", 512)
@@ -258,6 +263,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path)
qtype = config_dict.pop("bigdl_transformers_low_bit", False)
bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
mixed_precision = config_dict.pop("mixed_precision", False)

invalidInputError(
qtype,
@@ -370,7 +376,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
llm = model

with torch.no_grad():
optimize_llm_pre(model, qtype)
optimize_llm_pre(model, qtype, mixed_precision)
cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
*model_args, **kwargs)
create_npu_kernels(llm)
7 changes: 5 additions & 2 deletions python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -29,7 +29,7 @@ def convert_forward(m, target_m, new_forward):
convert_forward(sub_m, target_m, new_forward)


def optimize_llm_pre(model: torch.nn.Module, qtype):
def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision):
if model.config.model_type == "baichuan":
# process NormHead module in Baichuan2 7B
if hasattr(model, 'lm_head') and model.lm_head is not None:
@@ -92,7 +92,10 @@ def optimize_llm_pre(model: torch.nn.Module, qtype):
# for Qwen2-7B-Instruct, divide lm_head into 14 parts
if model.config.hidden_size == 3584 and model.config.vocab_size == 152064 and \
not cpu_lm_head:
new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=14,
# Do not split lm_head (sym_int8 will be used for it instead) when mixed_precision is True
is_split = (not mixed_precision) and qtype == "sym_int4_rtn"
split_num = 14 if is_split else 1
new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num,
bias=model.lm_head.bias)
del model.lm_head
model.lm_head = new_lm_head
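
# Hypothetical illustration (not part of this PR): the decision above boils down
# to the following rule for Qwen2-7B's lm_head.
def _lm_head_split_num(mixed_precision: bool, qtype: str) -> int:
    # Split into 14 slices only for plain sym_int4_rtn without mixed precision;
    # otherwise keep lm_head whole (with mixed precision it is quantized to sym_int8).
    return 14 if (not mixed_precision and qtype == "sym_int4_rtn") else 1

assert _lm_head_split_num(False, "sym_int4_rtn") == 14
assert _lm_head_split_num(True, "sym_int4_rtn") == 1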