Skip to content

Commit bcdff7f

Browse files
shubhagr-quic and eplatero97
authored and committed
QNN Compilation path Support in QEFFBaseModel class. (quic#374)
This change will facilitate the support of QNN Compilation path for any model class derived from QEFFBaseModel class. --------- Signed-off-by: Shubham Agrawal <[email protected]> Signed-off-by: eplatero <[email protected]>
1 parent 132f19e commit bcdff7f

File tree

14 files changed

+373
-531
lines changed

14 files changed

+373
-531
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 27 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from QEfficient.compile.qnn_compiler import compile as qnn_compile
2525
from QEfficient.generation.cloud_infer import QAICInferenceSession
2626
from QEfficient.utils import constants, dump_qconfig
27-
from QEfficient.utils._utils import load_json
2827
from QEfficient.utils.cache import QEFF_HOME, to_hashable
2928

3029
logger = logging.getLogger(__name__)
@@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
9897
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
9998
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
10099
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
101-
:compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
100+
:compiler_options: Pass any compiler option as input.
101+
Following flag can be passed in compiler_options to enable QNN Compilation path.
102+
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False if not passed.``
103+
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None if not passed.``
104+
For QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
102105
- aic_num_cores=16 -> -aic-num-cores=16
103106
- convert_to_fp16=True -> -convert-to-fp16
104107
@@ -217,10 +220,13 @@ def _compile(
217220
onnx_path: Optional[str] = None,
218221
compile_dir: Optional[str] = None,
219222
*,
223+
mxint8_kv_cache: bool = False,
220224
specializations: Optional[List[Dict[str, int]]] = None,
221225
custom_io: Optional[Dict[str, str]] = None,
222226
mdp_ts_num_devices: int = 1,
223227
num_speculative_tokens: Optional[int] = None,
228+
enable_qnn: Optional[bool] = False,
229+
qnn_config: Optional[str] = None,
224230
**compiler_options,
225231
) -> str:
226232
"""
@@ -229,10 +235,13 @@ def _compile(
229235
Args:
230236
:onnx_path (str): Onnx file to compile
231237
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
238+
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
232239
:specializations (list): List of specializations to compile for
233240
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
234241
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
235242
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
243+
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
244+
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
236245
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
237246
- aic_num_cores=16 -> -aic-num-cores=16
238247
- convert_to_fp16=True -> -convert-to-fp16
@@ -245,6 +254,22 @@ def _compile(
245254
qpc_path = compile_dir / "qpc"
246255
if not onnx_path.is_file():
247256
raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
257+
258+
if enable_qnn:
259+
self.qpc_path = qnn_compile(
260+
onnx_path=onnx_path,
261+
qpc_base_path=compile_dir,
262+
specializations=specializations,
263+
custom_io=custom_io,
264+
device_group=list(range(mdp_ts_num_devices)),
265+
num_cores=compiler_options.get("aic_num_cores", 16),
266+
mxfp6=compiler_options.get("mxfp6_matmul", False),
267+
mxint8=mxint8_kv_cache,
268+
qnn_config=qnn_config,
269+
)
270+
271+
return self.qpc_path
272+
248273
command = constants.COMPILER + [f"-m={onnx_path}"]
249274
if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None):
250275
mdp_ts_num_devices = None
@@ -339,104 +364,3 @@ def _compile(
339364
self.qpc_path = qpc_path
340365

341366
return qpc_path
342-
343-
@dump_qconfig
344-
def _qnn_compile(
345-
self,
346-
onnx_path: Optional[str] = None,
347-
compile_dir: Optional[str] = None,
348-
*,
349-
specializations: Optional[List[Dict[str, int]]] = None,
350-
prefill_seq_len: int = 32,
351-
ctx_len: int = 128,
352-
batch_size: int = 1,
353-
full_batch_size: Optional[int] = None,
354-
mdp_ts_num_devices: int = 1,
355-
num_cores: int = 16,
356-
mxfp6_matmul: bool = False,
357-
mxint8_kv_cache: bool = False,
358-
qnn_config: Optional[str] = None,
359-
kv_cache_batch_size: Optional[int] = None,
360-
) -> str:
361-
"""
362-
Interface for QNN compiler
363-
364-
Args:
365-
:onnx_path (str): Onnx file to compile
366-
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
367-
:specializations (list): List of specializations to compile for
368-
:prefill_seq_len (int, optional): The length of the Prefill prompt should be less than ``prefill_seq_len``. ``Defaults to 32``.
369-
:ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
370-
:batch_size (int, optional): Batch size. ``Defaults to 1``.
371-
:full_batch_size (int, optional): Continuous batching batch size.
372-
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
373-
:num_cores (int): Number of cores used to compile the model.
374-
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
375-
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
376-
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
377-
:kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
378-
"""
379-
if onnx_path is None and self.onnx_path is None:
380-
self.export()
381-
382-
onnx_path = Path(onnx_path or self.onnx_path)
383-
compile_dir = Path(compile_dir or onnx_path.parent)
384-
qpc_path = compile_dir / "qpc"
385-
if not onnx_path.is_file():
386-
raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
387-
388-
compile_hash = hashlib.sha256(to_hashable("qnn"))
389-
390-
if specializations is not None:
391-
compile_hash.update(to_hashable(specializations))
392-
393-
if qnn_config is not None:
394-
qnn_config_values = load_json(qnn_config)
395-
compile_hash.update(to_hashable(qnn_config_values))
396-
397-
if mdp_ts_num_devices > 1:
398-
compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))
399-
400-
compile_hash.update(to_hashable({"num_cores": num_cores}))
401-
compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul}))
402-
compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache}))
403-
404-
# Check if already compiled
405-
compile_hash = compile_hash.hexdigest()[:16]
406-
qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
407-
if qpc_path.is_dir():
408-
if (qpc_path / "programqpc.bin").is_file():
409-
self.qpc_path = qpc_path
410-
return qpc_path
411-
# Probably compilation failure last time, delete directory to start over
412-
shutil.rmtree(qpc_path)
413-
414-
# Write specializations.json file
415-
if specializations is not None:
416-
specializations_json = compile_dir / "specializations.json"
417-
with open(specializations_json, "w") as fp:
418-
json.dump(
419-
{"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
420-
fp,
421-
indent=4,
422-
)
423-
424-
qnn_compile(
425-
onnx_path=onnx_path,
426-
qpc_base_path=compile_dir,
427-
num_cores=num_cores,
428-
device_group=list(range(mdp_ts_num_devices)),
429-
batch_size=batch_size,
430-
prompt_len=prefill_seq_len,
431-
ctx_len=ctx_len,
432-
mxfp6=mxfp6_matmul,
433-
mxint8=mxint8_kv_cache,
434-
full_batch_size=full_batch_size,
435-
qnn_config=qnn_config,
436-
qnn_binary_dir=qpc_path,
437-
kv_cache_batch_size=kv_cache_batch_size,
438-
)
439-
440-
self.qpc_path = qpc_path
441-
442-
return qpc_path

QEfficient/compile/compile_helper.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from typing import List, Optional, Tuple
1414

1515
from QEfficient.compile.qnn_compiler import compile as qnn_compile
16+
from QEfficient.utils._utils import load_json, load_yaml
1617
from QEfficient.utils.logging_utils import logger
1718

1819

@@ -180,36 +181,35 @@ def compile(
180181
full_batch_size=full_batch_size,
181182
)
182183

184+
# Select the customIO config based on the mx flag.
185+
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
186+
187+
if custom_io_file_path is None:
188+
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
189+
190+
if not os.path.isfile(custom_io_file_path):
191+
raise FileNotFoundError(
192+
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
193+
)
194+
183195
if enable_qnn:
184196
qpc_path = qnn_compile(
185197
onnx_path=onnx_path,
186198
qpc_base_path=qpc_path,
199+
qnn_binary_dir=os.path.join(qpc_path, "qpcs"),
187200
num_cores=num_cores,
188-
batch_size=batch_size,
189-
prompt_len=prompt_len,
190-
ctx_len=ctx_len,
191201
mxfp6=mxfp6,
192202
mxint8=mxint8,
193203
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
194204
aic_enable_depth_first=aic_enable_depth_first,
195205
mos=mos,
196206
device_group=device_group,
197-
full_batch_size=full_batch_size,
198207
qnn_config=qnn_config,
208+
specializations=(load_json(specialization_json_path))["specializations"],
209+
custom_io=load_yaml(custom_io_file_path),
199210
)
200211
logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")
201212
else:
202-
# Select the customIO config based on the mx flag.
203-
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
204-
205-
if custom_io_file_path is None:
206-
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
207-
208-
if not os.path.isfile(custom_io_file_path):
209-
raise FileNotFoundError(
210-
f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
211-
)
212-
213213
_, qpc_path = compile_kv_model_on_cloud_ai_100(
214214
onnx_path=onnx_path,
215215
specializations_json=specialization_json_path,

0 commit comments

Comments
 (0)