2424from QEfficient .compile .qnn_compiler import compile as qnn_compile
2525from QEfficient .generation .cloud_infer import QAICInferenceSession
2626from QEfficient .utils import constants , dump_qconfig
27- from QEfficient .utils ._utils import load_json
2827from QEfficient .utils .cache import QEFF_HOME , to_hashable
2928
3029logger = logging .getLogger (__name__ )
@@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
9897 :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
9998 :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
10099 :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
101- :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
100+ :compiler_options: Pass any compiler option as input.
101+ The following flags can be passed in compiler_options to enable the QNN compilation path:
102+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
103+ :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
104+ For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
102105 - aic_num_cores=16 -> -aic-num-cores=16
103106 - convert_to_fp16=True -> -convert-to-fp16
104107
@@ -217,10 +220,13 @@ def _compile(
217220 onnx_path : Optional [str ] = None ,
218221 compile_dir : Optional [str ] = None ,
219222 * ,
223+ mxint8_kv_cache : bool = False ,
220224 specializations : Optional [List [Dict [str , int ]]] = None ,
221225 custom_io : Optional [Dict [str , str ]] = None ,
222226 mdp_ts_num_devices : int = 1 ,
223227 num_speculative_tokens : Optional [int ] = None ,
228+ enable_qnn : Optional [bool ] = False ,
229+ qnn_config : Optional [str ] = None ,
224230 ** compiler_options ,
225231 ) -> str :
226232 """
@@ -229,10 +235,13 @@ def _compile(
229235 Args:
230236 :onnx_path (str): Onnx file to compile
231237 :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
238+ :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
232239 :specializations (list): List of specializations to compile for
233240 :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
234241 :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
235242 :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
243+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
244+ :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
236245 :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
237246 - aic_num_cores=16 -> -aic-num-cores=16
238247 - convert_to_fp16=True -> -convert-to-fp16
@@ -245,6 +254,22 @@ def _compile(
245254 qpc_path = compile_dir / "qpc"
246255 if not onnx_path .is_file ():
247256 raise FileNotFoundError (f"ONNX file not found at: { onnx_path } " )
257+
258+ if enable_qnn :
259+ self .qpc_path = qnn_compile (
260+ onnx_path = onnx_path ,
261+ qpc_base_path = compile_dir ,
262+ specializations = specializations ,
263+ custom_io = custom_io ,
264+ device_group = list (range (mdp_ts_num_devices )),
265+ num_cores = compiler_options .get ("aic_num_cores" , 16 ),
266+ mxfp6 = compiler_options .get ("mxfp6_matmul" , False ),
267+ mxint8 = mxint8_kv_cache ,
268+ qnn_config = qnn_config ,
269+ )
270+
271+ return self .qpc_path
272+
248273 command = constants .COMPILER + [f"-m={ onnx_path } " ]
249274 if mdp_ts_json_path := compiler_options .pop ("mdp_ts_json_path" , None ):
250275 mdp_ts_num_devices = None
@@ -339,104 +364,3 @@ def _compile(
339364 self .qpc_path = qpc_path
340365
341366 return qpc_path
342-
343- @dump_qconfig
344- def _qnn_compile (
345- self ,
346- onnx_path : Optional [str ] = None ,
347- compile_dir : Optional [str ] = None ,
348- * ,
349- specializations : Optional [List [Dict [str , int ]]] = None ,
350- prefill_seq_len : int = 32 ,
351- ctx_len : int = 128 ,
352- batch_size : int = 1 ,
353- full_batch_size : Optional [int ] = None ,
354- mdp_ts_num_devices : int = 1 ,
355- num_cores : int = 16 ,
356- mxfp6_matmul : bool = False ,
357- mxint8_kv_cache : bool = False ,
358- qnn_config : Optional [str ] = None ,
359- kv_cache_batch_size : Optional [int ] = None ,
360- ) -> str :
361- """
362- Interface for QNN compiler
363-
364- Args:
365- :onnx_path (str): Onnx file to compile
366- :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
367- :specializations (list): List of specializations to compile for
368- :prefill_seq_len (int, optional): The length of the Prefill prompt should be less than ``prefill_seq_len``. ``Defaults to 32``.
369- :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
370- :batch_size (int, optional): Batch size. ``Defaults to 1``.
371- :full_batch_size (int, optional): Continuous batching batch size.
372- :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
373- :num_cores (int): Number of cores used to compile the model.
374- :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
375- :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
376- :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
377- :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
378- """
379- if onnx_path is None and self .onnx_path is None :
380- self .export ()
381-
382- onnx_path = Path (onnx_path or self .onnx_path )
383- compile_dir = Path (compile_dir or onnx_path .parent )
384- qpc_path = compile_dir / "qpc"
385- if not onnx_path .is_file ():
386- raise FileNotFoundError (f"ONNX file not found at: { onnx_path } " )
387-
388- compile_hash = hashlib .sha256 (to_hashable ("qnn" ))
389-
390- if specializations is not None :
391- compile_hash .update (to_hashable (specializations ))
392-
393- if qnn_config is not None :
394- qnn_config_values = load_json (qnn_config )
395- compile_hash .update (to_hashable (qnn_config_values ))
396-
397- if mdp_ts_num_devices > 1 :
398- compile_hash .update (to_hashable ({"mdp_ts_num_devices" : mdp_ts_num_devices }))
399-
400- compile_hash .update (to_hashable ({"num_cores" : num_cores }))
401- compile_hash .update (to_hashable ({"mxfp6_matmul" : mxfp6_matmul }))
402- compile_hash .update (to_hashable ({"mxint8_kv_cache" : mxint8_kv_cache }))
403-
404- # Check if already compiled
405- compile_hash = compile_hash .hexdigest ()[:16 ]
406- qpc_path = qpc_path .with_name (qpc_path .name + "-" + compile_hash )
407- if qpc_path .is_dir ():
408- if (qpc_path / "programqpc.bin" ).is_file ():
409- self .qpc_path = qpc_path
410- return qpc_path
411- # Probably compilation failure last time, delete directory to start over
412- shutil .rmtree (qpc_path )
413-
414- # Write specializations.json file
415- if specializations is not None :
416- specializations_json = compile_dir / "specializations.json"
417- with open (specializations_json , "w" ) as fp :
418- json .dump (
419- {"specializations" : [{k : str (v ) for k , v in spec .items ()} for spec in specializations ]},
420- fp ,
421- indent = 4 ,
422- )
423-
424- qnn_compile (
425- onnx_path = onnx_path ,
426- qpc_base_path = compile_dir ,
427- num_cores = num_cores ,
428- device_group = list (range (mdp_ts_num_devices )),
429- batch_size = batch_size ,
430- prompt_len = prefill_seq_len ,
431- ctx_len = ctx_len ,
432- mxfp6 = mxfp6_matmul ,
433- mxint8 = mxint8_kv_cache ,
434- full_batch_size = full_batch_size ,
435- qnn_config = qnn_config ,
436- qnn_binary_dir = qpc_path ,
437- kv_cache_batch_size = kv_cache_batch_size ,
438- )
439-
440- self .qpc_path = qpc_path
441-
442- return qpc_path
0 commit comments