NOTE(review): hunk layout restored from a whitespace-collapsed copy of this patch. Blank-line
placement follows the @@ line counts; indentation is inferred (4-space, black style) and should
be confirmed against the target files before applying.

Changes relative to the collapsed original:
  * ort_utils.py: register_abi_ep() skips a library path it has already registered in this
    process, so repeated quantize() calls do not register the same library twice.
  * quantization/quantize.py: the registration check uses the same substring match as
    _prepare_ep_list().
  * __main__.py: --calibration_eps help text gets the missing space before the next sentence
    and quotes 'dml:x'.

diff --git a/examples/windows/onnx_ptq/genai_llm/quantize.py b/examples/windows/onnx_ptq/genai_llm/quantize.py
index 13f6ac80452..713597f4c66 100644
--- a/examples/windows/onnx_ptq/genai_llm/quantize.py
+++ b/examples/windows/onnx_ptq/genai_llm/quantize.py
@@ -27,6 +27,7 @@
 from transformers import AutoConfig, AutoTokenizer
 
 from modelopt.onnx.quantization.int4 import quantize as quantize_int4
+from modelopt.onnx.quantization.ort_utils import register_abi_ep
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -72,7 +73,7 @@ def make_input_shapes_profile_for_ep_list(ep_list, model_name_or_path):
     # Using empty shapes_profile for non-NvTensorRtRtx EPs.
     input_shapes_profile_sequence = []
     for ep in ep_list:
-        if ep == "NvTensorRtRtx":
+        if ep in {"NvTensorRtRtx", "NvTensorRtRtx-abi"}:
             min_shapes, max_shapes, opt_shapes = get_input_shapes_profile(model_name_or_path)
             input_shapes_profile = {
                 "nv_profile_min_shapes": min_shapes,
@@ -304,14 +305,15 @@ def get_calib_inputs(
 
 def parse_calibration_eps(value):
     """Parse and validate the calibration_eps input."""
-    valid_choices = {"cuda", "cpu", "dml", "NvTensorRtRtx"}
+    valid_choices = {"cuda", "cpu", "dml", "NvTensorRtRtx", "NvTensorRtRtx-abi"}
     # Split the input by commas and remove any surrounding whitespace
     eps = [item.strip() for item in value.split(",")]
     # Validate each calibration endpoint
     for ep in eps:
         if ep not in valid_choices:
             raise argparse.ArgumentTypeError(
-                f"Invalid calibration endpoint: '{ep}'. Choose from 'cuda', 'cpu', 'dml', 'NvTensorRtRtx'."
+                f"Invalid calibration endpoint: '{ep}'. Choose from 'cuda', 'cpu', 'dml', "
+                "'NvTensorRtRtx', 'NvTensorRtRtx-abi'."
             )
     return eps
 
@@ -413,8 +415,13 @@ def main(args):
         args.trust_remote_code,
     )
 
+    if "NvTensorRtRtx-abi" in args.calibration_eps:
+        register_abi_ep(args.abi_ep_path)
+
     input_shapes_profile_data = None
-    if "NvTensorRtRtx" in args.calibration_eps and (args.algo not in ["rtn", "rtn_dq"]):
+    if any(ep in args.calibration_eps for ep in {"NvTensorRtRtx", "NvTensorRtRtx-abi"}) and (
+        args.algo not in ["rtn", "rtn_dq"]
+    ):
         # NvTensorRtRtx EP uses (min, max, opt) profile for dynamic shapes in the model's inputs.
         input_shapes_profile_data = make_input_shapes_profile_for_ep_list(
             args.calibration_eps, args.model_name
@@ -607,7 +614,16 @@ def main(args):
         "--calibration_eps",
         type=parse_calibration_eps,  # Use the custom parser
         default=["cuda", "cpu"],  # Default as a list
-        help="Comma-separated list of calibration endpoints. Choose from 'cuda', 'cpu', 'dml', 'NvTensorRtRtx'.",
+        help=(
+            "Comma-separated list of calibration endpoints. Choose from 'cuda', 'cpu', 'dml', "
+            "'NvTensorRtRtx', 'NvTensorRtRtx-abi'."
+        ),
+    )
+    parser.add_argument(
+        "--abi_ep_path",
+        type=str,
+        default=None,
+        help="Path to an external NvTensorRtRtx ABI execution-provider library.",
     )
     parser.add_argument(
         "--trust_remote_code",
diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py
index 4671b99139c..6e61df5da51 100644
--- a/modelopt/onnx/quantization/__main__.py
+++ b/modelopt/onnx/quantization/__main__.py
@@ -62,6 +62,12 @@ def get_parser() -> argparse.ArgumentParser:
     argparser.add_argument(
         "--onnx_path", required=True, type=str, help="Input onnx model without Q/DQ nodes."
     )
+    argparser.add_argument(
+        "--abi_ep_path",
+        required=False,
+        type=str,
+        help="Path to an external NvTensorRtRtx ABI execution-provider library.",
+    )
     argparser.add_argument(
         "--quantize_mode",
         type=str,
@@ -110,7 +116,8 @@ def get_parser() -> argparse.ArgumentParser:
         nargs="+",
         help=(
             "Priority order for the execution providers (EP) to calibrate the model. "
-            "Any subset of ['trt', 'cuda:x', dml:x, 'cpu'], where 'x' is the device id."
+            "Any subset of ['trt', 'cuda:x', 'dml:x', 'cpu', 'NvTensorRtRtx', "
+            "'NvTensorRtRtx-abi'], where 'x' is the device id. "
             "If a custom op is detected in the model, 'trt' will automatically be added to the EP list."
         ),
     )
@@ -507,6 +514,7 @@ def main():
         autotune_warmup_runs=args.autotune_warmup_runs,
         autotune_timing_runs=args.autotune_timing_runs,
         autotune_trtexec_args=args.autotune_trtexec_args,
+        abi_ep_path=args.abi_ep_path,
     )
 
 
diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py
index f7799c634f0..7090c444bcb 100755
--- a/modelopt/onnx/quantization/ort_utils.py
+++ b/modelopt/onnx/quantization/ort_utils.py
@@ -314,6 +314,41 @@ def _check_for_nv_tensorrt_rtx_libs():
     return found
 
 
+# Absolute paths of NvTensorRtRtx ABI EP libraries already registered in this process.
+_REGISTERED_ABI_EP_LIBS: set[str] = set()
+
+
+def _check_for_nv_tensorrt_rtx_abi_libs(ep_path: str):
+    """Validate ``ep_path`` and register it with ORT as the NvTensorRtRtx ABI EP library."""
+    logger.info("Checking for NvTensorRtRtx ABI EP library")
+    if not ep_path:
+        raise FileNotFoundError("Need to provide abi_ep_path to use NvTensorRtRtx-abi")
+    if not os.path.isfile(ep_path):
+        raise FileNotFoundError(f"NvTensorRtRtx ABI EP library not found: {ep_path}")
+
+    ort.register_execution_provider_library("NvTensorRTRTXExecutionProvider", ep_path)
+    return True
+
+
+def register_abi_ep(abi_ep_path: str | None):
+    """Register an external NvTensorRtRtx ABI execution-provider library.
+
+    The library is registered at most once per process: a path that was already registered is
+    skipped, so repeated ``quantize()`` calls do not try to register the same library again.
+
+    Raises:
+        FileNotFoundError: If ``abi_ep_path`` is empty or does not point to an existing file.
+    """
+    ep_path = abi_ep_path or ""
+    if ep_path and os.path.abspath(ep_path) in _REGISTERED_ABI_EP_LIBS:
+        logger.debug("NvTensorRtRtx ABI EP already registered")
+        return
+
+    _check_for_nv_tensorrt_rtx_abi_libs(ep_path)
+    _REGISTERED_ABI_EP_LIBS.add(os.path.abspath(ep_path))
+    logger.debug("Registered NvTensorRtRtx ABI EP")
+
+
 def _prepare_ep_list(calibration_eps: list[str]):
     """Prepares the EP list for ORT from the given user input."""
     logger.debug(f"Preparing execution providers list from: {calibration_eps}")
@@ -334,6 +369,9 @@ def _prepare_ep_list(calibration_eps: list[str]):
         elif "cpu" in ep:
             providers.append("CPUExecutionProvider")
             logger.debug("Added CPU EP")
+        elif "NvTensorRtRtx-abi" in ep:
+            providers.append("NvTensorRTRTXExecutionProvider")
+            logger.debug("Added NvTensorRtRtx ABI EP")
         elif "NvTensorRtRtx" in ep:
             try:
                 _check_for_nv_tensorrt_rtx_libs()
diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py
index 3484140a57c..3c592750fd9 100755
--- a/modelopt/onnx/quantization/quantize.py
+++ b/modelopt/onnx/quantization/quantize.py
@@ -63,7 +63,7 @@
 )
 from modelopt.onnx.quantization.int4 import quantize as quantize_int4
 from modelopt.onnx.quantization.int8 import quantize as quantize_int8
-from modelopt.onnx.quantization.ort_utils import update_trt_ep_support
+from modelopt.onnx.quantization.ort_utils import register_abi_ep, update_trt_ep_support
 from modelopt.onnx.quantization.qdq_utils import (
     qdq_to_dq,
     remove_graph_input_q,
@@ -358,6 +358,7 @@ def quantize(
     simplify: bool = False,
     calibrate_per_node: bool = False,
     input_shapes_profile: Sequence[dict[str, str]] | None = None,
+    abi_ep_path: str | None = None,
     direct_io_types: bool = False,
     opset: int | None = None,
     autotune: bool = False,
@@ -491,6 +492,9 @@ def quantize(
             If None of the calibration_eps require any such shapes profile for model inputs,
             then nothing needs to be set for this "input_shapes_profile" parameter.
             Default value is None.
+        abi_ep_path:
+            Path to an external NvTensorRtRtx ABI execution-provider library. Required when
+            ``NvTensorRtRtx-abi`` is present in ``calibration_eps``.
         direct_io_types:
             If True, modify the I/O types in the quantized ONNX model to be lower precision whenever possible.
             If False, keep the I/O types in the quantized ONNX model the same as in the given ONNX model.
@@ -547,6 +551,9 @@ def quantize(
             "Per node calibration is only supported for int8 and fp8 quantization modes"
         )
 
+    if any("NvTensorRtRtx-abi" in ep for ep in calibration_eps):
+        register_abi_ep(abi_ep_path)
+
     # quantize_static creates a shape-inferred copy at the input model's directory
     # Needs to check if we have write permission to this directory
     assert onnx_path.endswith((".onnx", ".pb"))