Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
88cf1b2
Rebased to main
Oleg-Goncharov Jan 21, 2026
ac23f06
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 21, 2026
44ec5ba
Merge branch 'main' into pr_mxfp8_grouped_kernel
Oleg-Goncharov Jan 21, 2026
99f1f63
Fixed the year to 2026
Oleg-Goncharov Jan 21, 2026
7415138
Added compilation guards
Oleg-Goncharov Jan 21, 2026
adacda9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 21, 2026
39bb24f
Added BWD pass
Oleg-Goncharov Jan 22, 2026
02c05a6
Merge branch 'main' into pr_mxfp8_grouped_kernel
Oleg-Goncharov Jan 22, 2026
452651a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 22, 2026
9da18bf
Merge branch 'main' into pr_mxfp8_grouped_kernel
vthumbe1503 Jan 23, 2026
e8beb1e
Added dbias and dact tests. Refactoring.
Oleg-Goncharov Jan 23, 2026
b3f8468
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 23, 2026
1235167
Added grouped MXFP8 DACT and ACT API and tests
Oleg-Goncharov Jan 24, 2026
34b9dfd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 24, 2026
6dd3814
Fixed a typo
Oleg-Goncharov Jan 24, 2026
c20c9d4
Fixes per the review
Oleg-Goncharov Jan 26, 2026
82e9c77
Merge branch 'main' into pr_mxfp8_grouped_kernel
Oleg-Goncharov Jan 26, 2026
65afe16
More fixes from the review
Oleg-Goncharov Jan 26, 2026
fc0f9e9
Merge branch 'main' into pr_mxfp8_grouped_kernel
Oleg-Goncharov Jan 26, 2026
e01865f
Added fused preswizzling to the kernel
Oleg-Goncharov Jan 28, 2026
bf07d9d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/cpp/operator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_executable(test_operator
test_cast_mxfp8_gated_swiglu.cu
test_qdq.cu
test_cast_mxfp8.cu
test_cast_mxfp8_grouped.cu
test_cast_nvfp4_transpose.cu
test_cast_float8blockwise.cu
test_dequantize_mxfp8.cu
Expand Down
777 changes: 777 additions & 0 deletions tests/cpp/operator/test_cast_mxfp8_grouped.cu

Large diffs are not rendered by default.

73 changes: 73 additions & 0 deletions transformer_engine/common/activation/gelu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,35 @@ void nvte_gelu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
act_fn<fp32, Empty, gelu<fp32, fp32>>(input, output, stream);
}

/*
 * Grouped-tensor forward GELU C entry point.
 *
 * Applies the gelu<fp32, fp32> functor to the grouped input and writes the
 * quantized result to the grouped output via the grouped forward dispatch
 * helper, on the given CUDA stream.
 * NOTE(review): the nullptr argument mirrors the non-grouped wrappers;
 * presumably an optional noop/config tensor — confirm against the helper.
 */
void nvte_group_gelu(const NVTEGroupedTensor input, NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_gelu);
  using namespace transformer_engine;
  // IS_ACT = true selects the activation path of the forward helper.
  dispatch::group_quantize_fwd_helper</*IS_ACT=*/true, Empty, gelu<fp32, fp32>>(input, output,
                                                                                nullptr, stream);
}

/*
 * Backward GELU C entry point (non-grouped): forwards grad, input, and output
 * to dact_fn instantiated with the dgelu<fp32, fp32> functor.
 */
void nvte_dgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                cudaStream_t stream) {
  NVTE_API_CALL(nvte_dgelu);
  namespace te = transformer_engine;
  te::dact_fn<te::fp32, te::Empty, te::dgelu<te::fp32, te::fp32>>(grad, input, output, stream);
}

/*
 * Grouped backward GELU C entry point.
 *
 * Dispatches the grouped backward helper with the dgelu<fp32, fp32> functor.
 * No dbias fusion here, so the dbias and workspace handles are null.
 */
void nvte_group_dgelu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                      NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_dgelu);
  using namespace transformer_engine;
  // IS_DBIAS=false, IS_DACT=true: pure activation-gradient path.
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/false, /*IS_DACT=*/true, Empty,
                                      dgelu<fp32, fp32>>(grad, input, output, /*dbias=*/nullptr,
                                                         /*workspace=*/nullptr, nullptr, stream);
}

void nvte_quantize_dbias_dgelu(const NVTETensor input, const NVTETensor activation_input,
NVTETensor output, NVTETensor dbias, NVTETensor workspace,
cudaStream_t stream) {
Expand All @@ -33,6 +55,20 @@ void nvte_quantize_dbias_dgelu(const NVTETensor input, const NVTETensor activati
input, activation_input, output, dbias, workspace, nullptr, stream);
}

/*
 * Grouped fused dbias + dGELU quantization C entry point.
 *
 * Forwards to the grouped backward helper with both the dbias reduction and
 * the activation-gradient (dgelu) template paths enabled; dbias and workspace
 * are caller-provided handles.
 */
void nvte_group_quantize_dbias_dgelu(const NVTEGroupedTensor input,
                                     const NVTEGroupedTensor activation_input,
                                     NVTEGroupedTensor output, NVTETensor dbias,
                                     NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_dgelu);
  using namespace transformer_engine;
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty,
                                      dgelu<fp32, fp32>>(input, activation_input, output, dbias,
                                                         workspace, nullptr, stream);
}

void nvte_geglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
NVTE_API_CALL(nvte_geglu);
using namespace transformer_engine;
Expand All @@ -54,13 +90,36 @@ void nvte_qgelu(const NVTETensor input, NVTETensor output, cudaStream_t stream)
act_fn<fp32, Empty, qgelu<fp32, fp32>>(input, output, stream);
}

/*
 * Grouped-tensor forward QGeLU C entry point.
 *
 * Applies the qgelu<fp32, fp32> functor to the grouped input and quantizes
 * the result into the grouped output via the grouped forward dispatch helper.
 */
void nvte_group_qgelu(const NVTEGroupedTensor input, NVTEGroupedTensor output,
                      cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_qgelu);
  using namespace transformer_engine;
  // IS_ACT = true selects the activation path of the forward helper.
  dispatch::group_quantize_fwd_helper</*IS_ACT=*/true, Empty, qgelu<fp32, fp32>>(input, output,
                                                                                 nullptr, stream);
}

/*
 * Backward QGeLU C entry point (non-grouped): forwards grad, input, and output
 * to dact_fn instantiated with the dqgelu<fp32, fp32> functor.
 */
void nvte_dqgelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                 cudaStream_t stream) {
  NVTE_API_CALL(nvte_dqgelu);
  namespace te = transformer_engine;
  te::dact_fn<te::fp32, te::Empty, te::dqgelu<te::fp32, te::fp32>>(grad, input, output, stream);
}

/*
 * Grouped backward QGeLU C entry point.
 *
 * Dispatches the grouped backward helper with the dqgelu<fp32, fp32> functor.
 * No dbias fusion here, so the dbias and workspace handles are null.
 */
void nvte_group_dqgelu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                       NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_dqgelu);
  using namespace transformer_engine;
  // IS_DBIAS=false, IS_DACT=true: pure activation-gradient path.
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/false, /*IS_DACT=*/true, Empty,
                                      dqgelu<fp32, fp32>>(grad, input, output, /*dbias=*/nullptr,
                                                          /*workspace=*/nullptr, nullptr, stream);
}

void nvte_quantize_dbias_dqgelu(const NVTETensor input, const NVTETensor activation_input,
NVTETensor output, NVTETensor dbias, NVTETensor workspace,
cudaStream_t stream) {
Expand All @@ -74,6 +133,20 @@ void nvte_quantize_dbias_dqgelu(const NVTETensor input, const NVTETensor activat
input, activation_input, output, dbias, workspace, nullptr, stream);
}

/*
 * Grouped fused dbias + dQGeLU quantization C entry point.
 *
 * Forwards to the grouped backward helper with both the dbias reduction and
 * the activation-gradient (dqgelu) template paths enabled; dbias and
 * workspace are caller-provided handles.
 */
void nvte_group_quantize_dbias_dqgelu(const NVTEGroupedTensor input,
                                      const NVTEGroupedTensor activation_input,
                                      NVTEGroupedTensor output, NVTETensor dbias,
                                      NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_dqgelu);
  using namespace transformer_engine;
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty,
                                      dqgelu<fp32, fp32>>(input, activation_input, output, dbias,
                                                          workspace, nullptr, stream);
}

void nvte_qgeglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
NVTE_API_CALL(nvte_qgeglu);
using namespace transformer_engine;
Expand Down
73 changes: 73 additions & 0 deletions transformer_engine/common/activation/relu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,35 @@ void nvte_relu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
act_fn<fp32, Empty, relu<fp32, fp32>>(input, output, stream);
}

/*
 * Grouped-tensor forward ReLU C entry point.
 *
 * Applies the relu<fp32, fp32> functor to the grouped input and quantizes
 * the result into the grouped output via the grouped forward dispatch helper.
 */
void nvte_group_relu(const NVTEGroupedTensor input, NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_relu);
  using namespace transformer_engine;
  // IS_ACT = true selects the activation path of the forward helper.
  dispatch::group_quantize_fwd_helper</*IS_ACT=*/true, Empty, relu<fp32, fp32>>(input, output,
                                                                                nullptr, stream);
}

/*
 * Backward ReLU C entry point (non-grouped): forwards grad, input, and output
 * to dact_fn instantiated with the drelu<fp32, fp32> functor.
 */
void nvte_drelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                cudaStream_t stream) {
  NVTE_API_CALL(nvte_drelu);
  namespace te = transformer_engine;
  te::dact_fn<te::fp32, te::Empty, te::drelu<te::fp32, te::fp32>>(grad, input, output, stream);
}

/*
 * Grouped backward ReLU C entry point.
 *
 * Dispatches the grouped backward helper with the drelu<fp32, fp32> functor.
 * No dbias fusion here, so the dbias and workspace handles are null.
 */
void nvte_group_drelu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                      NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_drelu);
  using namespace transformer_engine;
  // IS_DBIAS=false, IS_DACT=true: pure activation-gradient path.
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/false, /*IS_DACT=*/true, Empty,
                                      drelu<fp32, fp32>>(grad, input, output, /*dbias=*/nullptr,
                                                         /*workspace=*/nullptr, nullptr, stream);
}

void nvte_quantize_dbias_drelu(const NVTETensor input, const NVTETensor activation_input,
NVTETensor output, NVTETensor dbias, NVTETensor workspace,
cudaStream_t stream) {
Expand All @@ -33,6 +55,20 @@ void nvte_quantize_dbias_drelu(const NVTETensor input, const NVTETensor activati
input, activation_input, output, dbias, workspace, nullptr, stream);
}

/*
 * Grouped fused dbias + dReLU quantization C entry point.
 *
 * Forwards to the grouped backward helper with both the dbias reduction and
 * the activation-gradient (drelu) template paths enabled; dbias and workspace
 * are caller-provided handles.
 */
void nvte_group_quantize_dbias_drelu(const NVTEGroupedTensor input,
                                     const NVTEGroupedTensor activation_input,
                                     NVTEGroupedTensor output, NVTETensor dbias,
                                     NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_drelu);
  using namespace transformer_engine;
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty,
                                      drelu<fp32, fp32>>(input, activation_input, output, dbias,
                                                         workspace, nullptr, stream);
}

void nvte_reglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
NVTE_API_CALL(nvte_reglu);
using namespace transformer_engine;
Expand All @@ -54,13 +90,36 @@ void nvte_srelu(const NVTETensor input, NVTETensor output, cudaStream_t stream)
act_fn<fp32, Empty, srelu<fp32, fp32>>(input, output, stream);
}

/*
 * Grouped-tensor forward SReLU C entry point.
 *
 * Applies the srelu<fp32, fp32> functor to the grouped input and quantizes
 * the result into the grouped output via the grouped forward dispatch helper.
 */
void nvte_group_srelu(const NVTEGroupedTensor input, NVTEGroupedTensor output,
                      cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_srelu);
  using namespace transformer_engine;
  // IS_ACT = true selects the activation path of the forward helper.
  dispatch::group_quantize_fwd_helper</*IS_ACT=*/true, Empty, srelu<fp32, fp32>>(input, output,
                                                                                 nullptr, stream);
}

/*
 * Backward SReLU C entry point (non-grouped): forwards grad, input, and
 * output to dact_fn instantiated with the dsrelu<fp32, fp32> functor.
 */
void nvte_dsrelu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                 cudaStream_t stream) {
  NVTE_API_CALL(nvte_dsrelu);
  namespace te = transformer_engine;
  te::dact_fn<te::fp32, te::Empty, te::dsrelu<te::fp32, te::fp32>>(grad, input, output, stream);
}

/*
 * Grouped backward SReLU C entry point.
 *
 * Dispatches the grouped backward helper with the dsrelu<fp32, fp32> functor.
 * No dbias fusion here, so the dbias and workspace handles are null.
 */
void nvte_group_dsrelu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                       NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_dsrelu);
  using namespace transformer_engine;
  // IS_DBIAS=false, IS_DACT=true: pure activation-gradient path.
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/false, /*IS_DACT=*/true, Empty,
                                      dsrelu<fp32, fp32>>(grad, input, output, /*dbias=*/nullptr,
                                                          /*workspace=*/nullptr, nullptr, stream);
}

void nvte_quantize_dbias_dsrelu(const NVTETensor input, const NVTETensor activation_input,
NVTETensor output, NVTETensor dbias, NVTETensor workspace,
cudaStream_t stream) {
Expand All @@ -74,6 +133,20 @@ void nvte_quantize_dbias_dsrelu(const NVTETensor input, const NVTETensor activat
input, activation_input, output, dbias, workspace, nullptr, stream);
}

/*
 * Grouped fused dbias + dSReLU quantization C entry point.
 *
 * Forwards to the grouped backward helper with both the dbias reduction and
 * the activation-gradient (dsrelu) template paths enabled; dbias and
 * workspace are caller-provided handles.
 */
void nvte_group_quantize_dbias_dsrelu(const NVTEGroupedTensor input,
                                      const NVTEGroupedTensor activation_input,
                                      NVTEGroupedTensor output, NVTETensor dbias,
                                      NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_dsrelu);
  using namespace transformer_engine;
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty,
                                      dsrelu<fp32, fp32>>(input, activation_input, output, dbias,
                                                          workspace, nullptr, stream);
}

void nvte_sreglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
NVTE_API_CALL(nvte_sreglu);
using namespace transformer_engine;
Expand Down
36 changes: 36 additions & 0 deletions transformer_engine/common/activation/swiglu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,35 @@ void nvte_silu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
act_fn<fp32, Empty, silu<fp32, fp32>>(input, output, stream);
}

/*
 * Grouped-tensor forward SiLU C entry point.
 *
 * Applies the silu<fp32, fp32> functor to the grouped input and quantizes
 * the result into the grouped output via the grouped forward dispatch helper.
 */
void nvte_group_silu(const NVTEGroupedTensor input, NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_silu);
  using namespace transformer_engine;
  // IS_ACT = true selects the activation path of the forward helper.
  dispatch::group_quantize_fwd_helper</*IS_ACT=*/true, Empty, silu<fp32, fp32>>(input, output,
                                                                                nullptr, stream);
}

/*
 * Backward SiLU C entry point (non-grouped): forwards grad, input, and output
 * to dact_fn instantiated with the dsilu<fp32, fp32> functor.
 */
void nvte_dsilu(const NVTETensor grad, const NVTETensor input, NVTETensor output,
                cudaStream_t stream) {
  NVTE_API_CALL(nvte_dsilu);
  namespace te = transformer_engine;
  te::dact_fn<te::fp32, te::Empty, te::dsilu<te::fp32, te::fp32>>(grad, input, output, stream);
}

/*
 * Grouped backward SiLU C entry point.
 *
 * Dispatches the grouped backward helper with the dsilu<fp32, fp32> functor.
 * No dbias fusion here, so the dbias and workspace handles are null.
 */
void nvte_group_dsilu(const NVTEGroupedTensor grad, const NVTEGroupedTensor input,
                      NVTEGroupedTensor output, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_dsilu);
  using namespace transformer_engine;
  // IS_DBIAS=false, IS_DACT=true: pure activation-gradient path.
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/false, /*IS_DACT=*/true, Empty,
                                      dsilu<fp32, fp32>>(grad, input, output, /*dbias=*/nullptr,
                                                         /*workspace=*/nullptr, nullptr, stream);
}

void nvte_quantize_dbias_dsilu(const NVTETensor input, const NVTETensor activation_input,
NVTETensor output, NVTETensor dbias, NVTETensor workspace,
cudaStream_t stream) {
Expand All @@ -33,6 +55,20 @@ void nvte_quantize_dbias_dsilu(const NVTETensor input, const NVTETensor activati
input, activation_input, output, dbias, workspace, nullptr, stream);
}

/*
 * Grouped fused dbias + dSiLU quantization C entry point.
 *
 * Forwards to the grouped backward helper with both the dbias reduction and
 * the activation-gradient (dsilu) template paths enabled; dbias and workspace
 * are caller-provided handles.
 */
void nvte_group_quantize_dbias_dsilu(const NVTEGroupedTensor input,
                                     const NVTEGroupedTensor activation_input,
                                     NVTEGroupedTensor output, NVTETensor dbias,
                                     NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias_dsilu);
  using namespace transformer_engine;
  dispatch::group_quantize_bwd_helper</*IS_DBIAS=*/true, /*IS_DACT=*/true, Empty,
                                      dsilu<fp32, fp32>>(input, activation_input, output, dbias,
                                                         workspace, nullptr, stream);
}

void nvte_swiglu(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
NVTE_API_CALL(nvte_swiglu);
using namespace transformer_engine;
Expand Down
22 changes: 22 additions & 0 deletions transformer_engine/common/cast/cast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ void nvte_quantize(const NVTETensor input, NVTETensor output, cudaStream_t strea
dispatch::quantize_fwd_helper<IS_ACT, Empty, nullptr>(input, output, nullptr, stream);
}

/*
 * Grouped quantization C entry point (no activation).
 *
 * Quantizes every member of the grouped input into the grouped output via the
 * grouped forward dispatch helper, with the activation functor disabled.
 */
void nvte_group_quantize(const NVTEGroupedTensor input, NVTEGroupedTensor output,
                         cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize);
  using namespace transformer_engine;
  // IS_ACT=false and a null functor: plain quantize path, no activation.
  dispatch::group_quantize_fwd_helper</*IS_ACT=*/false, Empty, nullptr>(input, output, nullptr,
                                                                        stream);
}

void nvte_quantize_noop(const NVTETensor input, NVTETensor output, NVTETensor noop,
cudaStream_t stream) {
NVTE_API_CALL(nvte_quantize_noop);
Expand Down Expand Up @@ -60,6 +69,19 @@ void nvte_quantize_dbias(const NVTETensor input, NVTETensor output, NVTETensor d
input, activation_input, output, dbias, workspace, nullptr, stream);
}

/*
 * Grouped fused quantize + dbias C entry point.
 *
 * Forwards to the grouped backward helper with the dbias reduction enabled
 * and the activation-gradient path disabled (null activation functor and a
 * null grouped activation-input handle). dbias and workspace are
 * caller-provided handles.
 */
void nvte_group_quantize_dbias(const NVTEGroupedTensor input, NVTEGroupedTensor output,
                               NVTETensor dbias, NVTETensor workspace, cudaStream_t stream) {
  NVTE_API_CALL(nvte_group_quantize_dbias);
  using namespace transformer_engine;

  constexpr bool IS_DBIAS = true;
  constexpr bool IS_DACT = false;
  // constexpr already implies top-level const; the previous `constexpr const`
  // was redundant. No activation gradient is involved, so the handle is null.
  constexpr NVTEGroupedTensor activation_input = nullptr;

  dispatch::group_quantize_bwd_helper<IS_DBIAS, IS_DACT, Empty, nullptr>(
      input, activation_input, output, dbias, workspace, nullptr, stream);
}

void nvte_dequantize(const NVTETensor input, NVTETensor output, cudaStream_t stream) {
NVTE_API_CALL(nvte_dequantize);
using namespace transformer_engine;
Expand Down
Loading
Loading