
Commit 81a50ce

cpu: aarch64: implement brgemm ukernel api
1 parent c8d998d

17 files changed, +1249 −13 lines

src/cpu/aarch64/brgemm/brgemm_types.hpp

Lines changed: 17 additions & 2 deletions
@@ -215,7 +215,7 @@ struct brgemm_desc_t {
     dim_t stride_b = 0;
 
     brgemm_layout_t layout = brgemm_layout_undef;
-    brgemm_batch_kind_t type;
+    brgemm_batch_kind_t type = brgemm_batch_kind_t::brgemm_addr;
     bool is_dgmm = false; // set to true in brdgmm_desc_init
     bool with_sum = false;
     bool req_cal_comp_pads = false;
@@ -292,7 +292,22 @@ struct brgemm_desc_t {
         return sz;
     }
 
-    bool is_b_data_layout_vnni() { return true; }
+    // A class version of the `static` version of the function.
+    // Note: used in benchdnn only, not used inside the library.
+    bool is_b_data_layout_vnni() const { return is_b_data_layout_vnni(dt_b); }
+
+    static bool is_b_data_layout_vnni(data_type_t dt_b) {
+        using namespace data_type;
+        return utils::one_of(dt_b, s8, u8, bf16);
+    }
+
+    bool are_post_ops_applicable() const {
+        const bool has_zero_points = !utils::everyone_is(
+                brgemm_broadcast_t::none, zp_type_a, zp_type_b, zp_type_c);
+        return dt_c != dt_d || with_eltwise || with_binary || with_bias
+                || with_sum || req_s8s8_compensation || has_zero_points
+                || with_scales || with_dst_scales;
+    }
 
     bool operator==(const brgemm_desc_t &rhs) const;
     bool operator<(const brgemm_desc_t &rhs) const;

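For context, the two helpers added to brgemm_desc_t above are introspection utilities: is_b_data_layout_vnni() reports whether the B matrix is expected in VNNI-packed layout for a given data type, and are_post_ops_applicable() reports whether any epilogue work is needed. A minimal sketch of a caller, assuming an already-initialized descriptor (the helper function below is illustrative, not part of this commit):

    #include "cpu/aarch64/brgemm/brgemm_types.hpp"

    using namespace dnnl::impl::cpu::aarch64;

    // Hypothetical helper: query the new brgemm_desc_t introspection methods.
    void inspect_desc(const brgemm_desc_t &desc) {
        // Static form: depends only on the B data type (s8/u8/bf16 -> VNNI).
        const bool b_is_vnni = brgemm_desc_t::is_b_data_layout_vnni(desc.dt_b);
        // Member form wraps the static one; per the comment it is meant for benchdnn.
        const bool b_is_vnni_member = desc.is_b_data_layout_vnni();
        // True when bias, eltwise/binary/sum post-ops, scales, zero points,
        // s8s8 compensation, or a dt_c != dt_d conversion must run in the epilogue.
        const bool has_epilogue = desc.are_post_ops_applicable();
        (void)b_is_vnni; (void)b_is_vnni_member; (void)has_epilogue;
    }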
src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp

Lines changed: 8 additions & 6 deletions
@@ -149,11 +149,12 @@ struct jit_brgemm_kernel_t : public jit_generator_t {
     const XReg reg_a_offset = x2;
     const XReg reg_b_offset = x6;
 
-    const XReg reg_aux1_batch = x5;
-    const XReg reg_aux1_A = x5;
-    const XReg reg_aux1_B = x7; //from jit_generator.hpp in x64
+    const XReg reg_aux1_A = x4;
+    const XReg reg_aux1_batch = reg_aux1_A;
 
-    const XReg reg_offs_batch = reg_aux1_A;
+    const XReg reg_aux1_B = x7; //from jit_generator_t.hpp in x64
+
+    const XReg reg_offs_batch = x5;
     const XReg reg_strd_batch = reg_rdb_loop;
 
     const XReg reg_bias = reg_rdb_loop;
@@ -1232,8 +1233,9 @@ void jit_brgemm_kernel_t::set_A_B_matrices() {
         add(reg_aux_A, reg_aux_A, X_TMP_0);
         ldr(X_TMP_1, ptr(reg_offs_batch, GET_OFF_BATCH_ELEMENT(offset.B)));
         add(reg_aux_B, reg_aux_B, X_TMP_1);
-        mov_imm(X_TMP_2, sizeof(brgemm_batch_element_t));
-        add(reg_offs_batch, reg_offs_batch, X_TMP_2);
+        if (brg.brgattr.max_bs > 1)
+            add_imm(reg_offs_batch, reg_offs_batch,
+                    sizeof(brgemm_batch_element_t), X_TMP_2);
     } else if (brg.type == brgemm_strd) {
         mov(reg_aux_A, reg_aux1_A);
         mov(reg_aux_B, reg_aux1_B);

src/cpu/aarch64/matmul/brgemm_matmul.cpp

Lines changed: 1 addition & 1 deletion
@@ -193,7 +193,7 @@ status_t brgemm_matmul_t<isa>::pd_t::init(engine_t *engine) {
 
                 int idx = get_brg_kernel_idx(i_bs, i_init, i_M, i_N, i_K);
                 if (idx < 0) continue;
-                brgemm_t &brg = brg_descs_[idx];
+                brgemm_desc_t &brg = brg_descs_[idx];
                 auto LDA = i_K && bgmmc_.use_buffer_a_tail_only
                         ? (dim_t)bgmmc_.wei_k_blk
                         : bgmmc_.LDA;

src/cpu/aarch64/matmul/brgemm_matmul.hpp

Lines changed: 5 additions & 2 deletions
@@ -1,6 +1,7 @@
 /*******************************************************************************
 * Copyright 2021 Intel Corporation
 * Copyright 2024 FUJITSU LIMITED
+* Copyright 2025 Arm Ltd. and affiliates
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
@@ -53,13 +54,15 @@ struct brgemm_matmul_t : public primitive_t {
         status_t init(engine_t *engine);
         int get_brg_kernel_idx(bool is_bs_tail, bool do_initialization,
                 int m_ker_idx, bool is_N_tail, bool is_K_tail) const;
-        const brgemm_t &get_brg_desc(int idx) const { return brg_descs_[idx]; }
+        const brgemm_desc_t &get_brg_desc(int idx) const {
+            return brg_descs_[idx];
+        }
         const brgemm_matmul_conf_t &get_brgemm_matmul_conf() const {
             return bgmmc_;
         }
 
     private:
-        brgemm_t brg_descs_[max_num_brg_kernels_matmul];
+        brgemm_desc_t brg_descs_[max_num_brg_kernels_matmul];
         brgemm_matmul_conf_t bgmmc_;
     };
 

src/cpu/aarch64/matmul/brgemm_matmul_copy_utils.cpp

Lines changed: 3 additions & 1 deletion
@@ -21,6 +21,7 @@
 #include "common/type_helpers.hpp"
 #include "common/utils.hpp"
 #include "cpu/aarch64/jit_generator.hpp"
+#include "xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_reg.h"
 
 #include "cpu/aarch64/matmul/brgemm_matmul_copy_utils.hpp"
 
@@ -586,7 +587,7 @@ void jit_brgemm_matmul_copy_b_f32_t::copy_16_8_x_n_block(
             continue;
         }
 
-        const opmask_t curr_msk = zero_padding < n_blk_step ? kTail : kFFFF;
+        const opmask_t curr_msk = zero_padding < n_blk_step ? kTail : P_ALL_ONE;
         const int blk_idx = iter % max_regs_available;
         load(blk_idx, k, n, curr_msk);
         add_imm(X_DEFAULT_ADDR, reg_tr_src, tr_src_off, X_TMP_0);
@@ -621,6 +622,7 @@ void jit_brgemm_matmul_copy_b_f32_t::compute_k_loop(int ncolumns) {
 }
 
 void jit_brgemm_matmul_copy_b_f32_t::generate() {
+
     preamble();
     eor(zmm_zero.d, zmm_zero.d, zmm_zero.d);
     LDR_IMM(reg_src, param1, GET_OFF(src));

src/cpu/aarch64/matmul/brgemm_matmul_utils.cpp

Lines changed: 61 additions & 0 deletions
@@ -989,6 +989,67 @@ status_t init_brgemm_matmul_conf(cpu_isa_t isa, brgemm_matmul_conf_t &bgmmc,
     return status::success;
 }
 
+status_t init_conf(brgemm_matmul_conf_t &conf, dim_t batch, dim_t M, dim_t K,
+        dim_t N, dim_t in_ld, dim_t n_blk, data_type_t in_type,
+        data_type_t out_type, format_tag_t in_tag) {
+    if (n_blk <= 0 && M <= 0) return status::invalid_arguments;
+
+    const auto vnni_granularity = data_type_vnni_granularity(out_type);
+    if (vnni_granularity <= 0) return status::invalid_arguments;
+
+    // Zero initialize the `conf` to avoid access to 'garbage' in members.
+    conf = brgemm_matmul_conf_t();
+
+    const bool is_bf16 = one_of(in_type, bf16) || one_of(out_type, bf16);
+    const bool is_s8u8 = one_of(in_type, s8, u8) || one_of(out_type, s8, u8);
+
+    VCONDCHECK_BG(!(is_bf16 || is_s8u8), VERBOSE_UNSUPPORTED_DT);
+
+    const bool is_copyB = N > 0;
+    conf.isa = get_max_cpu_isa(); // Just use the best ISA possible.
+    conf.is_bf32 = false;
+    conf.batch = batch;
+    conf.src_dt = conf.wei_dt = out_type;
+    conf.orig_src_dt = conf.orig_wei_dt = in_type;
+    // Note: will need to change `tr_a_dt_sz` for copyA in cases where src_dt != dst_dt
+    conf.a_dt_sz = conf.tr_a_dt_sz = types::data_type_size(conf.src_dt);
+    conf.N = N;
+    conf.M = M;
+    conf.K = K;
+    const dim_t copyA_K_blk = isa_num_vregs(conf.isa) / 2;
+    const dim_t copyB_K_blk = 16 * vnni_granularity;
+    conf.K_blk = is_copyB ? copyB_K_blk : copyA_K_blk;
+    conf.K_tail = conf.K % conf.K_blk;
+    if (!is_copyB) {
+        // Note: current implementation always calls the transposed kernel.
+        conf.transposed_A = true;
+        conf.M_blk = (dim_t)isa_max_vlen(conf.isa) / conf.a_dt_sz;
+        conf.M_tail = conf.M % conf.M_blk;
+        conf.copy_A_src_stride = in_ld * conf.a_dt_sz;
+        // setting LDA parameter required for plain transpose
+        conf.LDA = conf.K;
+    } else {
+        conf.blocked_B = !utils::one_of(in_tag, ab, ba, abc, acb);
+        conf.transposed_B = utils::one_of(in_tag, ba, acb);
+        conf.wei_tag = in_tag;
+        conf.wei_n_blk = conf.N_blk = conf.LDB = n_blk;
+        conf.N_tail = conf.N % conf.N_blk;
+        conf.b_dt_sz = types::data_type_size(in_type);
+        conf.tr_b_dt_sz = types::data_type_size(conf.wei_dt);
+        conf.copy_B_wei_stride = in_ld * conf.b_dt_sz;
+        conf.N_chunk_elems = conf.N;
+        conf.s8s8_comp_b_str = utils::rnd_up(conf.N, conf.wei_n_blk);
+        conf.s8s8_comp_n_str = conf.wei_n_blk;
+    }
+
+    conf.s8s8_compensation_required = false;
+    conf.src_zp_type = brgemm_broadcast_t::none;
+    conf.has_zero_point_a = false;
+    conf.has_zero_point_b = false;
+
+    return status::success;
+}
+
 void init_aux_values(brgemm_matmul_conf_t &bgmmc,
         const memory_desc_wrapper &src_d, const memory_desc_wrapper &wei_d,
         const memory_desc_wrapper &dst_d) {

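A hedged sketch of how the new init_conf() helper might be called to configure the standalone copy (transform) kernels; only the signature and the N > 0 copy-B vs. copy-A split come from this diff, while the f32 shapes, strides, tags, and the driver function below are illustrative:

    #include "common/utils.hpp"
    #include "cpu/aarch64/matmul/brgemm_matmul_utils.hpp"

    using namespace dnnl::impl;
    using namespace dnnl::impl::cpu::aarch64::matmul;

    // Illustrative driver: build copy-B and copy-A configurations.
    status_t make_copy_confs() {
        brgemm_matmul_conf_t conf_b, conf_a;

        // copy-B path: N > 0 selects the B branch; n_blk becomes
        // N_blk/LDB/wei_n_blk and in_tag describes the incoming B layout.
        CHECK(init_conf(conf_b, /*batch=*/1, /*M=*/0, /*K=*/128, /*N=*/64,
                /*in_ld=*/64, /*n_blk=*/64, data_type::f32, data_type::f32,
                format_tag::ab));

        // copy-A path: N == 0 selects the A branch; the current implementation
        // always uses the transposed copy kernel and sets LDA = K.
        CHECK(init_conf(conf_a, /*batch=*/1, /*M=*/48, /*K=*/128, /*N=*/0,
                /*in_ld=*/128, /*n_blk=*/0, data_type::f32, data_type::f32,
                format_tag::ab));

        return status::success;
    }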
src/cpu/aarch64/matmul/brgemm_matmul_utils.hpp

Lines changed: 8 additions & 0 deletions
@@ -1,6 +1,7 @@
 /*******************************************************************************
 * Copyright 2021 Intel Corporation
 * Copyright 2023-2024 FUJITSU LIMITED
+* Copyright 2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -121,6 +122,8 @@ struct brgemm_matmul_conf_t {
     data_type_t wei_dt;
     data_type_t acc_dt;
     data_type_t bia_dt;
+    data_type_t orig_src_dt;
+    data_type_t orig_wei_dt;
     int nthr;
     int nthr_k;
 
@@ -166,6 +169,7 @@
     bool has_zero_point_a, has_zero_point_b, has_zero_point_c;
     bool post_ops_applicable;
     bool transposed_A;
+    bool transposed_B;
     bool blocked_B;
 
     dim_t zp_a_comp_shift_n;
@@ -301,6 +305,10 @@ struct brgemm_matmul_conf_utils_t {
     const cpu_isa_t isa_;
 };
 
+status_t init_conf(brgemm_matmul_conf_t &conf, dim_t batch, dim_t M, dim_t K,
+        dim_t N, dim_t in_ld, dim_t n_blk, data_type_t in_type,
+        data_type_t out_type, format_tag_t in_tag);
+
 void init_aux_values(brgemm_matmul_conf_t &bgmmc,
         const memory_desc_wrapper &src_d, const memory_desc_wrapper &wei_d,
         const memory_desc_wrapper &dst_d);
src/cpu/aarch64/ukernel/attr_params.cpp

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+/*******************************************************************************
+* Copyright 2025 Arm Ltd. and affiliates
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "common/utils.hpp"
+
+#include "cpu/aarch64/ukernel/attr_params.hpp"
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+using namespace dnnl::impl;
+using namespace dnnl::impl::cpu::ukernel;
+
+status_t attr_params_t::set_post_ops_args(const void **post_ops_args) {
+    post_ops_args_ = post_ops_args;
+    return status::success;
+}
+
+status_t attr_params_t::set_scales(const void *scales, int arg) {
+    switch (arg) {
+        case DNNL_ARG_SRC: a_scales_ = scales; break;
+        case DNNL_ARG_WEIGHTS: b_scales_ = scales; break;
+        case DNNL_ARG_DST: d_scales_ = scales; break;
+        default: assert(!"unsupported arg");
+    }
+    return status::success;
+}
+
+const void *attr_params_t::get_scales(int arg) const {
+    switch (arg) {
+        case DNNL_ARG_SRC: return a_scales_;
+        case DNNL_ARG_WEIGHTS: return b_scales_;
+        case DNNL_ARG_DST: return d_scales_;
+        default: assert(!"unsupported arg");
+    }
+    return nullptr;
+}
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+namespace ukernel {
+
+status_t dnnl_ukernel_attr_params_create(attr_params_t **attr_params) {
+    *attr_params = new attr_params_t();
+    return status::success;
+}
+
+status_t dnnl_ukernel_attr_params_set_post_ops_args(
+        attr_params_t *attr_params, const void **post_ops_args) {
+    if (attr_params == nullptr) return status::invalid_arguments;
+
+    CHECK(attr_params->set_post_ops_args(post_ops_args));
+    return status::success;
+}
+
+status_t dnnl_ukernel_attr_params_set_A_scales(
+        attr_params_t *attr_params, const void *a_scales) {
+    return status::unimplemented;
+}
+
+status_t dnnl_ukernel_attr_params_set_B_scales(
+        attr_params_t *attr_params, const void *b_scales) {
+    return status::unimplemented;
+}
+
+status_t dnnl_ukernel_attr_params_set_D_scales(
+        attr_params_t *attr_params, const void *d_scales) {
+    if (attr_params == nullptr) return status::invalid_arguments;
+
+    CHECK(attr_params->set_scales(d_scales, DNNL_ARG_DST));
+    return status::success;
+}
+
+status_t dnnl_ukernel_attr_params_destroy(attr_params_t *attr_params) {
+    delete attr_params;
+    return status::success;
+}
+
+} // namespace ukernel
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
+
+//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s
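The new translation unit above provides the aarch64 side of the experimental ukernel attribute-parameters object (guarded by DNNL_EXPERIMENTAL_UKERNEL). A hedged usage sketch of the entry points it defines; only functions visible in this diff are called, while the driver function, the post-op argument contents, and the eventual execute call are assumptions:

    #include "common/utils.hpp"
    #include "cpu/aarch64/ukernel/attr_params.hpp"

    using namespace dnnl::impl;
    using namespace dnnl::impl::cpu::ukernel;
    // Declarations of the functions below are assumed visible from the ukernel headers.
    namespace aarch64_uk = dnnl::impl::cpu::aarch64::ukernel;

    // Illustrative driver, assuming DNNL_EXPERIMENTAL_UKERNEL is defined.
    status_t run_with_attr_params(const void *binary_po_arg) {
        attr_params_t *params = nullptr;
        CHECK(aarch64_uk::dnnl_ukernel_attr_params_create(&params));

        // One pointer per post-op argument, in post-op order (illustrative).
        const void *post_ops_args[] = {binary_po_arg};
        CHECK(aarch64_uk::dnnl_ukernel_attr_params_set_post_ops_args(
                params, post_ops_args));

        // A/B scale setters currently return status::unimplemented on aarch64;
        // D scales are accepted via dnnl_ukernel_attr_params_set_D_scales().
        // ... pass `params` to the brgemm ukernel execute call (not in this diff) ...

        CHECK(aarch64_uk::dnnl_ukernel_attr_params_destroy(params));
        return status::success;
    }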
