Compute Library 21.08
ClGemmMatrixMultiplyKernel Class Reference

OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. More...

#include <ClGemmMatrixMultiplyKernel.h>

Collaboration diagram for ClGemmMatrixMultiplyKernel (diagram omitted).

Public Member Functions

 ClGemmMatrixMultiplyKernel ()
 
 ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE (ClGemmMatrixMultiplyKernel)
 
void configure (const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta=0.f, bool is_interleaved_transposed=true, const GEMMReshapeInfo &reshape_info=GEMMReshapeInfo(), bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
 Initialise the kernel's input, output and alpha. More...
 
void run_op (ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
 Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue. More...
 
- Public Member Functions inherited from ICLKernel
 ICLKernel ()
 Constructor. More...
 
cl::Kernel & kernel ()
 Returns a reference to the OpenCL kernel of this object. More...
 
CLKernelType type () const
 Returns the CL kernel type. More...
 
template<typename T >
void add_1D_array_argument (unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
 Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx. More...
 
void add_1D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
 Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx. More...
 
void add_1D_tensor_argument_if (bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
 Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true. More...
 
void add_2D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
 Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx. More...
 
void add_2D_tensor_argument_if (bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
 Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true. More...
 
void add_3D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
 Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx. More...
 
void add_4D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
 Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx. More...
 
virtual void run (const Window &window, cl::CommandQueue &queue)
 Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue. More...
 
template<typename T >
void add_argument (unsigned int &idx, T value)
 Add the passed parameters to the object's kernel's arguments starting from the index idx. More...
 
void set_lws_hint (const cl::NDRange &lws_hint)
 Set the Local-Workgroup-Size hint. More...
 
cl::NDRange lws_hint () const
 Return the Local-Workgroup-Size hint. More...
 
void set_wbsm_hint (const cl_int &wbsm_hint)
 Set the workgroup batch size modifier hint. More...
 
cl_int wbsm_hint () const
 Return the workgroup batch size modifier hint. More...
 
const std::string & config_id () const
 Get the configuration ID. More...
 
void set_target (GPUTarget target)
 Set the targeted GPU architecture. More...
 
void set_target (cl::Device &device)
 Set the targeted GPU architecture according to the CL device. More...
 
GPUTarget get_target () const
 Get the targeted GPU architecture. More...
 
size_t get_max_workgroup_size ()
 Get the maximum workgroup size for the device the CLKernelLibrary uses. More...
 
template<unsigned int dimension_size>
void add_tensor_argument (unsigned &idx, const ICLTensor *tensor, const Window &window)
 
template<typename T , unsigned int dimension_size>
void add_array_argument (unsigned &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
 Add the passed array's parameters to the object's kernel's arguments starting from the index idx. More...
 
- Public Member Functions inherited from IKernel
 IKernel ()
 Constructor. More...
 
virtual ~IKernel ()=default
 Destructor. More...
 
virtual bool is_parallelisable () const
 Indicates whether or not the kernel is parallelisable. More...
 
virtual BorderSize border_size () const
 The size of the border for that kernel. More...
 
const Window & window () const
 The maximum window the kernel can be executed on. More...
 
bool is_window_configured () const
 Function to check if the embedded window of this kernel has been configured. More...
 

Static Public Member Functions

static Status validate (const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
 Static function to check if given info will lead to a valid configuration. More...
 
- Static Public Member Functions inherited from ICLKernel
static constexpr unsigned int num_arguments_per_1D_array ()
 Returns the number of arguments enqueued per 1D array object. More...
 
static constexpr unsigned int num_arguments_per_1D_tensor ()
 Returns the number of arguments enqueued per 1D tensor object. More...
 
static constexpr unsigned int num_arguments_per_2D_tensor ()
 Returns the number of arguments enqueued per 2D tensor object. More...
 
static constexpr unsigned int num_arguments_per_3D_tensor ()
 Returns the number of arguments enqueued per 3D tensor object. More...
 
static constexpr unsigned int num_arguments_per_4D_tensor ()
 Returns the number of arguments enqueued per 4D tensor object. More...
 
static cl::NDRange gws_from_window (const Window &window)
 Get the global work size given an execution window. More...
 

Data Fields

bool _slide_matrix_b { true }
 
bool _reinterpret_input_as_3d { false }
 
bool _reinterpret_output_as_3d { false }
 
bool _add_bias { false }
 

Detailed Description

OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided.

All elements of the output matrix are multiplied by alpha. If matrix C is passed, it is scaled by beta and added to that result, i.e. dst = alpha * (A * B) + beta * C. Broadcast addition of matrix C is supported when the "broadcast_bias" flag is set in the GEMMReshapeInfo object.

Note
If the input tensors src0 and src1 have been reshaped respectively with ClGemmReshapeLhsMatrixKernel and ClGemmReshapeRhsMatrixKernel, the flag is_interleaved_transposed must be set to true.
Attention
src1 tensor must have at least 2 dimensions (matrix)

Definition at line 45 of file ClGemmMatrixMultiplyKernel.h.
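
For reference, the kernel's semantics can be summarized by the following scalar sketch (illustrative only; the library's OpenCL kernels are tiled and vectorized, and the function name and row-major indexing here are assumptions, not part of the API):

    // Scalar sketch of dst = alpha * (A * B) + beta * C, with A: MxK, B: KxN, row-major.
    void gemm_reference(const float *A, const float *B, const float *C, float *dst,
                        unsigned int M, unsigned int N, unsigned int K, float alpha, float beta)
    {
        for(unsigned int y = 0; y < M; ++y)
        {
            for(unsigned int x = 0; x < N; ++x)
            {
                float acc = 0.f;
                for(unsigned int k = 0; k < K; ++k)
                {
                    acc += A[y * K + k] * B[k * N + x];
                }
                acc *= alpha;
                if(C != nullptr)
                {
                    // With broadcast_bias set, C would be a single row reused for every y
                    acc += beta * C[y * N + x];
                }
                dst[y * N + x] = acc;
            }
        }
    }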

Constructor & Destructor Documentation

◆ ClGemmMatrixMultiplyKernel()

Definition at line 265 of file ClGemmMatrixMultiplyKernel.cpp.

References arm_compute::GEMM.

{
    _type = CLKernelType::GEMM;
}

Member Function Documentation

◆ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE()

ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE ( ClGemmMatrixMultiplyKernel  )

◆ configure()

void configure ( const ClCompileContext &    compile_context,
                 ITensorInfo *               src0,
                 ITensorInfo *               src1,
                 ITensorInfo *               src2,
                 ITensorInfo *               dst,
                 float                       alpha,
                 float                       beta = 0.f,
                 bool                        is_interleaved_transposed = true,
                 const GEMMReshapeInfo &     reshape_info = GEMMReshapeInfo(),
                 bool                        fp_mixed_precision = false,
                 const ActivationLayerInfo & activation_info = ActivationLayerInfo()
               )

Initialise the kernel's input, output and alpha.

Parameters
    [in]   compile_context            The compile context to be used.
    [in]   src0                       Input tensor containing the Matrix A. Data types supported: F16/F32
    [in]   src1                       Input tensor containing the Matrix B. Data type supported: same as src0
    [in]   src2                       Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as src0
    [out]  dst                        Output tensor to store the result of matrix multiplication. Data type supported: same as src0
    [in]   alpha                      Weight of the matrix product
    [in]   beta                       (Optional) Weight of matrix C. Default value is 0. Only beta = 1 is currently supported.
    [in]   is_interleaved_transposed  (Optional) True if src0 and src1 have been reshaped respectively using ClGemmReshapeLhsMatrixKernel and ClGemmReshapeRhsMatrixKernel
    [in]   reshape_info               (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how matrix A and matrix B have been reshaped
    [in]   fp_mixed_precision         (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
    [in]   activation_info            (Optional) Activation to apply after the matrix multiplication
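
A minimal configuration sketch, under assumptions not guaranteed by this page: the 21.08 header path, obtaining the compile context via CLKernelLibrary::get().get_compile_context(), and a plain (non-reshaped) F32 GEMM with the shapes shown:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h" // path assumed

    using namespace arm_compute;

    void configure_example()
    {
        // TensorShape lists the innermost (x) dimension first: A is (K, M), B is (N, K), dst is (N, M).
        TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32); // Matrix A: M = 32, K = 64
        TensorInfo b_info(TensorShape(16U, 64U), 1, DataType::F32); // Matrix B: K = 64, N = 16
        TensorInfo d_info(TensorShape(16U, 32U), 1, DataType::F32); // Output:   M = 32, N = 16

        opencl::kernels::ClGemmMatrixMultiplyKernel gemm_kernel;
        gemm_kernel.configure(CLKernelLibrary::get().get_compile_context(),
                              &a_info, &b_info, nullptr /* no bias */, &d_info,
                              1.0f /* alpha */, 0.0f /* beta */,
                              false /* is_interleaved_transposed */,
                              GEMMReshapeInfo(32 /* m */, 16 /* n */, 64 /* k */));
    }

Note that configure() takes ITensorInfo objects only; the actual CL tensors are supplied later through run_op().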

Definition at line 270 of file ClGemmMatrixMultiplyKernel.cpp.

References ClGemmMatrixMultiplyKernel::_add_bias, ClGemmMatrixMultiplyKernel::_reinterpret_input_as_3d, ClGemmMatrixMultiplyKernel::_reinterpret_output_as_3d, ClGemmMatrixMultiplyKernel::_slide_matrix_b, ActivationLayerInfo::a(), ActivationLayerInfo::activation(), CLBuildOptions::add_option(), CLBuildOptions::add_option_if(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_ERROR_ON_NULLPTR, ARM_COMPUTE_ERROR_THROW_ON, ActivationLayerInfo::b(), arm_compute::BIFROST, GEMMReshapeInfo::broadcast_bias(), arm_compute::create_kernel(), ITensorInfo::data_type(), GEMMReshapeInfo::depth_output_gemm3d(), ITensorInfo::dimension(), ActivationLayerInfo::enabled(), arm_compute::F16, arm_compute::F32, arm_compute::float_to_string_with_full_precision(), arm_compute::get_arch_from_target(), arm_compute::get_cl_type_from_data_type(), arm_compute::get_padding_info(), ICLKernel::get_target(), arm_compute::has_padding_changed(), arm_compute::is_data_type_float(), arm_compute::helpers::float_ops::is_one(), kernel_name, arm_compute::lower_string(), GEMMReshapeInfo::mult_interleave4x4_height(), GEMMReshapeInfo::mult_transpose1xW_width(), ITensorInfo::num_dimensions(), CLBuildOptions::options(), GEMMReshapeInfo::reinterpret_input_as_3d(), ICLKernel::set_lws_hint(), arm_compute::string_from_activation_func(), arm_compute::string_from_data_type(), and arm_compute::support::cpp11::to_string().

{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);

    // Perform validate step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, beta,
                                                  is_interleaved_transposed, reshape_info, fp_mixed_precision));

    auto padding_info = is_interleaved_transposed ? get_padding_info({ src0, src1, dst }) : get_padding_info({ src0, dst });

    _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
    _add_bias                 = src2 != nullptr;

    // In case both input and dst have to be reinterpreted as 3D tensors,
    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
    {
        _reinterpret_input_as_3d  = false;
        _reinterpret_output_as_3d = false;
    }

    // Check if we need to slide the matrix B
    const unsigned int num_dimensions_src0 = _reinterpret_input_as_3d ? src0->num_dimensions() - 1 : src0->num_dimensions();

    _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);

    const DataType data_type = src0->data_type();

    // Get target architecture
    GPUTarget gpu_target = get_target();

    ElementsProcessed num_elements_processed{};

    // Configure kernel window
    auto win_config = validate_and_configure_window(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info,
                                                    gpu_target, num_elements_processed);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure_internal(win_config.second);

    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false)
    // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
    // This means that the actual m used by the kernel is given by dst->dimension(1)
    const unsigned int internal_m = _reinterpret_output_as_3d ? dst->dimension(1) * dst->dimension(2) : dst->dimension(1);
    const unsigned int n          = dst->dimension(0);

    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);

    const unsigned int m0 = num_elements_processed.y();
    const unsigned int n0 = num_elements_processed.x();

    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
    const unsigned int partial_store_m0 = internal_m % m0;
    const unsigned int partial_store_n0 = n % n0;

    // Create build options
    CLBuildOptions build_opts;

    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
    build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
    build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
    build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
    build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
    build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
    build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(src1->dimension(0)));

    const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;

    std::string kernel_name;
    if(is_interleaved_transposed)
    {
        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

        build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
        build_opts.add_option("-DN=" + support::cpp11::to_string(n));
        build_opts.add_option("-DK=" + support::cpp11::to_string(src1->dimension(0) / (n0 * mult_transpose1xW_width)));
        build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
        build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
        build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));

        if(is_data_type_float(data_type) && is_bifrost)
        {
            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
        }
        else
        {
            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
            if(fp_mixed_precision && data_type == DataType::F16)
            {
                // currently wider accumulator is only supported for fp16 kernels.
                kernel_name += "_acc32";
            }
        }
    }
    else // The input tensors have not been reshaped
    {
        build_opts.add_option("-DN=" + support::cpp11::to_string(n));
        build_opts.add_option("-DK=" + support::cpp11::to_string(src0->dimension(0)));
        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
        build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
        build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
        build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));

        // Create kernels according to the architecture, data type and input size.
        if(is_data_type_float(data_type) && is_bifrost)
        {
            kernel_name = "gemm_mm_floating_point";

            if(src0->num_dimensions() != 1)
            {
                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
                if(fp_mixed_precision && data_type == DataType::F16)
                {
                    // currently wider accumulator is only supported for fp16 kernels.
                    kernel_name += "_acc32";
                }
            }
            else if(src1->dimension(0) <= 1000 && data_type == DataType::F32)
            {
                // The first kernel is optimized for the case of 1000 or less dst elements (e.g. FC8 of AlexNet and VGG-16, and
                // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 dst elements (e.g.
                // FC6 and FC7 of AlexNet and VGG-16).
                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
            }

            // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
            // via exhaustive autotuning over a range of representative layer configurations.
            set_lws_hint(cl::NDRange(4));
        }
        else // (MIDGARD and F32) or (F16)
        {
            kernel_name = "gemm_mm_floating_point";
        }
    }
    // Create kernel
    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());

    // Set config_id for enabling LWS tuning
    _config_id = "gemm_";
    _config_id += (is_interleaved_transposed ? "reshaped_" : "");
    _config_id += (_add_bias ? "add_bias_" : "");
    _config_id += (reshape_info.broadcast_bias() ? "broadcast_bias_" : "");
    _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
    _config_id += lower_string(string_from_data_type(src0->data_type()));
    _config_id += "_";
    _config_id += support::cpp11::to_string(dst->dimension(1));
    _config_id += "_";
    _config_id += support::cpp11::to_string(dst->dimension(0));
    _config_id += "_";
    _config_id += support::cpp11::to_string(dst->dimension(2));
    _config_id += "_";
    _config_id += support::cpp11::to_string(dst->dimension(3));
    _config_id += "_";
    _config_id += (is_interleaved_transposed ? support::cpp11::to_string(src1->dimension(0)) : support::cpp11::to_string(src1->dimension(1)));

    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}

◆ run_op()

void run_op ( ITensorPack &      tensors,
              const Window &     window,
              cl::CommandQueue & queue
            )
override, virtual

Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.

Note
The queue is not flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
Parameters
    [in]      tensors  A vector containing the tensors to operate on.
    [in]      window   Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
    [in,out]  queue    Command queue on which to enqueue the kernel.

Reimplemented from ICLKernel.
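
A minimal dispatch sketch (assumptions: gemm_kernel has been configured as in the configure() example above, the CLScheduler has been initialised, and cl_a, cl_b, cl_dst are allocated CL tensors matching the ITensorInfo objects used at configure time):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void run_example(opencl::kernels::ClGemmMatrixMultiplyKernel &gemm_kernel,
                     const ICLTensor &cl_a, const ICLTensor &cl_b, ICLTensor &cl_dst)
    {
        // Pack the runtime tensors under the slots the kernel expects.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, &cl_a);
        pack.add_const_tensor(TensorType::ACL_SRC_1, &cl_b);
        pack.add_tensor(TensorType::ACL_DST, &cl_dst);

        cl::CommandQueue &queue = CLScheduler::get().queue();
        gemm_kernel.run_op(pack, gemm_kernel.window(), queue);
        queue.finish(); // run_op does not flush the queue; synchronize before reading cl_dst
    }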

Definition at line 464 of file ClGemmMatrixMultiplyKernel.cpp.

References ClGemmMatrixMultiplyKernel::_add_bias, ClGemmMatrixMultiplyKernel::_reinterpret_input_as_3d, ClGemmMatrixMultiplyKernel::_reinterpret_output_as_3d, ClGemmMatrixMultiplyKernel::_slide_matrix_b, arm_compute::ACL_DST, arm_compute::ACL_SRC_0, arm_compute::ACL_SRC_1, arm_compute::ACL_SRC_2, ICLKernel::add_2D_tensor_argument(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW, ARM_COMPUTE_ERROR_ON_NULLPTR, ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL, Window::DimX, Window::DimY, arm_compute::enqueue(), Window::first_slice_window_3D(), ITensorPack::get_const_tensor(), ITensorPack::get_tensor(), ICLKernel::lws_hint(), ICLKernel::num_arguments_per_2D_tensor(), Window::set(), arm_compute::test::validation::reference::slice(), Window::slide_window_slice_3D(), and IKernel::window().

{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));

    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
    ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);

    if(src1->info()->num_dimensions() < 3)
    {
        // The stride_z for matrix B must be zero if we do not slice
        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
    }

    Window slice          = window.first_slice_window_3D();
    Window slice_matrix_b = slice;

    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;

    if(_reinterpret_input_as_3d)
    {
        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
        const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
    }

    if(_reinterpret_output_as_3d)
    {
        // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
    }

    do
    {
        Window slice_b = slice;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
        if(!_slide_matrix_b)
        {
            slice_b = slice_matrix_b;
        }

        unsigned int idx = 0;
        add_2D_tensor_argument(idx, src0, slice);
        add_2D_tensor_argument(idx, src1, slice_b);
        if(_add_bias)
        {
            add_2D_tensor_argument(idx, src2, slice);
        }
        add_2D_tensor_argument(idx, dst, slice);
        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
        if(_add_bias)
        {
            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
        }
        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
        enqueue(queue, *this, slice, lws_hint());
    }
    while(window.slide_window_slice_3D(slice));
}

◆ validate()

static Status validate ( const ITensorInfo *         src0,
                         const ITensorInfo *         src1,
                         const ITensorInfo *         src2,
                         const ITensorInfo *         dst,
                         float                       alpha,
                         float                       beta,
                         bool                        is_interleaved_transposed,
                         const GEMMReshapeInfo &     reshape_info,
                         GPUTarget                   gpu_target,
                         bool                        fp_mixed_precision = false,
                         const ActivationLayerInfo & activation_info = ActivationLayerInfo()
                       )

Static function to check if given info will lead to a valid configuration.

Similar to ClGemmMatrixMultiplyKernel::configure()

Returns
a status
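
Typical use is to check support before calling configure(), e.g. (a sketch; the tensor infos and the GPU target are assumptions carried over from the configure() example above):

    #include <iostream>

    using namespace arm_compute;

    void validate_example(const ITensorInfo &a_info, const ITensorInfo &b_info, const ITensorInfo &d_info)
    {
        const Status status = opencl::kernels::ClGemmMatrixMultiplyKernel::validate(
            &a_info, &b_info, nullptr /* no bias */, &d_info,
            1.0f /* alpha */, 0.0f /* beta */,
            false /* is_interleaved_transposed */,
            GEMMReshapeInfo(32 /* m */, 16 /* n */, 64 /* k */),
            GPUTarget::MIDGARD);
        if(status.error_code() != ErrorCode::OK)
        {
            std::cerr << "Unsupported GEMM configuration: " << status.error_description() << "\n";
        }
    }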

Definition at line 442 of file ClGemmMatrixMultiplyKernel.cpp.

References ARM_COMPUTE_RETURN_ON_ERROR, ARM_COMPUTE_UNUSED, and ICloneable< T >::clone().

{
    // Note: num_elements_processed will be set in validate_and_configure_window()
    ElementsProcessed num_elements_processed{};
    ARM_COMPUTE_UNUSED(alpha);
    ARM_COMPUTE_UNUSED(activation_info);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
                                                              src1->clone().get(),
                                                              (src2 != nullptr) ? src2->clone().get() : nullptr,
                                                              dst->clone().get(),
                                                              beta,
                                                              is_interleaved_transposed,
                                                              reshape_info,
                                                              gpu_target,
                                                              num_elements_processed)
                                .first);

    return Status{};
}

Field Documentation

◆ _add_bias

bool _add_bias { false }

◆ _reinterpret_input_as_3d

bool _reinterpret_input_as_3d { false }

◆ _reinterpret_output_as_3d

bool _reinterpret_output_as_3d { false }

◆ _slide_matrix_b

bool _slide_matrix_b { true }

The documentation for this class was generated from the following files:
    ClGemmMatrixMultiplyKernel.h
    ClGemmMatrixMultiplyKernel.cpp