52 using ElementsProcessed =
Steps;
62 if(!is_interleaved_transposed)
75 const int m = reshape_info.
m();
76 const int n = reshape_info.
n();
77 const int k = reshape_info.
k();
82 tensor_shape0.
set(0, k);
83 tensor_shape0.
set(1, m);
86 tensor_shape1.
set(0, n);
87 tensor_shape1.
set(1, k);
89 const TensorInfo tensor_info0 = input0->
clone()->set_tensor_shape(tensor_shape0);
90 const TensorInfo tensor_info1 = input1->
clone()->set_tensor_shape(tensor_shape1);
111 GPUTarget gpu_target, ElementsProcessed &num_elements_processed)
117 tensor_shape.
set(0, is_interleaved_transposed ? reshape_info.
n() : input1->
dimension(0));
118 tensor_shape.
set(1, is_interleaved_transposed ? reshape_info.
m() : input0->
dimension(1));
122 bool window_changed =
false;
126 unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
127 unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
129 if(is_interleaved_transposed)
132 num_elems_processed_per_iteration_x = max_gc_vector_width /
data_size_from_type(data_type);
133 num_elems_processed_per_iteration_y = 4;
137 AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
138 AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
139 AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
148 num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->
dimension(1)), 4);
153 num_elems_processed_per_iteration_x = 4;
157 num_elems_processed_per_iteration_x = max_gc_vector_width /
data_size_from_type(data_type);
169 AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
179 return std::make_pair(err, win);
184 : _input0(nullptr), _input1(nullptr), _output(nullptr)
202 ElementsProcessed num_elements_processed{};
205 auto win_config = validate_and_configure_window(input0->
info(), input1->
info(), output->
info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
207 IGCKernel::configure(win_config.second);
210 std::set<std::string> build_opts;
221 if(is_interleaved_transposed)
223 const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
224 const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
232 build_opts.emplace(
"#define DATA_TYPE_FP16");
236 build_opts.emplace(
"#define DATA_TYPE_FP32");
244 build_opts.emplace(
"#define GEMM_MM_INTERLEAVED_TRANSPOSED");
246 kernel_name =
"gemm_mm_interleaved_transposed";
256 build_opts.emplace(
"#define DATA_TYPE_FP16");
257 build_opts.emplace(
"#define MM_PROCESS_4X_OPTIMIZED");
258 build_opts.emplace(
"#define GEMM_MM_FLOATING_POINT");
262 build_opts.emplace(
"#define DATA_TYPE_FP32");
266 build_opts.emplace(
"#define GEMM_MM_FLOATING_POINT_BIFROST");
270 build_opts.emplace(
"#define GEMM_MM_FLOATING_POINT");
282 kernel_name =
"gemm_mm_floating_point";
293 ElementsProcessed num_elements_processed{};
296 input1->
clone().get(),
297 output->
clone().get(),
298 is_interleaved_transposed,
301 num_elements_processed)
326 slice_b = slice_matrix_b;
329 unsigned int idx = 0;
334 _kernel.update_shader_params();
Window first_slice_window_2D() const
First 2D slice of the window.
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
int mult_interleave4x4_height() const
Multiplication factor for the height of the 4x4 interleaved block.
const Window & window() const
The maximum window the kernel can be executed on.
void run(const Window &window) override
Enqueue the OpenGL ES shader to process the given window.
TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width=1)
Calculate the transposed 1xW width element shape.
void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws=gles::NDRange(1U, 1U, 1U))
Add the kernel to the command queue with the given window.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
int mult_transpose1xW_width() const
Multiplication factor for the width of the 1xW transposed block.
GEMM reshape information class.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
GCGEMMMatrixMultiplyKernel()
Default constructor.
Store the tensor's metadata.
Interface for GLES Compute tensor.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height=1, bool reinterpret_input_as_3d=false)
Calculate the interleaved shape of an input tensor.
bool slide_window_slice_2D(Window &slice) const
Slide the passed 2D window slice.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
Implementation of a static rectangular access pattern.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
int n() const
Number of matrix B columns.
Implementation of a rectangular access pattern.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context...
Class to describe a number of elements in each dimension.
static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
Static function to check if given info will lead to a valid configuration of GCGEMMMatrixMultiplyKern...
void configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed=true, const GEMMReshapeInfo &reshape_info=GEMMReshapeInfo())
Initialise the kernel's input, output and alpha.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
size_t data_size_from_type(DataType data_type)
The size in bytes of the data type.
int k() const
Number of matrix A columns or matrix B rows.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx...
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Wrapper to configure the Khronos EGL and OpenGL ES C header.
static GCKernelLibrary & get()
Get the static instance of GCKernelLibrary.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Implementation of a XY-transpose access pattern.
GPUTarget
Available GPU Targets.
int m() const
Number of matrix A rows.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set={}) const
Creates a kernel from the kernel library.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Store the tensor's metadata.
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
GPUTarget get_target() const
Get the targeted GPU architecture.
Container for valid region of a window.
DataType
Available data types.
Describe a multidimensional execution window.
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true, bool increase_dim_unit=true)
Accessor to set the value of one of the dimensions.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)