104 #
if defined(HAS_BIAS)
110 #define _IWEI_WIDTH WEI_WIDTH 111 #define _IWEI_HEIGHT WEI_HEIGHT 112 #define _IWEI_DEPTH WEI_DEPTH 113 #define _ISRC_WIDTH SRC_WIDTH 114 #define _ISRC_HEIGHT SRC_HEIGHT 115 #define _ISRC_DEPTH SRC_DEPTH 116 #define _ISRC_CHANNELS SRC_CHANNELS 117 #define _IDST_WIDTH DST_WIDTH 118 #define _IDST_HEIGHT DST_HEIGHT 119 #define _IDST_DEPTH DST_DEPTH 120 #define _IDST_CHANNELS DST_CHANNELS 121 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT * _IWEI_DEPTH) 124 #if defined(IS_QUANTIZED) 125 #define _IOUTPUT_TILE cq 126 #else // defined(IS_QUANTIZED) 127 #define _IOUTPUT_TILE c 128 #endif // defined(IS_QUANTIZED) 134 TILE(
int, M0, 1, xi);
135 TILE(
int, M0, 1, yi);
136 TILE(
int, M0, 1, zi);
147 zi[i].v -= PAD_FRONT;
151 TILE(ACC_DATA_TYPE, M0, N0, c);
155 c[i].v = (ACC_DATA_TYPE)0;
168 TILE(DATA_TYPE, M0, K0, a);
169 TILE(DATA_TYPE, N0, K0,
b);
177 T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, K0, BUFFER,
src, bout, zk, yk, xk, ck,
_ISRC_WIDTH,
_ISRC_HEIGHT,
_ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
187 b[i].s[j] = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) *
sizeof(DATA_TYPE) + j * wei_stride_y + b_offs * wei_stride_y);
193 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a,
b, c);
202 #if((_ISRC_CHANNELS % K0) != 0) 206 TILE(DATA_TYPE, M0, 1, a);
207 TILE(DATA_TYPE, N0, 1,
b);
215 T_LOAD_NDHWC_INDIRECT(DATA_TYPE, M0, 1, BUFFER,
src, bout, zk, yk, xk, ck,
_ISRC_WIDTH,
_ISRC_HEIGHT,
_ISRC_DEPTH, src_stride_y, xi, yi, zi, a);
223 b[i].v = *(__global DATA_TYPE *)(wei_ptr + wei_offset_first_element_in_bytes + (cout + i) *
sizeof(DATA_TYPE) + b_offs * wei_stride_y);
228 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a,
b, c);
236 #endif // ((_ISRC_CHANNELS % K0) != 0) 243 #if defined(HAS_BIAS) 244 TILE(BIA_DATA_TYPE, 1, N0, bias0);
248 bias0[0].v =
VLOAD(N0)(0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout *
sizeof(BIA_DATA_TYPE)));
253 (bias0[0].v, 0, (__global BIA_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + cout *
sizeof(BIA_DATA_TYPE)));
261 TILE(uint, M0, 1, dst_indirect_y);
270 #
if defined(IS_QUANTIZED)
271 TILE(DATA_TYPE, M0, N0, cq);
275 #endif // defined(IS_QUANTIZED) 277 bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
280 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_N0, BUFFER,
dst, cout, dst_stride_y, x_cond,
_IOUTPUT_TILE, dst_indirect_y);
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst)
Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect...
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0)
Get the get_global_id with partial N0.
#define LOOP_UNROLLING(type, idx, start, step, num, macro)
#define VLOAD_PARTIAL(size, load_size)
SimpleTensor< float > src
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst)
Matrix multiplication.
#define VECTOR_DECLARATION(name)
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)
Quantized the 8-bit tile with fixed-point scale for asymmetric.
#define TILE(DATA_TYPE, H, W, BASENAME)
Tile object A tile object is a 2D memory block and can be accessed using the following syntax:a[m0]...
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst)
Element-wise addition with a constant value.
#define TENSOR4D(name, type)
__kernel void direct_convolution3d_ndhwc(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_stride_w, uint src_step_w, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_stride_w, uint dst_step_w, uint dst_offset_first_element_in_bytes, __global uchar *wei_ptr, uint wei_stride_x, uint wei_step_x, uint wei_stride_y, uint wei_step_y, uint wei_stride_z, uint wei_step_z, uint wei_stride_w, uint wei_step_w, uint wei_offset_first_element_in_bytes, __global uchar *bia_ptr, uint bia_stride_x, uint bia_step_x, uint bia_offset_first_element_in_bytes)
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)
Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a differe...
#define T_ADD_BROADCAST_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
Element-wise addition with RHS broadcasted (RHS has the X dimension only)
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)
Offset correction for the QASYMM8 computation.