28 #if defined(IS_QUANTIZED) 30 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 31 #define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); 32 #elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 33 #define ARM_DOT(x, y, val) val += arm_dot((x), (y)); 34 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 35 #define ARM_DOT(x, y, val) \ 37 val += (ACC_DATA_TYPE)x.s0 * (ACC_DATA_TYPE)y.s0; \ 38 val += (ACC_DATA_TYPE)x.s1 * (ACC_DATA_TYPE)y.s1; \ 39 val += (ACC_DATA_TYPE)x.s2 * (ACC_DATA_TYPE)y.s2; \ 40 val += (ACC_DATA_TYPE)x.s3 * (ACC_DATA_TYPE)y.s3; \ 42 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 44 #define ARM_DOT1(a, b, c) \ 46 ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 3))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 3))0)), c); \ 48 #define ARM_DOT2(a, b, c) \ 50 ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 2))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 2))0)), c); \ 52 #define ARM_DOT3(a, b, c) \ 54 ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (SRC_DATA_TYPE)0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (WEI_DATA_TYPE)0)), c); \ 56 #define ARM_DOT4(a, b, c) \ 60 #define ARM_DOT8(a, b, c) \ 62 ARM_DOT4((a.lo), (b.lo), c); \ 63 ARM_DOT4((a.hi), (b.hi), c); \ 65 #define ARM_DOT16(a, b, c) \ 67 ARM_DOT8((a.lo), (b.lo), c); \ 68 ARM_DOT8((a.hi), (b.hi), c); \ 71 #define ARM_OFFSET1(a, b, c) \ 73 c += (ACC_DATA_TYPE)a * (ACC_DATA_TYPE)b; \ 75 #define ARM_OFFSET2(a, b, c) \ 77 c += (ACC_DATA_TYPE)a.s0 * (ACC_DATA_TYPE)b; \ 78 c += (ACC_DATA_TYPE)a.s1 * (ACC_DATA_TYPE)b; \ 80 #define ARM_OFFSET3(a, b, c) \ 82 ARM_OFFSET2(a, b, c); \ 83 c += (ACC_DATA_TYPE)a.s2 * (ACC_DATA_TYPE)b; \ 85 #define ARM_OFFSET4(a, b, c) \ 87 ARM_OFFSET3(a, b, c); \ 88 c += (ACC_DATA_TYPE)a.s3 * (ACC_DATA_TYPE)b; \ 90 #define ARM_OFFSET8(a, b, c) \ 92 ARM_OFFSET4((a.lo), (b), c); \ 93 ARM_OFFSET4((a.hi), (b), c); \ 95 #define ARM_OFFSET16(a, b, c) \ 97 ARM_OFFSET8((a.lo), (b), c); \ 98 ARM_OFFSET8((a.hi), (b), c); \ 102 #define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \ 104 CONCAT(ARM_OFFSET, k0) \ 105 ((a), (b_offset), (c)); \ 106 CONCAT(ARM_OFFSET, k0) \ 107 ((b##0), (a_offset), (c)); \ 109 #elif N0 == 2 // N) == 3 110 #define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \ 112 CONCAT(ARM_OFFSET, k0) \ 113 ((a), (b_offset), (c.s0)); \ 114 CONCAT(ARM_OFFSET, k0) \ 115 ((b##0), (a_offset), (c.s0)); \ 116 CONCAT(ARM_OFFSET, k0) \ 117 ((a), (b_offset), (c.s1)); \ 118 CONCAT(ARM_OFFSET, k0) \ 119 ((b##1), (a_offset), (c.s1)); \ 121 #elif N0 == 3 // N0 == 3 122 #define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \ 124 CONCAT(ARM_OFFSET, k0) \ 125 ((a), (b_offset), (c.s0)); \ 126 CONCAT(ARM_OFFSET, k0) \ 127 ((b##0), (a_offset), (c.s0)); \ 128 CONCAT(ARM_OFFSET, k0) \ 129 ((a), (b_offset), (c.s1)); \ 130 CONCAT(ARM_OFFSET, k0) \ 131 ((b##1), (a_offset), (c.s1)); \ 132 CONCAT(ARM_OFFSET, k0) \ 133 ((a), (b_offset), (c.s2)); \ 134 CONCAT(ARM_OFFSET, k0) \ 135 ((b##2), (a_offset), (c.s2)); \ 137 #elif N0 == 4 // N0 == 4 138 #define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \ 140 CONCAT(ARM_OFFSET, k0) \ 141 ((a), (b_offset), (c.s0)); \ 142 CONCAT(ARM_OFFSET, k0) \ 143 ((b##0), (a_offset), (c.s0)); \ 144 CONCAT(ARM_OFFSET, k0) \ 145 ((a), (b_offset), (c.s1)); \ 146 CONCAT(ARM_OFFSET, k0) \ 147 ((b##1), (a_offset), (c.s1)); \ 148 CONCAT(ARM_OFFSET, k0) \ 149 ((a), (b_offset), (c.s2)); \ 150 CONCAT(ARM_OFFSET, k0) \ 151 ((b##2), (a_offset), (c.s2)); \ 152 CONCAT(ARM_OFFSET, k0) \ 153 ((a), (b_offset), (c.s3)); \ 154 CONCAT(ARM_OFFSET, k0) \ 155 ((b##3), (a_offset), (c.s3)); \ 157 #elif N0 == 8 // N0 == 8 158 #define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \ 160 CONCAT(ARM_OFFSET, k0) \ 161 ((a), (b_offset), (c.s0)); \ 162 CONCAT(ARM_OFFSET, k0) \ 163 ((b##0), (a_offset), (c.s0)); \ 164 CONCAT(ARM_OFFSET, k0) \ 165 ((a), (b_offset), (c.s1)); \ 166 CONCAT(ARM_OFFSET, k0) \ 167 ((b##1), (a_offset), (c.s1)); \ 168 CONCAT(ARM_OFFSET, k0) \ 169 ((a), (b_offset), (c.s2)); \ 170 CONCAT(ARM_OFFSET, k0) \ 171 ((b##2), (a_offset), (c.s2)); \ 172 CONCAT(ARM_OFFSET, k0) \ 173 ((a), (b_offset), (c.s3)); \ 174 CONCAT(ARM_OFFSET, k0) \ 175 ((b##3), (a_offset), (c.s3)); \ 176 CONCAT(ARM_OFFSET, k0) \ 177 ((a), (b_offset), (c.s4)); \ 178 CONCAT(ARM_OFFSET, k0) \ 179 ((b##4), (a_offset), (c.s4)); \ 180 CONCAT(ARM_OFFSET, k0) \ 181 ((a), (b_offset), (c.s5)); \ 182 CONCAT(ARM_OFFSET, k0) \ 183 ((b##5), (a_offset), (c.s5)); \ 184 CONCAT(ARM_OFFSET, k0) \ 185 ((a), (b_offset), (c.s6)); \ 186 CONCAT(ARM_OFFSET, k0) \ 187 ((b##6), (a_offset), (c.s6)); \ 188 CONCAT(ARM_OFFSET, k0) \ 189 ((a), (b_offset), (c.s7)); \ 190 CONCAT(ARM_OFFSET, k0) \ 191 ((b##7), (a_offset), (c.s7)); \ 193 #elif N0 == 16 // N0 == 16 194 #define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \ 196 CONCAT(ARM_OFFSET, k0) \ 197 ((a), (b_offset), (c.s0)); \ 198 CONCAT(ARM_OFFSET, k0) \ 199 ((b##0), (a_offset), (c.s0)); \ 200 CONCAT(ARM_OFFSET, k0) \ 201 ((a), (b_offset), (c.s1)); \ 202 CONCAT(ARM_OFFSET, k0) \ 203 ((b##1), (a_offset), (c.s1)); \ 204 CONCAT(ARM_OFFSET, k0) \ 205 ((a), (b_offset), (c.s2)); \ 206 CONCAT(ARM_OFFSET, k0) \ 207 ((b##2), (a_offset), (c.s2)); \ 208 CONCAT(ARM_OFFSET, k0) \ 209 ((a), (b_offset), (c.s3)); \ 210 CONCAT(ARM_OFFSET, k0) \ 211 ((b##3), (a_offset), (c.s3)); \ 212 CONCAT(ARM_OFFSET, k0) \ 213 ((a), (b_offset), (c.s4)); \ 214 CONCAT(ARM_OFFSET, k0) \ 215 ((b##4), (a_offset), (c.s4)); \ 216 CONCAT(ARM_OFFSET, k0) \ 217 ((a), (b_offset), (c.s5)); \ 218 CONCAT(ARM_OFFSET, k0) \ 219 ((b##5), (a_offset), (c.s5)); \ 220 CONCAT(ARM_OFFSET, k0) \ 221 ((a), (b_offset), (c.s6)); \ 222 CONCAT(ARM_OFFSET, k0) \ 223 ((b##6), (a_offset), (c.s6)); \ 224 CONCAT(ARM_OFFSET, k0) \ 225 ((a), (b_offset), (c.s7)); \ 226 CONCAT(ARM_OFFSET, k0) \ 227 ((b##7), (a_offset), (c.s7)); \ 228 CONCAT(ARM_OFFSET, k0) \ 229 ((a), (b_offset), (c.s8)); \ 230 CONCAT(ARM_OFFSET, k0) \ 231 ((b##8), (a_offset), (c.s8)); \ 232 CONCAT(ARM_OFFSET, k0) \ 233 ((a), (b_offset), (c.s9)); \ 234 CONCAT(ARM_OFFSET, k0) \ 235 ((b##9), (a_offset), (c.s9)); \ 236 CONCAT(ARM_OFFSET, k0) \ 237 ((a), (b_offset), (c.sA)); \ 238 CONCAT(ARM_OFFSET, k0) \ 239 ((b##A), (a_offset), (c.sA)); \ 240 CONCAT(ARM_OFFSET, k0) \ 241 ((a), (b_offset), (c.sB)); \ 242 CONCAT(ARM_OFFSET, k0) \ 243 ((b##B), (a_offset), (c.sB)); \ 244 CONCAT(ARM_OFFSET, k0) \ 245 ((a), (b_offset), (c.sC)); \ 246 CONCAT(ARM_OFFSET, k0) \ 247 ((b##C), (a_offset), (c.sC)); \ 248 CONCAT(ARM_OFFSET, k0) \ 249 ((a), (b_offset), (c.sD)); \ 250 CONCAT(ARM_OFFSET, k0) \ 251 ((b##D), (a_offset), (c.sD)); \ 252 CONCAT(ARM_OFFSET, k0) \ 253 ((a), (b_offset), (c.sE)); \ 254 CONCAT(ARM_OFFSET, k0) \ 255 ((b##E), (a_offset), (c.sE)); \ 256 CONCAT(ARM_OFFSET, k0) \ 257 ((a), (b_offset), (c.sF)); \ 258 CONCAT(ARM_OFFSET, k0) \ 259 ((b##F), (a_offset), (c.sF)); \ 261 #else // N0 not supported 262 #error "N0 value not supported" 263 #endif // N0 conditions 264 #else // defined(IS_QUANTIZED) 266 #define ARM_DOT1(a, b, c) \ 268 c += (ACC_DATA_TYPE)a * (ACC_DATA_TYPE)b; \ 270 #define ARM_DOT2(a, b, c) \ 272 c += (ACC_DATA_TYPE)a.s0 * (ACC_DATA_TYPE)b.s0; \ 273 c += (ACC_DATA_TYPE)a.s1 * (ACC_DATA_TYPE)b.s1; \ 275 #define ARM_DOT3(a, b, c) \ 278 c += (ACC_DATA_TYPE)a.s2 * (ACC_DATA_TYPE)b.s2; \ 280 #define ARM_DOT4(a, b, c) \ 283 c += (ACC_DATA_TYPE)a.s3 * (ACC_DATA_TYPE)b.s3; \ 285 #define ARM_DOT8(a, b, c) \ 287 ARM_DOT4((a.lo), (b.lo), c); \ 288 ARM_DOT4((a.hi), (b.hi), c); \ 290 #define ARM_DOT16(a, b, c) \ 292 ARM_DOT8((a.lo), (b.lo), c); \ 293 ARM_DOT8((a.hi), (b.hi), c); \ 295 #endif // defined(IS_QUANTIZED) 298 #define ARM_DOT_K0XN0(k0, a, b, c) \ 300 CONCAT(ARM_DOT, k0) \ 301 ((a), (b##0), (c)); \ 303 #elif N0 == 2 // N) == 3 304 #define ARM_DOT_K0XN0(k0, a, b, c) \ 306 CONCAT(ARM_DOT, k0) \ 307 ((a), (b##0), (c.s0)); \ 308 CONCAT(ARM_DOT, k0) \ 309 ((a), (b##1), (c.s1)); \ 311 #elif N0 == 3 // N0 == 3 312 #define ARM_DOT_K0XN0(k0, a, b, c) \ 314 CONCAT(ARM_DOT, k0) \ 315 ((a), (b##0), (c.s0)); \ 316 CONCAT(ARM_DOT, k0) \ 317 ((a), (b##1), (c.s1)); \ 318 CONCAT(ARM_DOT, k0) \ 319 ((a), (b##2), (c.s2)); \ 321 #elif N0 == 4 // N0 == 4 322 #define ARM_DOT_K0XN0(k0, a, b, c) \ 324 CONCAT(ARM_DOT, k0) \ 325 ((a), (b##0), (c.s0)); \ 326 CONCAT(ARM_DOT, k0) \ 327 ((a), (b##1), (c.s1)); \ 328 CONCAT(ARM_DOT, k0) \ 329 ((a), (b##2), (c.s2)); \ 330 CONCAT(ARM_DOT, k0) \ 331 ((a), (b##3), (c.s3)); \ 333 #elif N0 == 8 // N0 == 8 334 #define ARM_DOT_K0XN0(k0, a, b, c) \ 336 CONCAT(ARM_DOT, k0) \ 337 ((a), (b##0), (c.s0)); \ 338 CONCAT(ARM_DOT, k0) \ 339 ((a), (b##1), (c.s1)); \ 340 CONCAT(ARM_DOT, k0) \ 341 ((a), (b##2), (c.s2)); \ 342 CONCAT(ARM_DOT, k0) \ 343 ((a), (b##3), (c.s3)); \ 344 CONCAT(ARM_DOT, k0) \ 345 ((a), (b##4), (c.s4)); \ 346 CONCAT(ARM_DOT, k0) \ 347 ((a), (b##5), (c.s5)); \ 348 CONCAT(ARM_DOT, k0) \ 349 ((a), (b##6), (c.s6)); \ 350 CONCAT(ARM_DOT, k0) \ 351 ((a), (b##7), (c.s7)); \ 353 #elif N0 == 16 // N0 == 16 354 #define ARM_DOT_K0XN0(k0, a, b, c) \ 356 CONCAT(ARM_DOT, k0) \ 357 ((a), (b##0), (c.s0)); \ 358 CONCAT(ARM_DOT, k0) \ 359 ((a), (b##1), (c.s1)); \ 360 CONCAT(ARM_DOT, k0) \ 361 ((a), (b##2), (c.s2)); \ 362 CONCAT(ARM_DOT, k0) \ 363 ((a), (b##3), (c.s3)); \ 364 CONCAT(ARM_DOT, k0) \ 365 ((a), (b##4), (c.s4)); \ 366 CONCAT(ARM_DOT, k0) \ 367 ((a), (b##5), (c.s5)); \ 368 CONCAT(ARM_DOT, k0) \ 369 ((a), (b##6), (c.s6)); \ 370 CONCAT(ARM_DOT, k0) \ 371 ((a), (b##7), (c.s7)); \ 372 CONCAT(ARM_DOT, k0) \ 373 ((a), (b##8), (c.s8)); \ 374 CONCAT(ARM_DOT, k0) \ 375 ((a), (b##9), (c.s9)); \ 376 CONCAT(ARM_DOT, k0) \ 377 ((a), (b##A), (c.sA)); \ 378 CONCAT(ARM_DOT, k0) \ 379 ((a), (b##B), (c.sB)); \ 380 CONCAT(ARM_DOT, k0) \ 381 ((a), (b##C), (c.sC)); \ 382 CONCAT(ARM_DOT, k0) \ 383 ((a), (b##D), (c.sD)); \ 384 CONCAT(ARM_DOT, k0) \ 385 ((a), (b##E), (c.sE)); \ 386 CONCAT(ARM_DOT, k0) \ 387 ((a), (b##F), (c.sF)); \ 389 #else // N0 not supported 390 #error "N0 value not supported" 391 #endif // N0 conditions 463 #
if defined(HAS_BIAS)
466 unsigned int wei_stride_w)
469 #error "M0: Only supported 1" 472 const int cout = max((
int)(get_global_id(0) * N0 - (N0 -
PARTIAL_STORE_N0) % N0), 0);
473 const int mout = get_global_id(1);
474 const int zout = get_global_id(2);
480 #define LINEAR_2_COORDS(i) \ 481 xi##i = ((mout * M0 + i) % DST_WIDTH) * STRIDE_X; \ 482 yi##i = ((mout * M0 + i) / DST_WIDTH) * STRIDE_Y; \ 489 #undef LINEAR_2_COORDS 491 uint src_offset = src_offset_first_element_in_bytes + zout * src_stride_y * (SRC_WIDTH * SRC_HEIGHT);
492 uint wei_offset = wei_offset_first_element_in_bytes + cout * wei_stride_w;
497 for(
int i = 0; i < (WEI_WIDTH * WEI_HEIGHT); ++i)
499 int xk = i % WEI_WIDTH;
500 int yk = i / WEI_WIDTH;
507 mi_valid_row##i = max(min(xi##i + xk, SRC_WIDTH - 1), 0) + max(min(yi##i + yk, SRC_HEIGHT - 1), 0) * SRC_WIDTH; \ 508 mi_mask##i = (xi##i + xk) >= 0 && (xi##i + xk) < SRC_WIDTH && (yi##i + yk) >= 0 && (yi##i + yk) < SRC_HEIGHT; 515 for(; k <= (SRC_CHANNELS - K0); k += K0)
518 LOAD_BLOCK_INDIRECT(M0, K0, SRC_DATA_TYPE, a, src_ptr, src_offset + k *
sizeof(SRC_DATA_TYPE), src_stride_y, mi_valid_row, mi_mask);
521 LOAD_BLOCK(N0, K0, WEI_DATA_TYPE,
b, wei_ptr, wei_offset, wei_stride_w, zero);
523 #if defined(IS_QUANTIZED) 524 #define TENSOR_DOT(K0, i) \ 525 if(mi_mask##i != 0) \ 527 ARM_DOT_K0XN0(K0, a##i, b, c##i); \ 528 ARM_OFFSET_K0XN0(K0, a##i, b, SRC_OFFSET, WEI_OFFSET, c##i); \ 532 ARM_DOT_K0XN0(K0, ((VEC_DATA_TYPE(SRC_DATA_TYPE, K0))ZERO_VALUE), b, c##i); \ 533 ARM_OFFSET_K0XN0(K0, ((VEC_DATA_TYPE(SRC_DATA_TYPE, K0))ZERO_VALUE), b, SRC_OFFSET, WEI_OFFSET, c##i); \ 535 #else // defined(IS_QUANTIZED) 536 #define TENSOR_DOT(K0, i) \ 537 ARM_DOT_K0XN0(K0, a##i, b, c##i); 538 #endif // defined(IS_QUANTIZED) 542 wei_offset += K0 *
sizeof(WEI_DATA_TYPE);
545 #if(SRC_CHANNELS % K0) != 0 547 for(; k < SRC_CHANNELS; ++k)
550 LOAD_BLOCK_INDIRECT(M0, 1, SRC_DATA_TYPE, a, src_ptr, src_offset + k *
sizeof(SRC_DATA_TYPE), src_stride_y, mi_valid_row, mi_mask);
553 LOAD_BLOCK(N0, 1, WEI_DATA_TYPE,
b, wei_ptr, wei_offset, wei_stride_w, zero);
559 wei_offset +=
sizeof(WEI_DATA_TYPE);
561 #endif // (SRC_CHANNELS % K0) != 0 563 c0 += (SRC_CHANNELS * SRC_OFFSET * WEI_OFFSET);
566 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (cout *
sizeof(DST_DATA_TYPE)) + (mout * M0 * dst_stride_y);
569 dst_addr += zout * dst_stride_y * (DST_WIDTH * DST_HEIGHT);
571 #if defined(HAS_BIAS) 572 __global uchar *bias_addr = bia_ptr + bia_offset_first_element_in_bytes + (cout *
sizeof(BIA_DATA_TYPE));
574 LOAD_BLOCK(1, N0, BIA_DATA_TYPE, bias, bias_addr, 0, zero0, zero);
580 #if defined(IS_QUANTIZED) 585 #define QUANTIZE(i) \ 586 c##i = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(c##i, DST_MULTIPLIER, DST_SHIFT, N0); \ 587 c##i = c##i + DST_OFFSET; \ 588 cq##i = CONVERT_SAT(c##i, VEC_DATA_TYPE(DST_DATA_TYPE, N0)); 589 #else // OUTPUT_SHIFT < 0 590 #define QUANTIZE(i) \ 591 c##i = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(c##i, DST_MULTIPLIER, DST_SHIFT, N0); \ 592 c##i = c##i + DST_OFFSET; \ 593 cq##i = CONVERT_SAT(c##i, VEC_DATA_TYPE(DST_DATA_TYPE, N0)); 594 #endif // OUTPUT_SHIFT < 0 601 #else // defined(IS_QUANTIZED) 603 #endif // defined(IS_QUANTIZED) #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS)
SimpleTensor< float > src
#define LINEAR_2_COORDS(i)
#define VECTOR_DECLARATION(name)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
#define QUANTIZE(input, offset, scale, type, size)
__kernel void direct_convolution_nhwc(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *wei_ptr, uint wei_stride_x, uint wei_step_x, uint wei_stride_y, uint wei_step_y, uint wei_stride_z, uint wei_step_z, uint wei_offset_first_element_in_bytes, __global uchar *bia_ptr, uint bia_stride_x, uint bia_step_x, uint bia_offset_first_element_in_bytes, unsigned int wei_stride_w)
OpenCL kernel to compute the direct convolution.
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define TENSOR3D_DECLARATION(name)
#define TENSOR_DOT(K0, i)
#define VEC_DATA_TYPE(type, size)