#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)

#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)

/** Divide the stored fixed-point exponentials by the row sum computed by the
 * max_shift_exp_sum kernels, producing the final quantized softmax output. */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the row sum computed by the max_shift_exp_sum kernel
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
    uint sum_val_u                = convert_uint(sum_val);
    int  headroom_plus_one        = clz(sum_val_u);
    int  num_bits_over_unit       = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int  shifted_sum_minus_one_1  = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
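    /*
     * Fixed-point reciprocal of the row sum: shifting sum_val_u left by
     * clz(sum_val_u) normalises it into [2^31, 2^32), i.e. into [1, 2) when
     * read as Q1.31. Subtracting 1 << 31 yields x = shifted_sum - 1 in [0, 1),
     * so ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 can evaluate
     * 1 / (1 + x) = 1 / shifted_sum. num_bits_over_unit records how far the sum
     * was shifted so the final rounding divide can undo the scaling.
     */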
    VEC_INT data_diff      = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
    VEC_INT data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    // Recompute exp(diff) in fixed point, then multiply by the reciprocal of the row sum
    VEC_INT data          = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
    VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
    data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
#ifdef QASYMM8_SIGNED
    data += (VEC_INT)(MIN_VALUE); // shift the result into the signed quantized range
#endif /* QASYMM8_SIGNED */
    // Differences below DIFF_MIN contribute a negligible exponential: flush them to the minimum output
    data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));

    VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)

#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)

/** Apply the beta scaling factor to a vector of logit differences, in fixed point. */
VEC_INT mult_by_quantized_multiplier(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}
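/*
 * Note: ASYMM_MULT(a, b, size) is a gemmlowp-style saturating rounding doubling
 * high multiply, returning roughly (a * b) / 2^31 with rounding. Folding beta
 * into the pair (INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT) therefore scales
 * the differences by INPUT_BETA_MULTIPLIER / 2^31 * 2^INPUT_BETA_LEFT_SHIFT.
 * For example, a multiplier of 0x60000000 (0.75 in Q0.31) with a left shift of
 * 1 applies an effective scale of 0.75 * 2 = 1.5.
 */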
/** Find the maximum of a row, then compute exp(x - max) in fixed point together
 * with its horizontal sum. Serial variant: one work-item scans a whole row. */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
    VEC_INT  widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
    max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        max_val_vec   = max(data, max_val_vec);
    }
    // Perform the horizontal max reduction and store the row maximum
    DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    *((__global DATA_TYPE *)offset(&maxo, 0, 0)) = max_local;

    /* Second section: compute exp(x - max) in fixed point and accumulate the row sum */

    VEC_INT sum1D   = 0;
    int     max_val = convert_int(max_local);
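    /*
     * Subtracting the row maximum keeps every difference <= 0, which is what
     * allows ASYMM_EXP_ON_NEGATIVE_VALUES to be used below: it only has to
     * approximate exp(x) for x <= 0, where the result stays in (0, 1] and fits
     * the fixed-point representation without overflow.
     */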
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT data_fp   = CONVERT(data, VEC_INT);
    VEC_INT data_diff = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
    data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
    data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
    (data_diff, 0, (__global int *)dst_addr);
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D += select(0, data_fp, widx);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        VEC_INT  data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

    // Perform the horizontal sum reduction and store the row sum
    *((__global int *)offset(&sum, 0, 0)) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
/** Find the maximum of a row, then compute exp(x - max) in fixed point together
 * with its horizontal sum. Parallel variant: GRID_SIZE work-items cooperate per row. */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    const uint lid    = get_local_id(0);
    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
    // One partial-reduction slot per work-item, plus the broadcast row maximum
    __local VEC_INT tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE max_val_vec = vec_min_val;
    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
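    /*
     * Each work-item consumes 'width' full vectors, striding GRID_SIZE * VECTOR_SIZE
     * elements between loads. For example, with SRC_WIDTH = 512, GRID_SIZE = 4 and
     * VECTOR_SIZE = 16 (LOG_VECTOR_SIZE = 4), every work-item loads
     * (512 / 4) >> 4 = 8 vectors.
     */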
    uint i = 0;
    for(; i < width; ++i)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to cover the remaining full vectors of the row
    int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == (boundary_workitems - 1))
    {
        // Handle the partial vector at the end of the row
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        VEC_INT  widx     = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
        max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
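    /*
     * Classic binary tree reduction in local memory: each stage halves the
     * number of active work-items (128, 64, ..., 2), the compile-time GRID_SIZE
     * checks strip stages that are not needed, and the barrier after every
     * stage makes the partial maxima visible to the next one.
     */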
    if(lid == 0)
    {
        max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
        max_local   = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    /* Second section: compute exp(x - max) in fixed point and accumulate the row sum */

    VEC_INT sum1D   = 0;
    int     max_val = convert_int(max_local);
    for(i = 0; i < width; ++i)
    {
        VEC_BASE data    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT  data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT  data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == (boundary_workitems - 1))
    {
        // Handle the partial vector at the end of the row
        VEC_BASE data    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        VEC_INT  data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
        data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
        VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
        data_fp = select(0, data_fp, widx);
        sum1D   = sum1D + data_fp;
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] += tmp_local[lid + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] += tmp_local[lid + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] += tmp_local[lid + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] += tmp_local[lid + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] += tmp_local[lid + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] += tmp_local[lid + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] += tmp_local[lid + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
        // Perform the horizontal sum reduction and store the row results
        *((__global int *)offset(&sum, 0, 0))        = SUM_REDUCE(sum1D, VECTOR_SIZE);
        *((__global DATA_TYPE *)offset(&maxo, 0, 0)) = max_local;
    }
}
#endif /* defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) */
#endif /* defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN) */