26 #if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) 61 __kernel
void softmax_layer_norm(
66 const int x_offs = max((
int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) *
sizeof(
DATA_TYPE);
68 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
69 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
78 #if defined(LOG_SOFTMAX) 79 sum_val = log(sum_val);
81 #else // defined(LOG_SOFTMAX) 83 #endif // defined(LOG_SOFTMAX) 88 #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) 91 #if !defined(GRID_SIZE) 95 #define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) 96 #define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) 143 __kernel
void softmax_layer_max_shift_exp_sum_serial(
149 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
150 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
157 VEC_TYPE beta = (VEC_TYPE)BETA;
161 VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
163 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 164 VEC_TYPE data =
VLOAD(VECTOR_SIZE)(0, (__global
DATA_TYPE *)src_addr);
166 max_val_vec = max(max_val_vec,
select((VEC_TYPE)(MINVAL), data, widx));
169 for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
172 max_val_vec = max(data, max_val_vec);
184 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 191 (data, 0, (__global
DATA_TYPE *)dst_addr);
193 data =
select(0, data, widx);
196 data =
select(0, data, widx);
198 (data, 0, (__global
DATA_TYPE *)dst_addr);
204 for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
272 __kernel
void softmax_layer_max_shift_exp_sum_parallel(
278 const uint lid = get_local_id(0);
279 const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) *
sizeof(
DATA_TYPE);
281 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
282 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
289 VEC_TYPE beta = (VEC_TYPE)BETA;
293 __local VEC_TYPE tmp_local[GRID_SIZE];
296 VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
299 const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
302 for(; i < width; ++i)
304 VEC_TYPE data_max =
VLOAD(VECTOR_SIZE)(0, (__global
DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
305 max_val_vec = max(data_max, max_val_vec);
307 #ifdef NON_MULTIPLE_OF_GRID_SIZE 310 int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
311 if(lid < boundary_workitems)
313 VEC_TYPE data_max =
VLOAD(VECTOR_SIZE)(0, (__global
DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
314 max_val_vec = max(data_max, max_val_vec);
316 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 321 VEC_TYPE data_max =
VLOAD(VECTOR_SIZE)(0, (__global
DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER *
sizeof(
DATA_TYPE)));
323 max_val_vec = max(max_val_vec,
select((VEC_TYPE)(MINVAL), data_max, widx));
327 tmp_local[lid] = max_val_vec;
329 barrier(CLK_LOCAL_MEM_FENCE);
335 tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
337 barrier(CLK_LOCAL_MEM_FENCE);
343 tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
345 barrier(CLK_LOCAL_MEM_FENCE);
351 tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
353 barrier(CLK_LOCAL_MEM_FENCE);
359 tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
361 barrier(CLK_LOCAL_MEM_FENCE);
367 tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
369 barrier(CLK_LOCAL_MEM_FENCE);
375 tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
377 barrier(CLK_LOCAL_MEM_FENCE);
383 tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
385 barrier(CLK_LOCAL_MEM_FENCE);
389 max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
390 max_local =
MAX_REDUCE(max_val_vec, VECTOR_SIZE);
392 barrier(CLK_LOCAL_MEM_FENCE);
401 for(i = 0; i < width; ++i)
403 VEC_TYPE data =
VLOAD(VECTOR_SIZE)(0, (__global
DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
410 (data, 0, (__global
DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
415 (data, 0, (__global
DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
419 #ifdef NON_MULTIPLE_OF_GRID_SIZE 421 boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
422 if(lid < boundary_workitems)
431 (data, 0, (__global
DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
436 (data, 0, (__global
DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) *
sizeof(
DATA_TYPE)));
440 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 444 VEC_TYPE data =
VLOAD(VECTOR_SIZE)(0, (__global
DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER *
sizeof(
DATA_TYPE)));
451 (data, 0, (__global
DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER *
sizeof(
DATA_TYPE)));
453 data =
select(0, data, widx);
456 data =
select(0, data, widx);
458 (data, 0, (__global
DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER *
sizeof(
DATA_TYPE)));
464 tmp_local[lid] = sum1D;
466 barrier(CLK_LOCAL_MEM_FENCE);
472 tmp_local[lid] += tmp_local[lid + 128];
474 barrier(CLK_LOCAL_MEM_FENCE);
480 tmp_local[lid] += tmp_local[lid + 64];
482 barrier(CLK_LOCAL_MEM_FENCE);
488 tmp_local[lid] += tmp_local[lid + 32];
490 barrier(CLK_LOCAL_MEM_FENCE);
496 tmp_local[lid] += tmp_local[lid + 16];
498 barrier(CLK_LOCAL_MEM_FENCE);
504 tmp_local[lid] += tmp_local[lid + 8];
506 barrier(CLK_LOCAL_MEM_FENCE);
512 tmp_local[lid] += tmp_local[lid + 4];
514 barrier(CLK_LOCAL_MEM_FENCE);
520 tmp_local[lid] += tmp_local[lid + 2];
522 barrier(CLK_LOCAL_MEM_FENCE);
526 sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
532 #endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) 533 #endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) __global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)
#define VSTORE_PARTIAL(size, store_size)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define MAX_REDUCE(x, size)
SimpleTensor< float > src
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
#define SELECT_DATA_TYPE(type)
Structure to hold Image information.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
__global uchar * ptr
Pointer to the starting position of the buffer.
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
#define SUM_REDUCE(x, size)
#define TENSOR3D_DECLARATION(name)
#define VEC_DATA_TYPE(type, size)