26 #if defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X)
60 const uint x_offs = max((
int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
63 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
64 __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + get_global_id(1) * sum_stride_y;
65 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y;
68 in =
VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
71 normalize_value = (
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(*((__global DATA_TYPE *)sum_addr),
epsilon));
74 data0 = in * normalize_value;
76 STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
105 __kernel
void l2_normalize_y(
112 const uint x_offs = max((
int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
115 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
116 __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE);
117 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y;
120 in =
VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
122 sums =
VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr);
128 data0 = in * normalize_value;
130 STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
165 __kernel
void l2_normalize_z(
172 const uint x_offs = max((
int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
175 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
176 __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * sum_stride_y;
177 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
180 in =
VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
182 sums =
VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr);
187 STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
189 #endif // defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X)