/*
 * Vector-width specific helper macros, followed by the width-independent
 * PERFORM_REDUCTION dispatch macros.
 *
 * Four variants are visible below. Each one:
 *   - defines multiply_by_quantized_multiplier(input, qmul, shift) as a
 *     forward to MULTIPLY_BY_QUANTIZED_MULTIPLIER with the vector width
 *     (2, 4, 8 or 16) passed as the fourth argument;
 *   - defines PERFORM_REDUCTION_IMPL(type), which generates an inline function
 *     named perform_reduction_<type> that takes a vector of that width and
 *     returns a VEC_DATA_TYPE(type, 1).
 *
 * In the visible reduction steps the upper half of the vector is added onto
 * the lower half (s89abcdef onto s01234567 for width 16, s4567 onto s0123 for
 * widths 8 and 16). The remaining steps and the return statement are not part
 * of this excerpt.
 *
 * NOTE(review): the preprocessor conditions that select exactly one variant
 * are not visible here - presumably they test VEC_SIZE; confirm against the
 * full file.
 *
 * PERFORM_REDUCTION(input, type) goes through PERFORM_REDUCTION_STR so that
 * `type` is macro-expanded before it is pasted onto perform_reduction_ with ##
 * (operands of ## are not expanded when the macro is invoked directly).
 */
27 #define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
28 #define PERFORM_REDUCTION_IMPL(type) \
29 inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
35 #define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
36 #define PERFORM_REDUCTION_IMPL(type) \
37 inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
44 #define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
45 #define PERFORM_REDUCTION_IMPL(type) \
46 inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
48 sum.s0123 += sum.s4567; \
55 #define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
56 #define PERFORM_REDUCTION_IMPL(type) \
57 inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
59 sum.s01234567 += sum.s89abcdef; \
60 sum.s0123 += sum.s4567; \
67 #define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
68 #define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
// Fragments of a routine that derives a (multiplier, shift) pair and stores it
// in stddev_inv.s0 / stddev_inv.s1. The function signature, the branch
// conditions, the loop bodies and the return statements are not visible in
// this excerpt, so the notes below describe only the statements shown.

// Initial values of the result pair.
87 int stddev_inv_multiplier = INT_MAX;
88 int stddev_inv_shift = 0;
// First place the pair is written out, still holding the values above unless
// something not visible changed them.
// NOTE(review): presumably an early-return path for a special-case input -
// the guarding condition is outside this excerpt; confirm.
92 stddev_inv.s0 = stddev_inv_multiplier;
93 stddev_inv.s1 = stddev_inv_shift;
// Starting shift for the main path.
97 stddev_inv_shift = 11;
// Loop runs while input is at least 2^29; its body is not visible here.
98 while(
input >= (1 << 29))
// Number of bits input can still be shifted left: leading zeros minus one.
104 const unsigned int max_left_shift_bits = clz(
input) - 1;
// Work in pairs of bits; one pair fewer than the maximum is used.
105 const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2;
// NOTE(review): unsigned arithmetic - this wraps if max_left_shift_bits_pairs
// is 0, i.e. if clz(input) < 3. Presumably the loop above leaves
// input < 2^29 so that cannot happen; confirm.
106 const unsigned int left_shift_bit_pairs = max_left_shift_bits_pairs - 1;
// Each bit pair shifted into input (a factor of 4) is matched by one step
// subtracted from stddev_inv_shift.
107 stddev_inv_shift -= left_shift_bit_pairs;
108 input <<= 2 * left_shift_bit_pairs;
// Fixed-point bookkeeping on plain int raw values. FixedPoint3 and FixedPoint0
// are both aliases of the raw type; the suffix is naming only.
110 typedef int FixedPointRawType;
111 const unsigned int fixedpoint_position = 3;
// Bit index that represents 1.0 in the FixedPoint3 values below:
// total bits - 1 - fixedpoint_position, i.e. 28 for a 32-bit int.
112 const unsigned int fixedpoint_int_position =
sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
113 typedef FixedPointRawType FixedPoint3;
114 typedef FixedPointRawType FixedPoint0;
// Halve the (already left-shifted) input.
116 const FixedPoint3 fixedpoint_input = (
input >> 1);
// 1.0 + 0.5 = 1.5 when (1 << fixedpoint_int_position) stands for 1.0.
118 const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
// Starting value of x: 1.0 in that same scale.
119 FixedPoint3 x = 0x1 << fixedpoint_int_position;
// Fixed number of refinement passes over x; the loop body is not visible.
// NOTE(review): presumably Newton-Raphson steps for 1/sqrt - confirm against
// the full file.
121 const int num_iteration = 5;
122 for(
int i = 0; i < num_iteration; i++)
// 1518500250 is approximately (sqrt(2) / 2) * 2^31; the statement that uses it
// is not visible in this excerpt.
127 const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
// The refined x becomes the multiplier.
129 stddev_inv_multiplier = x;
// A negative shift is folded into the multiplier so that the stored shift is
// never negative.
130 if(stddev_inv_shift < 0)
// NOTE(review): left shift of a signed int; the result must stay within int
// range - confirm the bounds on stddev_inv_shift guarantee that.
132 stddev_inv_multiplier <<= -stddev_inv_shift;
133 stddev_inv_shift = 0;
// reverse_shift is defined outside this excerpt.
// NOTE(review): presumably +1 or -1, selecting the caller's sign convention
// for the shift - confirm.
135 stddev_inv_shift *= reverse_shift;
// Publish the final pair: .s0 = multiplier, .s1 = shift.
137 stddev_inv.s0 = stddev_inv_multiplier;
138 stddev_inv.s1 = stddev_inv_shift;
// Fragments of the kernel section. The kernel signature, the declarations of
// i, sum, sum_sq, input and output, and the bodies between the lines shown are
// not visible in this excerpt.

// Everything below is compiled only when all of these build-time macros are
// defined.
142 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
// Scalar loop that continues from the current value of i up to WIDTH and reads
// one DATA_TYPE element per iteration from column i of input.
// NOTE(review): presumably handles the elements left over after a vectorized
// loop - that loop is not visible here; confirm.
202 for(; i <
WIDTH; ++i)
204 DATA_TYPE data = *((__global DATA_TYPE *)
offset(&
input, i, 0));
// temp = 2^20 / WIDTH, using integer division.
// NOTE(review): truncates when WIDTH does not divide 2^20, which slightly
// biases the sum_sq term below - confirm this is acceptable.
210 int temp = 0x100000 /
WIDTH;
// Mean of the row scaled by 1024 (10 fractional bits).
// NOTE(review): sum.s0 * 1024 is evaluated in the type of sum.s0 before the
// division; confirm it cannot overflow for the supported WIDTH and DATA_TYPE.
211 int mean = (int)(sum.s0 * 1024 / WIDTH);
// sum_sq * (2^20 / WIDTH) minus mean^2 (mean carries a 2^10 scale, so mean^2
// carries 2^20), computed in long and then divided by 2^20 to drop the scale.
212 int var2 = ((sum_sq.s0 * (long)temp) - ((long)mean * (
long)mean)) / 0x100000;
// Drop 10 fractional bits; adding 512 (half of 1024) first rounds instead of
// truncating. The statements that produce this res are not visible here.
228 res = (res + 512) >> 10;
// Optional clamping hooks; the statements inside these two guards are not
// visible in this excerpt.
230 #if defined(MIN_BOUND)
232 #endif // defined(MIN_BOUND)
233 #if defined(MAX_BOUND)
235 #endif // defined(MAX_BOUND)
// Second scalar loop from the current i up to WIDTH, one element at a time.
239 for(; i <
WIDTH; ++i)
241 DATA_TYPE data = *((__global DATA_TYPE *)
offset(&
input, i, 0));
// Bring the element to the same 2^10 scale as mean, then subtract the mean.
242 int res = (int)data * 1024 - mean;
// Round while dropping the 10 fractional bits (the steps between the previous
// line and this one are not visible in this excerpt).
249 res = (res + 512) >> 10;
// Clamp to the optional compile-time bounds.
251 #if defined(MIN_BOUND)
252 res = max(res, MIN_BOUND);
253 #endif // defined(MIN_BOUND)
254 #if defined(MAX_BOUND)
255 res = min(res, MAX_BOUND);
256 #endif // defined(MAX_BOUND)
// Store the result in column i of output, converted back to DATA_TYPE.
257 *((__global DATA_TYPE *)
offset(&output, i, 0)) = (DATA_TYPE)res;