27 #define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
29 #define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
31 #define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
33 #define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
35 #define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
36 #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
38 #if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
40 #define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
41 #define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
76 __kernel
void pixelwise_mul_int(
79 #
if !defined(IN_PLACE)
84 size_t x = max((
int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
85 size_t y = get_global_id(1);
86 size_t z = get_global_id(2);
88 __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
89 __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
91 #if !defined(IN_PLACE)
92 out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
93 #else // !defined(IN_PLACE)
94 #if defined(SRC1_IN_PLACE)
96 #else //defined(SRC1_IN_PLACE)
98 #endif //defined(SRC1_IN_PLACE)
99 #endif // !defined(IN_PLACE)
102 VEC_ACC_TYPE in1_data =
CONVERT((
VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))
VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE);
103 VEC_ACC_TYPE in2_data =
CONVERT((
VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))
VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE);
105 VEC_OUT_TYPE out_data0 =
MUL_OP(in1_data, in2_data,
scale, DATA_TYPE_OUT, VEC_SIZE_OUT);
106 STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
110 #if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT)
112 #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
113 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
114 #define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
154 __kernel
void pixelwise_mul_quantized(
157 #
if !defined(IN_PLACE)
162 size_t x = max((
int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
163 size_t y = get_global_id(1);
164 size_t z = get_global_id(2);
166 __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
167 __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
169 #if !defined(IN_PLACE)
170 out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
171 #else // !defined(IN_PLACE)
172 #if defined(SRC1_IN_PLACE)
174 #else //defined(SRC1_IN_PLACE)
176 #endif //defined(SRC1_IN_PLACE)
177 #endif // !defined(IN_PLACE)
180 VEC_INT in_a =
CONVERT((VEC_TYPE)(
VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
181 VEC_INT in_b =
CONVERT((VEC_TYPE)(
VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
184 #if defined(OFFSET_IN1)
185 in_a -= (VEC_INT)((
int)OFFSET_IN1);
186 #endif // defined(OFFSET_IN1)
187 #if defined(OFFSET_IN2)
188 in_b -= (VEC_INT)((
int)OFFSET_IN2);
189 #endif // defined(OFFSET_IN2)
193 #if defined(OFFSET_OUT)
195 #else // defined(OFFSET_OUT)
197 #endif // defined(OFFSET_OUT)
201 STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);