26 #if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
// Fragment of a kernel body; the signature, braces and several declarations are
// outside the visible lines. Each work-item computes four results (acc0..acc3):
// the dot products of one weights row with four consecutive input rows.
// First of the four rows handled by this work-item (global id along dim 1, times 4).
54 int y = get_global_id(1) * 4;
// Index along dim 2; used below to pick the weights row and the output position.
55 int z = get_global_id(2);
// Byte pointer to the weights for index z (advanced by z * weights_stride_y bytes).
57 __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
// Base byte pointer of the input; `src` is set up outside the visible lines.
58 __global uchar *input_ptr =
src.ptr;
// One accumulator per input row handled by this work-item.
60 DATA_TYPE acc0 = (DATA_TYPE)0;
61 DATA_TYPE acc1 = (DATA_TYPE)0;
62 DATA_TYPE acc2 = (DATA_TYPE)0;
63 DATA_TYPE acc3 = (DATA_TYPE)0;
// Advance along x in steps of four elements, matching the vload4 width below.
// NOTE(review): presumably relies on SRC_WIDTH being a multiple of 4 or on padded
// buffers, since every iteration reads four elements -- confirm.
66 for(
int i = 0; i < SRC_WIDTH; i += 4)
// Four consecutive weights starting at element i.
69 weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
// Byte offsets of element i in four consecutive rows: lane r adds r * src_stride_y.
71 int4
offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
// Four input elements from each of the four rows.
74 tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s0));
76 tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s1));
78 tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s2));
80 tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s3));
// Add this step's partial dot product to each row's accumulator; all rows share
// the same four weights.
82 acc0 += dot(weights, tmp0);
83 acc1 += dot(weights, tmp1);
84 acc2 += dot(weights, tmp2);
85 acc3 += dot(weights, tmp3);
// Output byte address: element index (y + z * SRC_HEIGHT) scaled by dst_stride_x.
88 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
// Rows remaining after the four handled here; negative when y + 4 exceeds SRC_HEIGHT.
90 int rows_left = SRC_HEIGHT - (y + 4);
// NOTE(review): the condition choosing between the vector store and the scalar
// stores below is not visible; presumably it tests rows_left -- confirm.
// Vector store of `out` (built outside the visible lines) at output_ptr.
97 vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
// Scalar stores of individual results, spaced dst_stride_x bytes apart.
104 *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
106 *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
108 *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
142 const int input_offset, // added to every loaded input element after conversion to int
143 const int weights_offset) // added to every loaded weight after conversion to int
// Fragment of the quantized kernel body; the start of the signature, the braces and
// the acc0..acc3 declarations are outside the visible lines. Same layout as the
// non-quantized variant: each work-item produces four results from four
// consecutive input rows.
// First of the four rows handled by this work-item (global id along dim 1, times 4).
147 int y = get_global_id(1) * 4;
// Index along dim 2; used below to pick the weights row and the output position.
148 int z = get_global_id(2);
// Byte pointer to the weights for index z (advanced by z * weights_stride_y bytes).
150 __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
// Base byte pointer of the input; `src` is set up outside the visible lines.
151 __global uchar *input_ptr =
src.ptr;
// Advance along x in steps of four elements, matching the vload4 width below.
// NOTE(review): presumably relies on SRC_WIDTH being a multiple of 4 or on padded
// buffers, since every iteration reads four elements -- confirm.
159 for(
int i = 0; i < SRC_WIDTH; i += 4)
// Four weights starting at element i, widened to int and shifted by weights_offset.
161 int4
w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
// Byte offsets of element i in four consecutive rows: lane r adds r * src_stride_y.
163 int4
offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
// Four input elements from each row, widened to int and shifted by input_offset.
165 int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s0))) + (int4)input_offset;
166 int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s1))) + (int4)input_offset;
167 int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s2))) + (int4)input_offset;
168 int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr +
offset.s3))) + (int4)input_offset;
// Integer dot product written out lane by lane, accumulated per row.
// NOTE(review): acc0..acc3 are declared outside the visible lines; they are stored
// through `__global int *` below, so presumably they are int -- confirm.
171 acc0 += tmp0.s0 *
w.s0 + tmp0.s1 *
w.s1 + tmp0.s2 *
w.s2 + tmp0.s3 *
w.s3;
172 acc1 += tmp1.s0 *
w.s0 + tmp1.s1 *
w.s1 + tmp1.s2 *
w.s2 + tmp1.s3 *
w.s3;
173 acc2 += tmp2.s0 *
w.s0 + tmp2.s1 *
w.s1 + tmp2.s2 *
w.s2 + tmp2.s3 *
w.s3;
174 acc3 += tmp3.s0 *
w.s0 + tmp3.s1 *
w.s1 + tmp3.s2 *
w.s2 + tmp3.s3 *
w.s3;
// Output byte address: element index (y + z * SRC_HEIGHT) scaled by dst_stride_x.
177 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
// Rows remaining after the four handled here; negative when y + 4 exceeds SRC_HEIGHT.
179 int rows_left = SRC_HEIGHT - (y + 4);
// NOTE(review): the condition choosing between the vector store and the scalar
// stores below is not visible; presumably it tests rows_left -- confirm.
// Vector store of all four accumulators as int.
184 vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global
int *)output_ptr);
// Scalar int stores of individual results, spaced dst_stride_x bytes apart.
191 *((__global
int *)(output_ptr + 2 * dst_stride_x)) = acc2;
193 *((__global
int *)(output_ptr + 1 * dst_stride_x)) = acc1;
195 *((__global
int *)(output_ptr + 0 * dst_stride_x)) = acc0;