26 #if defined(SRC_DIM_Z) 28 #define OUTPUT_ROW_2x2_7x7(out, tmp) \ 30 out.s0 = -tmp.s0 / 36.f; \ 31 out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \ 32 out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \ 33 out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \ 34 out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \ 35 out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \ 36 out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \ 66 __kernel
void winograd_filter_transform_2x2_3x3_nchw(
75 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 77 w0 = vload3(0, (__global
DATA_TYPE *)(src_addr));
78 #elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 81 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
82 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
83 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 85 w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
87 w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
89 w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
90 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 96 out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
97 out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
100 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 104 out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
105 out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
106 out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
107 out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
112 out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
113 out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
114 out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
115 out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
121 out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
122 out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
124 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 126 int z = get_global_id(2);
127 int x0 = z / SRC_DIM_Z;
128 int y0 = z % SRC_DIM_Z;
131 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
136 *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
137 *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
138 *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
139 *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
141 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 142 *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
143 *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
144 *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
145 *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
146 *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
147 *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
148 *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
149 *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
150 *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
151 *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
152 *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
153 *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
154 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 183 __kernel
void winograd_filter_transform_4x4_3x3_nchw(
192 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 194 w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
195 #elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 197 w0 = (
VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
198 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
199 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
200 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 202 w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
204 w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
206 w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
207 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 212 out0.s0 = (w0.s0) / 16.f;
213 out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
214 out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
215 out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
216 out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
217 out0.s5 = (w0.s2) / 4.f;
219 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 223 out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
224 out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
225 out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
226 out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
227 out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
228 out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
233 out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
234 out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
235 out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
236 out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
237 out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
238 out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
243 out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
244 out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
245 out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
246 out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
247 out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
248 out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
253 out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
254 out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
255 out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
256 out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
257 out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
258 out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
263 out5.s0 = (w2.s0) / 4.f;
264 out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
265 out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
266 out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
267 out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
269 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 271 int z = get_global_id(2);
272 int x0 = z / SRC_DIM_Z;
273 int y0 = z % SRC_DIM_Z;
276 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
281 *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
282 *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
283 *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
284 *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
285 *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
286 *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
288 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 289 *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
290 *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
291 *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
292 *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
293 *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
294 *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
295 *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
296 *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
297 *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
298 *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
299 *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
300 *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
301 *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
302 *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
303 *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
304 *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
305 *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
306 *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
307 *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
308 *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
309 *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
310 *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
311 *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
312 *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
313 *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
314 *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
315 *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
316 *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
317 *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
318 *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
319 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 348 __kernel
void winograd_filter_transform_4x4_3x3_nhwc(
354 const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
357 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 358 DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
359 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
360 DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
361 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 362 DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
363 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
364 DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
365 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 366 DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
367 DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
368 DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
369 DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
370 DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
371 DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
372 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 373 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 376 DATA_TYPE out00, out01, out02, out03, out04, out05;
377 out00 = (w00) / 16.f;
378 out01 = (-w00 - w01 - w02) / 24.f;
379 out02 = (-w00 + w01 - w02) / 24.f;
380 out03 = (w00 + 2.f * w01 + 4.f * w02) / 96.f;
381 out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f;
384 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 386 DATA_TYPE out10, out11, out12, out13, out14, out15;
387 out10 = (-w00 - w10 - w20) / 24.f;
388 out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
389 out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
390 out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
391 out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f;
392 out15 = (-w02 - w12 - w22) / 6.f;
395 DATA_TYPE out20, out21, out22, out23, out24, out25;
396 out20 = (-w00 + w10 - w20) / 24.f;
397 out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
398 out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
399 out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
400 out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f;
401 out25 = (-w02 + w12 - w22) / 6.f;
404 DATA_TYPE out30, out31, out32, out33, out34, out35;
405 out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
406 out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
407 out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
408 out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
409 out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f;
410 out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
413 DATA_TYPE out40, out41, out42, out43, out44, out45;
414 out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
415 out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
416 out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
417 out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
418 out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f;
419 out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
422 DATA_TYPE out50, out51, out52, out53, out54, out55;
424 out51 = (-w20 - w21 - w22) / 6.f;
425 out52 = (-w20 + w21 - w22) / 6.f;
426 out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f;
427 out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f;
429 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 431 int x0 = get_global_id(2);
432 int y0 = get_global_id(0);
435 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 *
sizeof(
DATA_TYPE) + y0 * dst_stride_y;
440 *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
441 *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
442 *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
443 *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
444 *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
445 *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
446 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 447 *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10;
448 *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11;
449 *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12;
450 *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13;
451 *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
452 *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
453 *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
454 *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
455 *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
456 *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
457 *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
458 *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
459 *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
460 *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
461 *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
462 *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
463 *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
464 *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
465 *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
466 *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
467 *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
468 *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
469 *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
470 *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
471 *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
472 *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
473 *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
474 *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
475 *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
476 *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
477 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 507 __kernel
void winograd_filter_transform_4x4_5x5_nchw(
516 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 518 w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
519 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
520 #elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 522 w00 = (
VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
523 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
524 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
525 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
526 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
527 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 529 w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
530 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
532 w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
533 DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
535 w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
536 DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
538 w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
539 DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
541 w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
542 DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
543 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 551 out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
552 out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
553 out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
554 out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
555 out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
556 out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
559 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 563 out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
564 out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
565 (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
566 out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
567 (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
568 out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
569 (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
570 out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
571 (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
572 out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
573 (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
574 out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
575 (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
576 out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
581 out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
582 out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
583 (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
584 out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
585 (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
586 out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
587 (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
588 out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
589 (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
590 out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
591 (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
592 out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
593 (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
594 out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
599 out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
600 out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
601 (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
602 (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
603 out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
604 (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
605 (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
606 out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
607 (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
608 (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
609 out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
610 (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
611 (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
612 out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
613 (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
614 (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
615 out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
616 (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
617 (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
618 out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
623 out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
624 out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
625 (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
626 (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
627 out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
628 (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
629 (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
630 out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
631 (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
632 (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
633 out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
634 (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
635 (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
636 out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
637 (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
638 (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
639 out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
640 (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
641 (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
642 out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
647 out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
648 out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
649 (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
650 (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
651 out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
652 (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
653 (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
654 out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
655 (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
656 (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
657 out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
658 (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
659 (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
660 out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
661 (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
662 (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
663 out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
664 (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
665 (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
666 out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
671 out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
672 out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
673 (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
674 (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
675 out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
676 (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
677 (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
678 out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
679 (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
680 (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
681 out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
682 (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
683 (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
684 out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
685 (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
686 (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
687 out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
688 (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
689 (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
690 out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
696 out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
697 out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
698 out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
699 out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
700 out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
701 out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
703 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 705 int z = get_global_id(2);
706 int x0 = z / SRC_DIM_Z;
707 int y0 = z % SRC_DIM_Z;
710 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 *
sizeof(
DATA_TYPE) + y0 * dst_stride_y;
713 *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
714 *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
715 *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
716 *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
717 *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
718 *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
719 *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
720 *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
722 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 723 *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
724 *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
725 *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
726 *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
727 *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
728 *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
729 *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
730 *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
731 *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
732 *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
733 *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
734 *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
735 *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
736 *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
737 *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
738 *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
739 *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
740 *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
741 *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
742 *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
743 *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
744 *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
745 *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
746 *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
747 *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
748 *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
749 *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
750 *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
751 *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
752 *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
753 *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
754 *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
755 *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
756 *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
757 *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
758 *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
759 *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
760 *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
761 *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
762 *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
763 *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
764 *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
765 *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
766 *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
767 *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
768 *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
769 *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
770 *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
771 *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
772 *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
773 *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
774 *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
775 *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
776 *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
777 *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
778 *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
779 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 808 __kernel
void winograd_filter_transform_4x4_5x5_nhwc(
814 const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) *
sizeof(
DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
816 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 818 DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
819 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
820 DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
821 DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
822 DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
823 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 825 DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
826 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
827 DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
828 DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
829 DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
830 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 832 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 833 DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
834 DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
835 DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
836 DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
837 DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
838 DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
839 DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
840 DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
841 DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
842 DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
843 DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
844 DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
845 DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
846 DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
847 DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
848 DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
849 DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
850 DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
851 DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
852 DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
853 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 859 out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
860 out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
861 out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
862 out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
863 out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
864 out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
867 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 871 out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
872 out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
873 out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
874 out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
875 (w04 + w14 + w24 + w34 + w44)) / 405.f;
876 out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
877 (w04 + w14 + w24 + w34 + w44)) / 405.f;
878 out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
879 (w04 + w14 + w24 + w34 + w44)) / 810.f;
880 out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
881 (w04 + w14 + w24 + w34 + w44)) / 810.f;
882 out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
887 out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
888 out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
889 out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
890 out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
891 (w04 - w14 + w24 - w34 + w44)) / 405.f;
892 out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
893 (w04 - w14 + w24 - w34 + w44)) / 405.f;
894 out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
895 (w04 - w14 + w24 - w34 + w44)) / 810.f;
896 out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
897 (w04 - w14 + w24 - w34 + w44)) / 810.f;
898 out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
903 out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
904 out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
905 (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
906 out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
907 (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
908 out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
909 * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
910 out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
911 * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
912 out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
913 (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
914 out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
915 (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
916 out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
921 out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
922 out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
923 (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
924 out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
925 (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
926 out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
927 * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
928 out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
929 * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
930 out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
931 (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
932 out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
933 (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
934 out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
939 out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
940 out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
941 (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
942 out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
943 (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
944 out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
945 * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
946 out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
947 * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
948 out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
949 (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
950 out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
951 (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
952 out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
957 out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
958 out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
959 (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
960 out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
961 (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
962 out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
963 * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
964 out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
965 * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
966 out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
967 (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
968 out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
969 (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
970 out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
976 out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
977 out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
978 out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
979 out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
980 out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
981 out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
983 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 985 int x0 = get_global_id(2);
986 int y0 = get_global_id(0);
989 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 *
sizeof(
DATA_TYPE) + y0 * dst_stride_y;
992 *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
993 *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
994 *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
995 *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
996 *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
997 *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
998 *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
999 *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
1001 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1002 *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
1003 *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
1004 *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
1005 *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
1006 *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
1007 *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
1008 *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
1009 *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
1010 *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
1011 *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
1012 *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
1013 *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
1014 *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
1015 *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
1016 *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
1017 *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
1018 *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
1019 *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
1020 *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
1021 *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
1022 *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
1023 *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
1024 *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
1025 *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
1026 *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
1027 *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
1028 *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
1029 *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
1030 *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
1031 *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
1032 *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
1033 *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
1034 *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
1035 *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
1036 *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
1037 *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
1038 *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
1039 *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
1040 *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
1041 *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
1042 *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
1043 *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
1044 *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
1045 *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
1046 *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
1047 *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
1048 *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
1049 *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
1050 *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
1051 *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
1052 *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
1053 *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
1054 *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
1055 *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
1056 *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
1057 *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
1058 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1086 __kernel
void winograd_filter_transform_2x2_7x7_nhwc(
1092 const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) *
sizeof(
DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
1094 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1096 DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
1097 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
1098 DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
1099 DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
1100 DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
1101 DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
1102 DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
1103 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1105 DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
1106 DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
1107 DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
1108 DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
1109 DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
1110 DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
1111 DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
1112 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1114 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1115 DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
1116 DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
1117 DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
1118 DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
1119 DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
1120 DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y));
1121 DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y));
1123 DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
1124 DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
1125 DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
1126 DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
1127 DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
1128 DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y));
1129 DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y));
1131 DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
1132 DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
1133 DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
1134 DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
1135 DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
1136 DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y));
1137 DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y));
1139 DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
1140 DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
1141 DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
1142 DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
1143 DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
1144 DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y));
1145 DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y));
1147 DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y));
1148 DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y));
1149 DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y));
1150 DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y));
1151 DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y));
1152 DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y));
1153 DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y));
1155 DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y));
1156 DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y));
1157 DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y));
1158 DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y));
1159 DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y));
1160 DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y));
1161 DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y));
1163 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1172 out0.s0 = -w00 / 36.0f;
1173 out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f;
1174 out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f;
1175 out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f;
1176 out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f;
1177 out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f;
1178 out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f;
1183 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1189 tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f;
1190 tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f;
1191 tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f;
1192 tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f;
1193 tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f;
1194 tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f;
1195 tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f;
1203 tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f;
1204 tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f;
1205 tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f;
1206 tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f;
1207 tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f;
1208 tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f;
1209 tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f;
1217 tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f;
1218 tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f;
1219 tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f;
1220 tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f;
1221 tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f;
1222 tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f;
1223 tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f;
1231 tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f;
1232 tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f;
1233 tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f;
1234 tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f;
1235 tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f;
1236 tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f;
1237 tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f;
1245 tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f;
1246 tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f;
1247 tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f;
1248 tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f;
1249 tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f;
1250 tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) / 720.f;
1251 tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f;
1259 tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f;
1260 tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f;
1261 tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f;
1262 tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f;
1263 tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f;
1264 tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f;
1265 tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f;
1283 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1285 int x0 = get_global_id(2);
1286 int y0 = get_global_id(0);
1289 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 *
sizeof(
DATA_TYPE) + y0 * dst_stride_y;
1292 *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
1293 *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
1294 *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
1295 *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
1296 *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
1297 *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
1298 *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
1299 *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
1301 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1302 *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
1303 *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
1304 *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
1305 *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
1306 *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
1307 *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
1308 *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
1309 *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
1310 *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
1311 *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
1312 *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
1313 *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
1314 *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
1315 *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
1316 *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
1317 *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
1318 *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
1319 *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
1320 *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
1321 *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
1322 *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
1323 *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
1324 *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
1325 *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
1326 *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
1327 *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
1328 *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
1329 *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
1330 *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
1331 *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
1332 *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
1333 *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
1334 *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
1335 *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
1336 *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
1337 *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
1338 *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
1339 *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
1340 *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
1341 *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
1342 *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
1343 *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
1344 *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
1345 *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
1346 *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
1347 *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
1348 *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
1349 *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
1350 *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
1351 *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
1352 *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
1353 *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
1354 *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
1355 *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
1356 *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
1357 *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
1358 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1360 #endif // defined(SRC_DIM_Z) 1362 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 1388 __kernel
void winograd_filter_transform_2x1_3x1_nchw(
1392 winograd_filter_transform_2x2_3x3_nchw(src_ptr,
1401 src_offset_first_element_in_bytes,
1409 dst_offset_first_element_in_bytes);
1437 __kernel
void winograd_filter_transform_4x1_3x1_nchw(
1441 winograd_filter_transform_4x4_3x3_nchw(src_ptr,
1450 src_offset_first_element_in_bytes,
1458 dst_offset_first_element_in_bytes);
1486 __kernel
void winograd_filter_transform_4x1_5x1_nchw(
1490 winograd_filter_transform_4x4_5x5_nchw(src_ptr,
1499 src_offset_first_element_in_bytes,
1507 dst_offset_first_element_in_bytes);
1535 __kernel
void winograd_filter_transform_4x1_3x1_nhwc(
1539 winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
1548 src_offset_first_element_in_bytes,
1556 dst_offset_first_element_in_bytes);
1584 __kernel
void winograd_filter_transform_4x1_5x1_nhwc(
1588 winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
1597 src_offset_first_element_in_bytes,
1605 dst_offset_first_element_in_bytes);
1633 __kernel
void winograd_filter_transform_2x1_7x1_nhwc(
1637 winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
1646 src_offset_first_element_in_bytes,
1654 dst_offset_first_element_in_bytes);
1656 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 1658 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) 1684 __kernel
void winograd_filter_transform_1x2_1x3_nchw(
1688 winograd_filter_transform_2x2_3x3_nchw(src_ptr,
1697 src_offset_first_element_in_bytes,
1705 dst_offset_first_element_in_bytes);
1733 __kernel
void winograd_filter_transform_1x4_1x3_nchw(
1737 winograd_filter_transform_4x4_3x3_nchw(src_ptr,
1746 src_offset_first_element_in_bytes,
1754 dst_offset_first_element_in_bytes);
1782 __kernel
void winograd_filter_transform_1x4_1x5_nchw(
1786 winograd_filter_transform_4x4_5x5_nchw(src_ptr,
1795 src_offset_first_element_in_bytes,
1803 dst_offset_first_element_in_bytes);
1831 __kernel
void winograd_filter_transform_1x4_1x3_nhwc(
1835 winograd_filter_transform_4x4_3x3_nhwc(src_ptr,
1844 src_offset_first_element_in_bytes,
1852 dst_offset_first_element_in_bytes);
1880 __kernel
void winograd_filter_transform_1x4_1x5_nhwc(
1884 winograd_filter_transform_4x4_5x5_nhwc(src_ptr,
1893 src_offset_first_element_in_bytes,
1901 dst_offset_first_element_in_bytes);
1929 __kernel
void winograd_filter_transform_1x2_1x7_nhwc(
1933 winograd_filter_transform_2x2_7x7_nhwc(src_ptr,
1942 src_offset_first_element_in_bytes,
1950 dst_offset_first_element_in_bytes);
1952 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
SimpleTensor< float > src
Structure to hold 4D tensor information.
__global const uchar * tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
Get the pointer position of a Tensor4D.
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)
#define TENSOR4D_DECLARATION(name)
#define TENSOR3D_DECLARATION(name)
#define VEC_DATA_TYPE(type, size)