27 #define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
29 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
30 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
31 comm_fact.s2 = 2.5f * tmp.s3; \
32 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
33 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
34 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
35 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
37 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
38 out.s1 = comm_fact.s0 + comm_fact.s1; \
39 out.s2 = comm_fact.s0 - comm_fact.s1; \
40 out.s3 = comm_fact.s3 + comm_fact.s4; \
41 out.s4 = comm_fact.s4 - comm_fact.s3; \
42 out.s5 = comm_fact.s5 + comm_fact.s6; \
43 out.s6 = comm_fact.s5 - comm_fact.s6; \
44 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
47 #define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
49 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
50 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
51 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
52 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
53 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
54 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
55 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
56 out.s1 = comm_fact.s0 - comm_fact.s1; \
57 out.s2 = comm_fact.s0 + comm_fact.s1; \
58 out.s3 = comm_fact.s2 - comm_fact.s3; \
59 out.s4 = comm_fact.s2 + comm_fact.s3; \
60 out.s5 = comm_fact.s4 - comm_fact.s5; \
61 out.s6 = comm_fact.s4 + comm_fact.s5; \
62 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
65 #if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
95 __kernel
void winograd_input_transform_2x2_3x3_stepz1_nchw(
101 const int x = get_global_id(0);
102 const int y = get_global_id(1);
103 #if defined(SRC_DEPTH)
104 const int z = get_global_id(2) % SRC_DEPTH;
105 const int b = get_global_id(2) / SRC_DEPTH;
107 const int z = get_global_id(2);
111 #if defined(SRC_DEPTH)
112 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z +
b * src_stride_w;
114 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
117 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
119 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
121 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
122 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
124 in_row0 = (
VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
125 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
126 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
127 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
128 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
130 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
132 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
134 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
136 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
137 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
142 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
144 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
146 DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
147 DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
148 DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
149 DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
151 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
153 tmp1 = in_row1 + in_row2;
155 tmp2 = in_row2 - in_row1;
157 tmp3 = in_row1 - in_row3;
159 DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
160 DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
161 DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
162 DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
164 DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
165 DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
166 DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
167 DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
169 DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
170 DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
171 DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
172 DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
173 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
175 #if defined(SRC_DEPTH)
176 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y +
b * dst_stride_w;
178 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y;
181 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00;
182 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01;
183 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02;
184 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03;
186 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
187 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
188 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
189 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
190 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
191 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
192 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
193 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
194 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
195 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
196 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
197 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
198 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
199 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
231 __kernel
void winograd_input_transform_2x2_3x3_stepz2_nchw(
237 const int x = get_global_id(0);
238 const int y = get_global_id(1);
239 #if defined(SRC_DEPTH)
240 const int z = (get_global_id(2) * 2) % SRC_DEPTH;
241 const int b = (get_global_id(2) * 2) / SRC_DEPTH;
243 const int z = get_global_id(2) * 2;
247 #if defined(SRC_DEPTH)
248 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z +
b * src_stride_w;
250 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
252 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
254 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
256 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
257 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
259 in_row0 = (
VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
260 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
261 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
262 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
263 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
265 in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
267 in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
269 in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
271 in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
272 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
274 src_addr += src_stride_z;
275 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
277 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
278 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
280 in_row4 = (
VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
281 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
282 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
283 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
284 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
286 in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
288 in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
290 in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
292 in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
293 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
300 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
303 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
306 out00 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
308 out01 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
310 out02 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
312 out03 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
314 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
316 tmp1 = in_row1 + in_row2;
318 tmp2 = in_row2 - in_row1;
320 tmp3 = in_row1 - in_row3;
323 tmp5 = in_row5 + in_row6;
325 tmp6 = in_row6 - in_row5;
327 tmp7 = in_row5 - in_row7;
330 out10 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
332 out11 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
334 out12 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
336 out13 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
339 out20 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
341 out21 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
343 out22 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
345 out23 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
348 out30 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
350 out31 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
352 out32 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
354 out33 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
355 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
357 #if defined(SRC_DEPTH)
358 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y +
b * dst_stride_w;
360 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y;
363 vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
364 vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
365 vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
366 vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
368 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
369 vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
370 vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
371 vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
372 vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
373 vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
374 vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
375 vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
376 vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
377 vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
378 vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
379 vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
380 vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
381 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
413 __kernel
void winograd_input_transform_4x4_3x3_stepz1_nchw(
419 const int x = get_global_id(0);
420 const int y = get_global_id(1);
421 #if defined(SRC_DEPTH)
422 const int z = get_global_id(2) % SRC_DEPTH;
423 const int b = get_global_id(2) / SRC_DEPTH;
425 const int z = get_global_id(2);
429 #if defined(SRC_DEPTH)
430 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z +
b * src_stride_w;
432 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
435 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
437 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
440 d00 = (
VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
441 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
442 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
443 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
445 d01 = (
VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
446 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
447 #else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
450 d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
452 d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
453 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
455 DATA_TYPE out0 = 0.0f;
456 DATA_TYPE out1 = 0.0f;
457 DATA_TYPE out2 = 0.0f;
458 DATA_TYPE out3 = 0.0f;
459 DATA_TYPE out4 = 0.0f;
460 DATA_TYPE out5 = 0.0f;
463 out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
464 out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
465 out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
466 out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
467 out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
468 out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
470 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
473 d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
475 d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
478 DATA_TYPE k0 = d41.s0;
479 DATA_TYPE k1 = d41.s0;
480 DATA_TYPE k2 = d41.s0;
481 DATA_TYPE k3 = d41.s0;
482 DATA_TYPE k4 = d41.s0;
485 k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
486 k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
487 k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
488 k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
489 k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
490 k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
501 d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
503 d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
505 out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
506 out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
507 out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
508 out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
509 out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
510 out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
511 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
514 #if defined(SRC_DEPTH)
515 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y +
b * dst_stride_w);
517 __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
520 uint dst_plane_stride = dst_stride_z /
sizeof(DATA_TYPE);
523 dst_addr += dst_plane_stride;
525 dst_addr += dst_plane_stride;
527 dst_addr += dst_plane_stride;
529 dst_addr += dst_plane_stride;
531 dst_addr += dst_plane_stride;
533 dst_addr += dst_plane_stride;
535 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
540 DATA_TYPE out10 = k4;
541 DATA_TYPE out11 = k5;
542 DATA_TYPE out12 = k0;
543 DATA_TYPE out13 = k1;
544 DATA_TYPE out14 = k2;
545 DATA_TYPE out15 = k3;
546 DATA_TYPE out16 = k4;
547 DATA_TYPE out17 = k5;
548 DATA_TYPE out18 = k0;
549 DATA_TYPE out19 = k1;
550 DATA_TYPE out20 = k2;
551 DATA_TYPE out21 = k3;
552 DATA_TYPE out22 = k4;
553 DATA_TYPE out23 = k5;
554 DATA_TYPE out24 = k0;
555 DATA_TYPE out25 = k1;
556 DATA_TYPE out26 = k2;
557 DATA_TYPE out27 = k3;
558 DATA_TYPE out28 = k4;
559 DATA_TYPE out29 = k5;
563 d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
565 d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
569 d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
571 d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
576 DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
577 DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
578 DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
579 DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
580 DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
581 DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
582 DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
583 DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
584 DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
585 DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
586 DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
587 DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
591 DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
592 DATA_TYPE part13 = part0 * 0.25f;
593 DATA_TYPE part14 = part2 * 0.25f;
594 DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
595 DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
596 DATA_TYPE part17 = part3 * 0.25f;
597 DATA_TYPE part18 = part6 * 0.25f;
598 DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
599 DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
600 DATA_TYPE part21 = part9 * 0.25f;
601 DATA_TYPE part22 = part10 * 0.25f;
602 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1;
604 out6 += part0 - part1;
605 out12 += part0 + part1;
606 out7 += part2 + part3 + part4 + part5;
607 out8 += part2 - part3 + part4 - part5;
608 out13 += part2 + part3 - part4 - part5;
609 out14 += part2 - part3 - part4 + part5;
610 out9 += part6 + part7 + part8 + part9;
611 out10 += part6 - part7 + part8 - part9;
612 out15 += part6 - part7 - part8 + part9;
613 out16 += part6 + part7 - part8 - part9;
614 out11 += part10 + part11;
615 out17 += part10 - part11;
617 out18 += part13 - part12;
618 out24 += part13 + part12;
619 out19 += part14 + part15 + part16 + part17;
620 out20 += part14 - part15 + part16 - part17;
621 out25 += part14 - part15 - part16 + part17;
622 out26 += part14 + part15 - part16 - part17;
623 out21 += part18 + part19 + part20 + part21;
624 out22 += part18 - part19 + part20 - part21;
625 out27 += part18 - part19 - part20 + part21;
626 out28 += part18 + part19 - part20 - part21;
627 out23 += part22 + part23;
628 out29 += part22 - part23;
631 dst_addr += dst_plane_stride;
633 dst_addr += dst_plane_stride;
635 dst_addr += dst_plane_stride;
637 dst_addr += dst_plane_stride;
639 dst_addr += dst_plane_stride;
641 dst_addr += dst_plane_stride;
643 dst_addr += dst_plane_stride;
645 dst_addr += dst_plane_stride;
647 dst_addr += dst_plane_stride;
649 dst_addr += dst_plane_stride;
651 dst_addr += dst_plane_stride;
653 dst_addr += dst_plane_stride;
656 dst_addr += dst_plane_stride;
658 dst_addr += dst_plane_stride;
660 dst_addr += dst_plane_stride;
662 dst_addr += dst_plane_stride;
664 dst_addr += dst_plane_stride;
666 dst_addr += dst_plane_stride;
668 dst_addr += dst_plane_stride;
670 dst_addr += dst_plane_stride;
672 dst_addr += dst_plane_stride;
674 dst_addr += dst_plane_stride;
676 dst_addr += dst_plane_stride;
678 dst_addr += dst_plane_stride;
682 d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
684 d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
687 out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
688 out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
689 out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
690 out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
691 out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
692 out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
695 dst_addr += dst_plane_stride;
697 dst_addr += dst_plane_stride;
699 dst_addr += dst_plane_stride;
701 dst_addr += dst_plane_stride;
703 dst_addr += dst_plane_stride;
705 dst_addr += dst_plane_stride;
706 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
738 __kernel
void winograd_input_transform_4x4_5x5_stepz1_nchw(
744 const int x = get_global_id(0);
745 const int y = get_global_id(1);
746 #if defined(SRC_DEPTH)
747 const int z = get_global_id(2) % SRC_DEPTH;
748 const int b = get_global_id(2) / SRC_DEPTH;
750 const int z = get_global_id(2);
754 #if defined(SRC_DEPTH)
755 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z +
b * src_stride_w;
757 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
759 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
762 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
763 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
764 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
765 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (
VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
766 *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
767 *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
768 *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
769 *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
770 *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
771 *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
772 *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
773 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
774 const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
775 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
776 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
777 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
778 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
779 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
780 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
781 const
VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
782 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
790 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
791 comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
792 tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
795 comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
797 comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
799 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
800 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
802 comm_fact0 = (DATA_TYPE)2.5f * in_row3;
803 comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
805 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
806 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
808 comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
809 comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
811 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
812 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
813 const
VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
814 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
822 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
824 out1, out2, out3, out4, out5, out6, out7;
833 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
836 #if defined(SRC_DEPTH)
837 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y +
b * dst_stride_w;
839 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y;
842 *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
843 *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
844 *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
845 *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
846 *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
847 *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
848 *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
849 *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
851 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
852 *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
853 *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
854 *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
855 *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
856 *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
857 *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
858 *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
859 *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
860 *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
861 *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
862 *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
863 *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
864 *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
865 *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
866 *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
867 *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
868 *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
869 *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
870 *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
871 *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
872 *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
873 *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
874 *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
875 *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
876 *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
877 *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
878 *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
879 *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
880 *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
881 *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
882 *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
883 *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
884 *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
885 *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
886 *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
887 *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
888 *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
889 *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
890 *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
891 *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
892 *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
893 *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
894 *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
895 *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
896 *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
897 *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
898 *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
899 *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
900 *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
901 *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
902 *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
903 *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
904 *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
905 *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
906 *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
907 *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
908 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
911 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
940 __kernel
void winograd_input_transform_2x1_3x1_stepz1_nchw(
946 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
953 src_offset_first_element_in_bytes,
961 dst_offset_first_element_in_bytes,
994 __kernel
void winograd_input_transform_2x1_3x1_stepz2_nchw(
1000 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
1007 src_offset_first_element_in_bytes,
1015 dst_offset_first_element_in_bytes,
1048 __kernel
void winograd_input_transform_4x1_3x1_stepz1_nchw(
1054 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
1061 src_offset_first_element_in_bytes,
1069 dst_offset_first_element_in_bytes,
1102 __kernel
void winograd_input_transform_4x1_5x1_stepz1_nchw(
1108 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
1115 src_offset_first_element_in_bytes,
1123 dst_offset_first_element_in_bytes,
1127 #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
1129 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1158 __kernel
void winograd_input_transform_1x2_1x3_stepz1_nchw(
1164 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
1171 src_offset_first_element_in_bytes,
1179 dst_offset_first_element_in_bytes,
1212 __kernel
void winograd_input_transform_1x2_1x3_stepz2_nchw(
1218 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
1225 src_offset_first_element_in_bytes,
1233 dst_offset_first_element_in_bytes,
1266 __kernel
void winograd_input_transform_1x4_1x3_stepz1_nchw(
1272 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
1279 src_offset_first_element_in_bytes,
1287 dst_offset_first_element_in_bytes,
1320 __kernel
void winograd_input_transform_1x4_1x5_stepz1_nchw(
1326 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
1333 src_offset_first_element_in_bytes,
1341 dst_offset_first_element_in_bytes,
1345 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
1346 #endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)