26 #define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \ 28 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \ 29 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \ 30 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \ 31 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \ 32 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s0) && (z_cond))); \ 33 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s1) && (z_cond))); \ 36 #define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \ 38 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \ 39 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \ 40 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \ 41 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \ 42 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s0))); \ 43 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s1))); \ 46 #define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \ 48 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \ 49 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \ 50 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \ 51 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \ 52 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s4) && (z_cond))); \ 53 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s5) && (z_cond))); \ 54 basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s6) && (z_cond))); \ 55 basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s7) && (z_cond))); \ 58 #define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \ 60 basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \ 61 basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \ 62 basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \ 63 basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \ 64 basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s4))); \ 65 basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s5))); \ 66 basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s6))); \ 67 basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s7))); \ 70 #define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ 72 comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ 73 comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ 74 comm_fact.s2 = 2.5f * tmp.s3; \ 75 comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ 76 comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ 77 comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ 78 comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ 80 out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ 81 out.s1 = comm_fact.s0 + comm_fact.s1; \ 82 out.s2 = comm_fact.s0 - comm_fact.s1; \ 83 out.s3 = comm_fact.s3 + comm_fact.s4; \ 84 out.s4 = comm_fact.s4 - comm_fact.s3; \ 85 out.s5 = comm_fact.s5 + comm_fact.s6; \ 86 out.s6 = comm_fact.s5 - comm_fact.s6; \ 87 out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ 90 #define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ 92 comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ 93 comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ 94 comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ 95 comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ 96 comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ 97 comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ 98 out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ 99 out.s1 = comm_fact.s0 - comm_fact.s1; \ 100 out.s2 = comm_fact.s0 + comm_fact.s1; \ 101 out.s3 = comm_fact.s2 - comm_fact.s3; \ 102 out.s4 = comm_fact.s2 + comm_fact.s3; \ 103 out.s5 = comm_fact.s4 - comm_fact.s5; \ 104 out.s6 = comm_fact.s4 + comm_fact.s5; \ 105 out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ 108 #if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) 138 __kernel
void winograd_input_transform_2x2_3x3_stepz1_nchw(
144 const int x = get_global_id(0);
145 const int y = get_global_id(1);
146 #if defined(SRC_DEPTH) 147 const int z = get_global_id(2) % SRC_DEPTH;
148 const int b = get_global_id(2) / SRC_DEPTH;
150 const int z = get_global_id(2);
154 #if defined(SRC_DEPTH) 155 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
157 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
160 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(
DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
162 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 164 in_row0 = vload4(0, (__global
DATA_TYPE *)(src_addr));
165 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 168 *((__global
DATA_TYPE *)(src_addr + 1 * src_stride_y)),
169 *((__global
DATA_TYPE *)(src_addr + 2 * src_stride_y)),
170 *((__global
DATA_TYPE *)(src_addr + 3 * src_stride_y)));
171 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 173 in_row0 = vload4(0, (__global
DATA_TYPE *)(src_addr + 0 * src_stride_y));
175 in_row1 = vload4(0, (__global
DATA_TYPE *)(src_addr + 1 * src_stride_y));
177 in_row2 = vload4(0, (__global
DATA_TYPE *)(src_addr + 2 * src_stride_y));
179 in_row3 = vload4(0, (__global
DATA_TYPE *)(src_addr + 3 * src_stride_y));
180 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 185 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 187 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 194 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 196 tmp1 = in_row1 + in_row2;
198 tmp2 = in_row2 - in_row1;
200 tmp3 = in_row1 - in_row3;
216 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 218 #if defined(SRC_DEPTH) 219 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
221 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y;
224 *((__global
DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00;
225 *((__global
DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01;
226 *((__global
DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02;
227 *((__global
DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03;
229 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 230 *((__global
DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10;
231 *((__global
DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11;
232 *((__global
DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12;
233 *((__global
DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13;
234 *((__global
DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20;
235 *((__global
DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21;
236 *((__global
DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
237 *((__global
DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
238 *((__global
DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
239 *((__global
DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
240 *((__global
DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
241 *((__global
DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
242 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 274 __kernel
void winograd_input_transform_2x2_3x3_stepz2_nchw(
280 const int x = get_global_id(0);
281 const int y = get_global_id(1);
282 #if defined(SRC_DEPTH) 283 const int z = (get_global_id(2) * 2) % SRC_DEPTH;
284 const int b = (get_global_id(2) * 2) / SRC_DEPTH;
286 const int z = get_global_id(2) * 2;
290 #if defined(SRC_DEPTH) 291 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
293 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
295 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(
DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
297 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 299 in_row0 = vload4(0, (__global
DATA_TYPE *)(src_addr));
300 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 303 *((__global
DATA_TYPE *)(src_addr + 1 * src_stride_y)),
304 *((__global
DATA_TYPE *)(src_addr + 2 * src_stride_y)),
305 *((__global
DATA_TYPE *)(src_addr + 3 * src_stride_y)));
306 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 308 in_row0 = vload4(0, (__global
DATA_TYPE *)(src_addr + 0 * src_stride_y));
310 in_row1 = vload4(0, (__global
DATA_TYPE *)(src_addr + 1 * src_stride_y));
312 in_row2 = vload4(0, (__global
DATA_TYPE *)(src_addr + 2 * src_stride_y));
314 in_row3 = vload4(0, (__global
DATA_TYPE *)(src_addr + 3 * src_stride_y));
315 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 317 src_addr += src_stride_z;
318 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 320 in_row4 = vload4(0, (__global
DATA_TYPE *)(src_addr));
321 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 324 *((__global
DATA_TYPE *)(src_addr + 1 * src_stride_y)),
325 *((__global
DATA_TYPE *)(src_addr + 2 * src_stride_y)),
326 *((__global
DATA_TYPE *)(src_addr + 3 * src_stride_y)));
327 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 329 in_row4 = vload4(0, (__global
DATA_TYPE *)(src_addr + 0 * src_stride_y));
331 in_row5 = vload4(0, (__global
DATA_TYPE *)(src_addr + 1 * src_stride_y));
333 in_row6 = vload4(0, (__global
DATA_TYPE *)(src_addr + 2 * src_stride_y));
335 in_row7 = vload4(0, (__global
DATA_TYPE *)(src_addr + 3 * src_stride_y));
336 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 343 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 346 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 357 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 359 tmp1 = in_row1 + in_row2;
361 tmp2 = in_row2 - in_row1;
363 tmp3 = in_row1 - in_row3;
366 tmp5 = in_row5 + in_row6;
368 tmp6 = in_row6 - in_row5;
370 tmp7 = in_row5 - in_row7;
398 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 400 #if defined(SRC_DEPTH) 401 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
403 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y;
406 vstore2(out00, 0, (__global
DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
407 vstore2(out01, 0, (__global
DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
408 vstore2(out02, 0, (__global
DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
409 vstore2(out03, 0, (__global
DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
411 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 412 vstore2(out10, 0, (__global
DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
413 vstore2(out11, 0, (__global
DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
414 vstore2(out12, 0, (__global
DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
415 vstore2(out13, 0, (__global
DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
416 vstore2(out20, 0, (__global
DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
417 vstore2(out21, 0, (__global
DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
418 vstore2(out22, 0, (__global
DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
419 vstore2(out23, 0, (__global
DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
420 vstore2(out30, 0, (__global
DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
421 vstore2(out31, 0, (__global
DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
422 vstore2(out32, 0, (__global
DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
423 vstore2(out33, 0, (__global
DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
424 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 456 __kernel
void winograd_input_transform_4x4_3x3_stepz1_nchw(
462 const int x = get_global_id(0);
463 const int y = get_global_id(1);
464 #if defined(SRC_DEPTH) 465 const int z = get_global_id(2) % SRC_DEPTH;
466 const int b = get_global_id(2) / SRC_DEPTH;
468 const int z = get_global_id(2);
472 #if defined(SRC_DEPTH) 473 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
475 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
478 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(
DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
480 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 484 *((__global
DATA_TYPE *)(src_addr + 1 * src_stride_y)),
485 *((__global
DATA_TYPE *)(src_addr + 2 * src_stride_y)),
486 *((__global
DATA_TYPE *)(src_addr + 3 * src_stride_y)));
489 *((__global
DATA_TYPE *)(src_addr + 5 * src_stride_y)));
490 #else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 493 d00 = vload4(0, (__global
DATA_TYPE *)(src_addr + 0 * src_stride_y));
495 d01 = vload2(2, (__global
DATA_TYPE *)(src_addr + 0 * src_stride_y));
496 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 506 out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
507 out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
508 out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
509 out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
510 out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
511 out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
513 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 516 d40 = vload4(0, (__global
DATA_TYPE *)(src_addr + 4 * src_stride_y));
518 d41 = vload2(2, (__global
DATA_TYPE *)(src_addr + 4 * src_stride_y));
528 k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
529 k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
530 k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
531 k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
532 k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
533 k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
544 d20 = vload4(0, (__global
DATA_TYPE *)(src_addr + 2 * src_stride_y));
546 d21 = vload2(2, (__global
DATA_TYPE *)(src_addr + 2 * src_stride_y));
548 out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
549 out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
550 out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
551 out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
552 out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
553 out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
554 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 557 #if defined(SRC_DEPTH) 558 __global
DATA_TYPE *dst_addr = (__global
DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
560 __global
DATA_TYPE *dst_addr = (__global
DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
563 uint dst_plane_stride = dst_stride_z /
sizeof(
DATA_TYPE);
566 dst_addr += dst_plane_stride;
568 dst_addr += dst_plane_stride;
570 dst_addr += dst_plane_stride;
572 dst_addr += dst_plane_stride;
574 dst_addr += dst_plane_stride;
576 dst_addr += dst_plane_stride;
578 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 606 d10 = vload4(0, (__global
DATA_TYPE *)(src_addr + 1 * src_stride_y));
608 d11 = vload2(2, (__global
DATA_TYPE *)(src_addr + 1 * src_stride_y));
612 d30 = vload4(0, (__global
DATA_TYPE *)(src_addr + 3 * src_stride_y));
614 d31 = vload2(2, (__global
DATA_TYPE *)(src_addr + 3 * src_stride_y));
619 DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
620 DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
621 DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
622 DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
623 DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
624 DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
625 DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
626 DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
627 DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
628 DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
629 DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
630 DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
634 DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
637 DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
638 DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
641 DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
642 DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
645 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1;
647 out6 += part0 - part1;
648 out12 += part0 + part1;
649 out7 += part2 + part3 + part4 + part5;
650 out8 += part2 - part3 + part4 - part5;
651 out13 += part2 + part3 - part4 - part5;
652 out14 += part2 - part3 - part4 + part5;
653 out9 += part6 + part7 + part8 + part9;
654 out10 += part6 - part7 + part8 - part9;
655 out15 += part6 - part7 - part8 + part9;
656 out16 += part6 + part7 - part8 - part9;
657 out11 += part10 + part11;
658 out17 += part10 - part11;
660 out18 += part13 - part12;
661 out24 += part13 + part12;
662 out19 += part14 + part15 + part16 + part17;
663 out20 += part14 - part15 + part16 - part17;
664 out25 += part14 - part15 - part16 + part17;
665 out26 += part14 + part15 - part16 - part17;
666 out21 += part18 + part19 + part20 + part21;
667 out22 += part18 - part19 + part20 - part21;
668 out27 += part18 - part19 - part20 + part21;
669 out28 += part18 + part19 - part20 - part21;
670 out23 += part22 + part23;
671 out29 += part22 - part23;
674 dst_addr += dst_plane_stride;
676 dst_addr += dst_plane_stride;
678 dst_addr += dst_plane_stride;
680 dst_addr += dst_plane_stride;
682 dst_addr += dst_plane_stride;
684 dst_addr += dst_plane_stride;
686 dst_addr += dst_plane_stride;
688 dst_addr += dst_plane_stride;
690 dst_addr += dst_plane_stride;
692 dst_addr += dst_plane_stride;
694 dst_addr += dst_plane_stride;
696 dst_addr += dst_plane_stride;
699 dst_addr += dst_plane_stride;
701 dst_addr += dst_plane_stride;
703 dst_addr += dst_plane_stride;
705 dst_addr += dst_plane_stride;
707 dst_addr += dst_plane_stride;
709 dst_addr += dst_plane_stride;
711 dst_addr += dst_plane_stride;
713 dst_addr += dst_plane_stride;
715 dst_addr += dst_plane_stride;
717 dst_addr += dst_plane_stride;
719 dst_addr += dst_plane_stride;
721 dst_addr += dst_plane_stride;
725 d50 = vload4(0, (__global
DATA_TYPE *)(src_addr + 5 * src_stride_y));
727 d51 = vload2(2, (__global
DATA_TYPE *)(src_addr + 5 * src_stride_y));
730 out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
731 out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
732 out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
733 out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
734 out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
735 out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
738 dst_addr += dst_plane_stride;
740 dst_addr += dst_plane_stride;
742 dst_addr += dst_plane_stride;
744 dst_addr += dst_plane_stride;
746 dst_addr += dst_plane_stride;
748 dst_addr += dst_plane_stride;
749 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 781 __kernel
void winograd_input_transform_4x4_5x5_stepz1_nchw(
787 const int x = get_global_id(0);
788 const int y = get_global_id(1);
789 #if defined(SRC_DEPTH) 790 const int z = get_global_id(2) % SRC_DEPTH;
791 const int b = get_global_id(2) / SRC_DEPTH;
793 const int z = get_global_id(2);
797 #if defined(SRC_DEPTH) 798 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
800 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W *
sizeof(
DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
802 src_addr = src_addr - ((int)PAD_LEFT *
sizeof(
DATA_TYPE)) - ((
int)PAD_TOP * src_stride_y);
805 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 807 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) 809 *((__global
DATA_TYPE *)(src_addr + 1 * src_stride_y)),
810 *((__global
DATA_TYPE *)(src_addr + 2 * src_stride_y)),
811 *((__global
DATA_TYPE *)(src_addr + 3 * src_stride_y)),
812 *((__global
DATA_TYPE *)(src_addr + 4 * src_stride_y)),
813 *((__global
DATA_TYPE *)(src_addr + 5 * src_stride_y)),
814 *((__global
DATA_TYPE *)(src_addr + 6 * src_stride_y)),
815 *((__global
DATA_TYPE *)(src_addr + 7 * src_stride_y)));
816 #else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 825 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 833 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 834 comm_fact0 += in_row2 + in_row6 - (
DATA_TYPE)4.25 * in_row4;
838 comm_fact1 = in_row1 + in_row5 - (
DATA_TYPE)4.25 * in_row3;
857 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 865 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 867 out1, out2, out3, out4, out5, out6, out7;
876 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 879 #if defined(SRC_DEPTH) 880 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
882 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z *
sizeof(
DATA_TYPE) + (x + y * (
int)NUM_TILES_X) * dst_stride_y;
885 *((__global
DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
886 *((__global
DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
887 *((__global
DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
888 *((__global
DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
889 *((__global
DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
890 *((__global
DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
891 *((__global
DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
892 *((__global
DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
894 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 895 *((__global
DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
896 *((__global
DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
897 *((__global
DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
898 *((__global
DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
899 *((__global
DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
900 *((__global
DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
901 *((__global
DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
902 *((__global
DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
903 *((__global
DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
904 *((__global
DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
905 *((__global
DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
906 *((__global
DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
907 *((__global
DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
908 *((__global
DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
909 *((__global
DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
910 *((__global
DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
911 *((__global
DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
912 *((__global
DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
913 *((__global
DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
914 *((__global
DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
915 *((__global
DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
916 *((__global
DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
917 *((__global
DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
918 *((__global
DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
919 *((__global
DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
920 *((__global
DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
921 *((__global
DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
922 *((__global
DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
923 *((__global
DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
924 *((__global
DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
925 *((__global
DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
926 *((__global
DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
927 *((__global
DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
928 *((__global
DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
929 *((__global
DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
930 *((__global
DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
931 *((__global
DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
932 *((__global
DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
933 *((__global
DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
934 *((__global
DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
935 *((__global
DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
936 *((__global
DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
937 *((__global
DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
938 *((__global
DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
939 *((__global
DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
940 *((__global
DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
941 *((__global
DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
942 *((__global
DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
943 *((__global
DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
944 *((__global
DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
945 *((__global
DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
946 *((__global
DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
947 *((__global
DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
948 *((__global
DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
949 *((__global
DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
950 *((__global
DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
951 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 954 #if defined(SRC_DIM_1) && defined(SRC_DIM_2) 986 __kernel
void winograd_input_transform_4x4_3x3_stepz1_nhwc(
993 const int x = get_global_id(0);
995 const int y = get_global_id(1);
996 #if defined(NUM_TILES_Y) 998 const int z = get_global_id(2) % NUM_TILES_Y;
1000 const int b = get_global_id(2) / NUM_TILES_Y;
1001 #else // defined(NUM_TILES_Y) 1003 const int z = get_global_id(2);
1004 #endif // defined(NUM_TILES_Y) 1006 #if defined(NUM_TILES_Y) 1007 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + b * src_stride_w;
1008 #else // defined(NUM_TILES_Y) 1009 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE);
1010 #endif // defined(NUM_TILES_Y) 1013 int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
1014 int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT;
1015 int4 z_coord0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
1016 int2 z_coord1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
1019 int4 y_coord_valid0 =
clamp(y_coord0, (int4)0, (int4)((
int)SRC_DIM_1 - 1));
1020 int2 y_coord_valid1 =
clamp(y_coord1, (int2)0, (int2)((
int)SRC_DIM_1 - 1));
1021 int4 z_coord_valid0 =
clamp(z_coord0, (int4)0, (int4)((
int)SRC_DIM_2 - 1));
1022 int2 z_coord_valid1 =
clamp(z_coord1, (int2)0, (int2)((
int)SRC_DIM_2 - 1));
1025 int4 y_cond0 = y_coord_valid0 == y_coord0;
1026 int2 y_cond1 = y_coord_valid1 == y_coord1;
1027 int4 z_cond0 = z_coord_valid0 == z_coord0;
1028 int2 z_cond1 = z_coord_valid1 == z_coord1;
1030 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1032 DATA_TYPE d40 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1033 DATA_TYPE d41 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1034 DATA_TYPE d42 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1035 DATA_TYPE d43 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1036 DATA_TYPE d44 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1037 DATA_TYPE d45 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1048 k0 += 4.0f * d40 - 5.0f * d42;
1049 k1 += -4.0f * d41 - 4.0f * d42 + d43;
1050 k2 += 4.0f * d41 - 4.0f * d42 - d43;
1051 k3 += -2.0f * d41 + 2.0f * d43 - d42;
1052 k4 += 2.0f * d41 - 2.0f * d43 - d42;
1053 k5 += 4.0f * d41 - 5.0f * d43 + d45;
1054 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1056 #if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1057 DATA_TYPE d00 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1058 DATA_TYPE d01 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1059 DATA_TYPE d02 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1060 DATA_TYPE d03 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1061 DATA_TYPE d04 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1062 DATA_TYPE d05 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1066 #else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1067 DATA_TYPE d00 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1068 DATA_TYPE d01 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1069 DATA_TYPE d02 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1070 DATA_TYPE d03 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1071 DATA_TYPE d04 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid1.s0 * src_stride_z);
1072 DATA_TYPE d05 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1075 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1077 DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
1078 DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
1079 DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
1080 DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
1081 DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
1082 DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
1084 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1085 DATA_TYPE d20 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1086 DATA_TYPE d21 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1087 DATA_TYPE d22 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1088 DATA_TYPE d23 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1089 DATA_TYPE d24 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1090 DATA_TYPE d25 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1126 out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
1127 out1 += 20.0f * d21 + 20.0f * d22 - 5.0f * d23 - 5.0f * d24;
1128 out2 += -20.0f * d21 + 20.0f * d22 + 5.0f * d23 - 5.0f * d24;
1129 out3 += 10.0f * d21 + 5.0f * d22 - 10.0f * d23 - 5.0f * d24;
1130 out4 += -10.0f * d21 + 5.0f * d22 + 10.0f * d23 - 5.0f * d24;
1131 out5 += -20.0f * d21 + 25.0f * d23 - 5.0f * d25;
1132 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1135 #if defined(NUM_TILES_Y) 1136 __global
DATA_TYPE *dst_addr = (__global
DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
1137 #else // defined(NUM_TILES_Y) 1138 __global
DATA_TYPE *dst_addr = (__global
DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
1139 #endif // defined(NUM_TILES_Y) 1141 uint dst_plane_stride = dst_stride_z /
sizeof(
DATA_TYPE);
1143 *((__global
DATA_TYPE *)dst_addr) = out0;
1144 dst_addr += dst_plane_stride;
1145 *((__global
DATA_TYPE *)dst_addr) = out1;
1146 dst_addr += dst_plane_stride;
1147 *((__global
DATA_TYPE *)dst_addr) = out2;
1148 dst_addr += dst_plane_stride;
1149 *((__global
DATA_TYPE *)dst_addr) = out3;
1150 dst_addr += dst_plane_stride;
1151 *((__global
DATA_TYPE *)dst_addr) = out4;
1152 dst_addr += dst_plane_stride;
1153 *((__global
DATA_TYPE *)dst_addr) = out5;
1154 dst_addr += dst_plane_stride;
1156 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1157 DATA_TYPE d10 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1158 DATA_TYPE d11 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1159 DATA_TYPE d12 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1160 DATA_TYPE d13 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1161 DATA_TYPE d14 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1162 DATA_TYPE d15 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1164 DATA_TYPE d30 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1165 DATA_TYPE d31 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1166 DATA_TYPE d32 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1167 DATA_TYPE d33 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1168 DATA_TYPE d34 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1169 DATA_TYPE d35 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1177 DATA_TYPE part0 = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
1178 DATA_TYPE part1 = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
1179 DATA_TYPE part2 = 16.0f * d22 - 4.0f * d24;
1180 DATA_TYPE part3 = 16.0f * d21 - 4.0f * d23;
1181 DATA_TYPE part4 = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
1182 DATA_TYPE part5 = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
1183 DATA_TYPE part6 = 4.0f * d22 - 4.0f * d24;
1184 DATA_TYPE part7 = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
1185 DATA_TYPE part8 = 4.0f * d12 - 4.0f * d14 - d32 + d34;
1186 DATA_TYPE part9 = 8.0f * d21 - 8.0f * d23;
1187 DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
1188 DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
1192 DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
1195 DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
1196 DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
1199 DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
1200 DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
1203 DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35;
1205 out6 += part0 - part1;
1206 out12 += part0 + part1;
1207 out7 += part2 + part3 + part4 + part5;
1208 out8 += part2 - part3 + part4 - part5;
1209 out13 += part2 + part3 - part4 - part5;
1210 out14 += part2 - part3 - part4 + part5;
1211 out9 += part6 + part7 + part8 + part9;
1212 out10 += part6 - part7 + part8 - part9;
1213 out15 += part6 - part7 - part8 + part9;
1214 out16 += part6 + part7 - part8 - part9;
1215 out11 += part10 + part11;
1216 out17 += part10 - part11;
1218 out18 += part13 - part12;
1219 out24 += part13 + part12;
1220 out19 += part14 + part15 + part16 + part17;
1221 out20 += part14 - part15 + part16 - part17;
1222 out25 += part14 - part15 - part16 + part17;
1223 out26 += part14 + part15 - part16 - part17;
1224 out21 += part18 + part19 + part20 + part21;
1225 out22 += part18 - part19 + part20 - part21;
1226 out27 += part18 - part19 - part20 + part21;
1227 out28 += part18 + part19 - part20 - part21;
1228 out23 += part22 + part23;
1229 out29 += part22 - part23;
1231 *((__global
DATA_TYPE *)dst_addr) = out6;
1232 dst_addr += dst_plane_stride;
1233 *((__global
DATA_TYPE *)dst_addr) = out7;
1234 dst_addr += dst_plane_stride;
1235 *((__global
DATA_TYPE *)dst_addr) = out8;
1236 dst_addr += dst_plane_stride;
1237 *((__global
DATA_TYPE *)dst_addr) = out9;
1238 dst_addr += dst_plane_stride;
1239 *((__global
DATA_TYPE *)dst_addr) = out10;
1240 dst_addr += dst_plane_stride;
1241 *((__global
DATA_TYPE *)dst_addr) = out11;
1242 dst_addr += dst_plane_stride;
1243 *((__global
DATA_TYPE *)dst_addr) = out12;
1244 dst_addr += dst_plane_stride;
1245 *((__global
DATA_TYPE *)dst_addr) = out13;
1246 dst_addr += dst_plane_stride;
1247 *((__global
DATA_TYPE *)dst_addr) = out14;
1248 dst_addr += dst_plane_stride;
1249 *((__global
DATA_TYPE *)dst_addr) = out15;
1250 dst_addr += dst_plane_stride;
1251 *((__global
DATA_TYPE *)dst_addr) = out16;
1252 dst_addr += dst_plane_stride;
1253 *((__global
DATA_TYPE *)dst_addr) = out17;
1254 dst_addr += dst_plane_stride;
1256 *((__global
DATA_TYPE *)dst_addr) = out18;
1257 dst_addr += dst_plane_stride;
1258 *((__global
DATA_TYPE *)dst_addr) = out19;
1259 dst_addr += dst_plane_stride;
1260 *((__global
DATA_TYPE *)dst_addr) = out20;
1261 dst_addr += dst_plane_stride;
1262 *((__global
DATA_TYPE *)dst_addr) = out21;
1263 dst_addr += dst_plane_stride;
1264 *((__global
DATA_TYPE *)dst_addr) = out22;
1265 dst_addr += dst_plane_stride;
1266 *((__global
DATA_TYPE *)dst_addr) = out23;
1267 dst_addr += dst_plane_stride;
1268 *((__global
DATA_TYPE *)dst_addr) = out24;
1269 dst_addr += dst_plane_stride;
1270 *((__global
DATA_TYPE *)dst_addr) = out25;
1271 dst_addr += dst_plane_stride;
1272 *((__global
DATA_TYPE *)dst_addr) = out26;
1273 dst_addr += dst_plane_stride;
1274 *((__global
DATA_TYPE *)dst_addr) = out27;
1275 dst_addr += dst_plane_stride;
1276 *((__global
DATA_TYPE *)dst_addr) = out28;
1277 dst_addr += dst_plane_stride;
1278 *((__global
DATA_TYPE *)dst_addr) = out29;
1279 dst_addr += dst_plane_stride;
1282 DATA_TYPE d50 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1283 DATA_TYPE d51 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1284 DATA_TYPE d52 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1285 DATA_TYPE d53 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1286 DATA_TYPE d54 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1287 DATA_TYPE d55 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (
int)src_stride_y + z_coord_valid1.s1 * src_stride_z);
1292 out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;
1293 out1 = -16.0f * d11 - 16.0f * d12 + 4.0f * d13 + 20.0f * d31 + 20.0f * d32 - 5.0f * d33 - 4.0f * d51 - 4.0f * d52 + d53 + d54 + 4.0f * d14 - 5.0f * d34;
1294 out2 = 16.0f * d11 - 16.0f * d12 - 4.0f * d13 - 20.0f * d31 + 20.0f * d32 + 5.0f * d33 + 4.0f * d51 - 4.0f * d52 - d53 + d54 + 4.0f * d14 - 5.0f * d34;
1295 out3 = -8.0f * d11 - 4.0f * d12 + 8.0f * d13 + 10.0f * d31 - 10.0f * d33 + 5.0f * d32 - 2.0f * d51 + 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
1296 out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
1297 out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;
1299 *((__global
DATA_TYPE *)dst_addr) = out0;
1300 dst_addr += dst_plane_stride;
1301 *((__global
DATA_TYPE *)dst_addr) = out1;
1302 dst_addr += dst_plane_stride;
1303 *((__global
DATA_TYPE *)dst_addr) = out2;
1304 dst_addr += dst_plane_stride;
1305 *((__global
DATA_TYPE *)dst_addr) = out3;
1306 dst_addr += dst_plane_stride;
1307 *((__global
DATA_TYPE *)dst_addr) = out4;
1308 dst_addr += dst_plane_stride;
1309 *((__global
DATA_TYPE *)dst_addr) = out5;
1310 dst_addr += dst_plane_stride;
1311 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1345 __kernel
void winograd_input_transform_4x4_5x5_stepz1_nhwc(
1351 const int x = get_global_id(0);
1352 const int y = get_global_id(1);
1353 #if defined(NUM_TILES_Y) 1354 const int z = get_global_id(2) % NUM_TILES_Y;
1355 const int b = get_global_id(2) / NUM_TILES_Y;
1356 #else // defined(NUM_TILES_Y) 1357 const int z = get_global_id(2);
1358 #endif // defined(NUM_TILES_Y) 1361 #if defined(NUM_TILES_Y) 1362 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + b * src_stride_w;
1363 #else // defined(NUM_TILES_Y) 1364 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE);
1365 #endif // defined(NUM_TILES_Y) 1368 int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
1369 int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
1372 int8 y_coord_valid0 =
clamp(y_coord0, (int8)0, (int8)((
int)SRC_DIM_1 - 1));
1373 int8 z_coord_valid0 =
clamp(z_coord0, (int8)0, (int8)((
int)SRC_DIM_2 - 1));
1376 int8 y_cond0 = y_coord_valid0 == y_coord0;
1377 int8 z_cond0 = z_coord_valid0 == z_coord0;
1379 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 1384 in_row0.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1385 in_row0.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1386 in_row0.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1387 in_row0.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1388 in_row0.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1389 in_row0.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1390 in_row0.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1391 in_row0.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1406 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 1411 in_row0.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1412 in_row0.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1413 in_row0.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1414 in_row0.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1415 in_row0.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1416 in_row0.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1417 in_row0.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1418 in_row0.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1432 #else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 1434 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
1437 in_row0.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1438 in_row0.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1439 in_row0.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1440 in_row0.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1441 in_row0.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1442 in_row0.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1443 in_row0.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1444 in_row0.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1449 in_row1.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1450 in_row1.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1451 in_row1.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1452 in_row1.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1453 in_row1.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1454 in_row1.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1455 in_row1.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1456 in_row1.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1461 in_row2.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1462 in_row2.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1463 in_row2.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1464 in_row2.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1465 in_row2.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1466 in_row2.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1467 in_row2.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1468 in_row2.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1473 in_row3.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1474 in_row3.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1475 in_row3.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1476 in_row3.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1477 in_row3.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1478 in_row3.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1479 in_row3.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1480 in_row3.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1485 in_row4.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1486 in_row4.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1487 in_row4.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1488 in_row4.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1489 in_row4.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1490 in_row4.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1491 in_row4.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1492 in_row4.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1497 in_row5.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1498 in_row5.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1499 in_row5.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1500 in_row5.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1501 in_row5.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1502 in_row5.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1503 in_row5.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1504 in_row5.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1509 in_row6.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1510 in_row6.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1511 in_row6.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1512 in_row6.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1513 in_row6.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1514 in_row6.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1515 in_row6.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1516 in_row6.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1521 in_row7.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1522 in_row7.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1523 in_row7.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1524 in_row7.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1525 in_row7.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1526 in_row7.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1527 in_row7.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1528 in_row7.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1533 comm_fact0 = in_row2 + in_row6 - (
DATA_TYPE)4.25f * in_row4;
1535 comm_fact1 = in_row1 + in_row5 - (
DATA_TYPE)4.25f * in_row3;
1559 out0, out1, out2, out3, out4, out5, out6, out7;
1568 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1571 #if defined(NUM_TILES_Y) 1572 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + (y + z * (
int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
1574 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + (y + z * (
int)NUM_TILES_X) * dst_stride_y;
1577 *((__global
DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
1578 *((__global
DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
1579 *((__global
DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
1580 *((__global
DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
1581 *((__global
DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
1582 *((__global
DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
1583 *((__global
DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
1584 *((__global
DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
1586 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1587 *((__global
DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
1588 *((__global
DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
1589 *((__global
DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
1590 *((__global
DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
1591 *((__global
DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
1592 *((__global
DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
1593 *((__global
DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
1594 *((__global
DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
1595 *((__global
DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
1596 *((__global
DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
1597 *((__global
DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
1598 *((__global
DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
1599 *((__global
DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
1600 *((__global
DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
1601 *((__global
DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
1602 *((__global
DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
1603 *((__global
DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
1604 *((__global
DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
1605 *((__global
DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
1606 *((__global
DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
1607 *((__global
DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
1608 *((__global
DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
1609 *((__global
DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
1610 *((__global
DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
1611 *((__global
DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
1612 *((__global
DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
1613 *((__global
DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
1614 *((__global
DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
1615 *((__global
DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
1616 *((__global
DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
1617 *((__global
DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
1618 *((__global
DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
1619 *((__global
DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
1620 *((__global
DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
1621 *((__global
DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
1622 *((__global
DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
1623 *((__global
DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
1624 *((__global
DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
1625 *((__global
DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
1626 *((__global
DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
1627 *((__global
DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
1628 *((__global
DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
1629 *((__global
DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
1630 *((__global
DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
1631 *((__global
DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
1632 *((__global
DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
1633 *((__global
DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
1634 *((__global
DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
1635 *((__global
DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
1636 *((__global
DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
1637 *((__global
DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
1638 *((__global
DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
1639 *((__global
DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
1640 *((__global
DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
1641 *((__global
DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
1642 *((__global
DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
1643 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1677 __kernel
void winograd_input_transform_2x2_7x7_stepz1_nhwc(
1683 const int x = get_global_id(0);
1684 const int y = get_global_id(1);
1685 #if defined(NUM_TILES_Y) 1686 const int z = get_global_id(2) % NUM_TILES_Y;
1687 const int b = get_global_id(2) / NUM_TILES_Y;
1689 const int z = get_global_id(2);
1693 #if defined(NUM_TILES_Y) 1694 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + b * src_stride_w;
1696 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE);
1700 int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
1701 int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP;
1704 int8 y_coord_valid0 =
clamp(y_coord0, (int8)0, (int8)((
int)SRC_DIM_1 - 1));
1705 int8 z_coord_valid0 =
clamp(z_coord0, (int8)0, (int8)((
int)SRC_DIM_2 - 1));
1708 int8 y_cond0 = y_coord_valid0 == y_coord0;
1709 int8 z_cond0 = z_coord_valid0 == z_coord0;
1711 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 1716 in_row0.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1717 in_row0.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1718 in_row0.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1719 in_row0.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1720 in_row0.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1721 in_row0.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1722 in_row0.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1723 in_row0.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1738 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 1742 in_row0.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1743 in_row0.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1744 in_row0.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1745 in_row0.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1746 in_row0.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1747 in_row0.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1748 in_row0.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1749 in_row0.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1764 #else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 1766 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
1769 in_row0.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1770 in_row0.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1771 in_row0.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1772 in_row0.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1773 in_row0.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1774 in_row0.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1775 in_row0.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1776 in_row0.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s0 * src_stride_z);
1781 in_row1.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1782 in_row1.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1783 in_row1.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1784 in_row1.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1785 in_row1.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1786 in_row1.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1787 in_row1.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1788 in_row1.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s1 * src_stride_z);
1793 in_row2.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1794 in_row2.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1795 in_row2.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1796 in_row2.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1797 in_row2.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1798 in_row2.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1799 in_row2.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1800 in_row2.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s2 * src_stride_z);
1805 in_row3.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1806 in_row3.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1807 in_row3.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1808 in_row3.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1809 in_row3.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1810 in_row3.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1811 in_row3.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1812 in_row3.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s3 * src_stride_z);
1817 in_row4.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1818 in_row4.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1819 in_row4.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1820 in_row4.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1821 in_row4.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1822 in_row4.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1823 in_row4.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1824 in_row4.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s4 * src_stride_z);
1829 in_row5.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1830 in_row5.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1831 in_row5.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1832 in_row5.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1833 in_row5.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1834 in_row5.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1835 in_row5.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1836 in_row5.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s5 * src_stride_z);
1841 in_row6.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1842 in_row6.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1843 in_row6.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1844 in_row6.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1845 in_row6.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1846 in_row6.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1847 in_row6.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1848 in_row6.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s6 * src_stride_z);
1853 in_row7.s0 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1854 in_row7.s1 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1855 in_row7.s2 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1856 in_row7.s3 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1857 in_row7.s4 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1858 in_row7.s5 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1859 in_row7.s6 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1860 in_row7.s7 = *(__global
DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (
int)src_stride_y + z_coord_valid0.s7 * src_stride_z);
1888 out0, out1, out2, out3, out4, out5, out6, out7;
1899 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1902 #if defined(NUM_TILES_Y) 1903 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + (y + z * (
int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
1905 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(
DATA_TYPE) + (y + z * (
int)NUM_TILES_X) * dst_stride_y;
1908 *((__global
DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
1909 *((__global
DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
1910 *((__global
DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
1911 *((__global
DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
1912 *((__global
DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
1913 *((__global
DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
1914 *((__global
DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
1915 *((__global
DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
1917 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1918 *((__global
DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
1919 *((__global
DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
1920 *((__global
DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
1921 *((__global
DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
1922 *((__global
DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
1923 *((__global
DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
1924 *((__global
DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
1925 *((__global
DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
1926 *((__global
DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
1927 *((__global
DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
1928 *((__global
DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
1929 *((__global
DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
1930 *((__global
DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
1931 *((__global
DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
1932 *((__global
DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
1933 *((__global
DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
1934 *((__global
DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
1935 *((__global
DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
1936 *((__global
DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
1937 *((__global
DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
1938 *((__global
DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
1939 *((__global
DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
1940 *((__global
DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
1941 *((__global
DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
1942 *((__global
DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
1943 *((__global
DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
1944 *((__global
DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
1945 *((__global
DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
1946 *((__global
DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
1947 *((__global
DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
1948 *((__global
DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
1949 *((__global
DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
1950 *((__global
DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
1951 *((__global
DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
1952 *((__global
DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
1953 *((__global
DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
1954 *((__global
DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
1955 *((__global
DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
1956 *((__global
DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
1957 *((__global
DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
1958 *((__global
DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
1959 *((__global
DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
1960 *((__global
DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
1961 *((__global
DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
1962 *((__global
DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
1963 *((__global
DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
1964 *((__global
DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
1965 *((__global
DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
1966 *((__global
DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
1967 *((__global
DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
1968 *((__global
DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
1969 *((__global
DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
1970 *((__global
DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
1971 *((__global
DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
1972 *((__global
DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
1973 *((__global
DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
1974 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 1976 #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2) 1978 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 2007 __kernel
void winograd_input_transform_2x1_3x1_stepz1_nchw(
2013 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
2020 src_offset_first_element_in_bytes,
2028 dst_offset_first_element_in_bytes,
2061 __kernel
void winograd_input_transform_2x1_3x1_stepz2_nchw(
2067 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2074 src_offset_first_element_in_bytes,
2082 dst_offset_first_element_in_bytes,
2115 __kernel
void winograd_input_transform_4x1_3x1_stepz1_nchw(
2121 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2128 src_offset_first_element_in_bytes,
2136 dst_offset_first_element_in_bytes,
2169 __kernel
void winograd_input_transform_4x1_5x1_stepz1_nchw(
2175 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2182 src_offset_first_element_in_bytes,
2190 dst_offset_first_element_in_bytes,
2195 #if defined(SRC_DIM_1) && defined(SRC_DIM_2) 2226 __kernel
void winograd_input_transform_4x1_3x1_stepz1_nhwc(
2232 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
2239 src_offset_first_element_in_bytes,
2247 dst_offset_first_element_in_bytes,
2282 __kernel
void winograd_input_transform_4x1_5x1_stepz1_nhwc(
2288 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
2295 src_offset_first_element_in_bytes,
2303 dst_offset_first_element_in_bytes,
2338 __kernel
void winograd_input_transform_2x1_7x1_stepz1_nhwc(
2344 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
2351 src_offset_first_element_in_bytes,
2359 dst_offset_first_element_in_bytes,
2363 #endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2) 2364 #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) 2366 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 2395 __kernel
void winograd_input_transform_1x2_1x3_stepz1_nchw(
2401 winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
2408 src_offset_first_element_in_bytes,
2416 dst_offset_first_element_in_bytes,
2449 __kernel
void winograd_input_transform_1x2_1x3_stepz2_nchw(
2455 winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
2462 src_offset_first_element_in_bytes,
2470 dst_offset_first_element_in_bytes,
2503 __kernel
void winograd_input_transform_1x4_1x3_stepz1_nchw(
2509 winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
2516 src_offset_first_element_in_bytes,
2524 dst_offset_first_element_in_bytes,
2557 __kernel
void winograd_input_transform_1x4_1x5_stepz1_nchw(
2563 winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
2570 src_offset_first_element_in_bytes,
2578 dst_offset_first_element_in_bytes,
2583 #if defined(SRC_DIM_1) && defined(SRC_DIM_2) 2614 __kernel
void winograd_input_transform_1x4_1x3_stepz1_nhwc(
2620 winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
2627 src_offset_first_element_in_bytes,
2635 dst_offset_first_element_in_bytes,
2670 __kernel
void winograd_input_transform_1x4_1x5_stepz1_nhwc(
2676 winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
2683 src_offset_first_element_in_bytes,
2691 dst_offset_first_element_in_bytes,
2726 __kernel
void winograd_input_transform_1x2_1x7_stepz1_nhwc(
2732 winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
2739 src_offset_first_element_in_bytes,
2747 dst_offset_first_element_in_bytes,
2751 #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2) 2752 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) 2753 #endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
SimpleTensor< float > src
DataType clamp(const DataType &n, const DataType &lower=std::numeric_limits< RangeType >::lowest(), const DataType &upper=std::numeric_limits< RangeType >::max())
Performs clamping among a lower and upper value.
#define TENSOR3D_DECLARATION(name)
#define VEC_DATA_TYPE(type, size)