#if defined(DATA_TYPE)
/** Computes and applies the twiddle factor to a given complex input. */
#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input)  \
    {                                              \
        VEC_DATA_TYPE(DATA_TYPE, 2)                \
        w, tmp;                                    \
        w.x   = cos(phi);                          \
        w.y   = sin(phi);                          \
        tmp.x = (w.x * input.x) - (w.y * input.y); \
        tmp.y = (w.x * input.y) + (w.y * input.x); \
        input = tmp;                               \
    }
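// TWIDDLE_FACTOR_MULTIPLICATION computes the complex product input * w with
// w = (cos(phi), sin(phi)) = e^(i * phi); the sign of phi comes from the
// caller (see exp_const in the per-stage kernels below), so the same macro
// serves forward and inverse transforms.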
/** Computes the radix-2 butterfly unit. */
#define DFT_2(c0, c1)               \
    {                               \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        v0;                         \
        v0 = c0;                    \
        c0 = v0 + c1;               \
        c1 = v0 - c1;               \
    }
// Radix-3 butterfly unit factor: sqrt(3) / 2
#define SQRT3DIV2 0.86602540378443f
/** Computes the radix-3 butterfly unit. */
#define DFT_3(c0, c1, c2)                             \
    {                                                 \
        VEC_DATA_TYPE(DATA_TYPE, 2)                   \
        v0 = c1 + c2;                                 \
        VEC_DATA_TYPE(DATA_TYPE, 2)                   \
        v1 = c1 - c2;                                 \
        c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
        c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
        c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
        c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
        c0   = c0 + v0;                               \
    }
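// DFT_3 expands X_k = x0 + x1 * w^k + x2 * w^(2k) with w = e^(-2 * pi * i / 3),
// whose real and imaginary parts are -1/2 and -SQRT3DIV2; that is where the
// -0.5f and SQRT3DIV2 terms above come from.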
/** Computes the radix-4 butterfly unit. */
#define DFT_4(c0, c1, c2, c3)       \
    {                               \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        v0, v1, v2, v3;             \
        v0   = c0 + c2;             \
        v1   = c1 + c3;             \
        v2   = c0 - c2;             \
        v3.x = c1.y - c3.y;         \
        v3.y = c3.x - c1.x;         \
        c0   = v0 + v1;             \
        c2   = v0 - v1;             \
        c1   = v2 + v3;             \
        c3   = v2 - v3;             \
    }
#define W5_A (DATA_TYPE)0.30901699437494f
#define W5_B (DATA_TYPE)0.95105651629515f
#define W5_C (DATA_TYPE)0.80901699437494f
#define W5_D (DATA_TYPE)0.58778525229247f
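// W5_A/W5_B and W5_C/W5_D are the magnitudes of the cos/sin of 2 * pi / 5 and
// 4 * pi / 5 respectively, i.e. the components of the primitive 5th roots of
// unity used by DFT_5 below.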
/** Computes the radix-5 butterfly unit. */
#define DFT_5(c0, c1, c2, c3, c4)                                  \
    {                                                              \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                \
        v0, v1, v2, v3, v4;                                        \
        v0 = c0;                                                   \
        v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3);                  \
        v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3);                  \
        v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3);                  \
        v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3);                  \
        c0 = v0 + c1 + c2 + c3 + c4;                               \
        c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
        c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \
        c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \
        c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
    }
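// In the recombination above, (v.y, -v.x) is v multiplied by -i (a -90 degree
// rotation) and (-v.y, v.x) is v multiplied by +i, so the c1/c4 and c2/c3
// output pairs reuse the same intermediates, mirroring the conjugate symmetry
// of the 5th roots of unity.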
#define W7_A (DATA_TYPE)0.62348980185873f
#define W7_B (DATA_TYPE)0.78183148246802f
#define W7_C (DATA_TYPE)0.22252093395631f
#define W7_D (DATA_TYPE)0.97492791218182f
#define W7_E (DATA_TYPE)0.90096886790241f
#define W7_F (DATA_TYPE)0.43388373911755f
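// W7_A..W7_F are the magnitudes of the cos/sin of 2 * pi / 7, 4 * pi / 7 and
// 6 * pi / 7, i.e. the components of the primitive 7th roots of unity used by
// DFT_7 below.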
/** Computes the radix-7 butterfly unit. */
#define DFT_7(c0, c1, c2, c3, c4, c5, c6)                            \
    {                                                                \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                  \
        v0, v1, v2, v3, v4, v5, v6;                                  \
        v0 = c0;                                                     \
        v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
        v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
        v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
        v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
        v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
        v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
        c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6;                       \
        c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x);   \
        c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x);   \
        c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x);   \
        c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x);   \
        c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x);   \
        c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x);   \
    }
// 1 / sqrt(2), used by the radix-8 butterfly unit
#define M_SQRT1_2_F (DATA_TYPE)0.707106781f

/** Computes the radix-8 butterfly unit. */
#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
    {                                         \
        VEC_DATA_TYPE(DATA_TYPE, 2)           \
        v0, v1, v2, v3, v4, v5, v6, v7;       \
        VEC_DATA_TYPE(DATA_TYPE, 2)           \
        s0, s1, s2, s3, s4, s5, s6, s7;       \
        VEC_DATA_TYPE(DATA_TYPE, 2)           \
        t0, t1, t2;                           \
        v0 = c0 + c4;                         \
        v1 = c1 + c5;                         \
        v2 = c2 + c6;                         \
        v3 = c3 + c7;                         \
        v4 = c0 - c4;                         \
        v5 = c1 - c5;                         \
        v6 = c2 - c6;                         \
        v7 = c3 - c7;                         \
        s0 = v0 + v2;                         \
        s1 = v1 + v3;                         \
        s2 = v0 - v2;                         \
        s3 = v1 - v3;                         \
        s4.x = v4.x - v6.y;                   \
        s4.y = v4.y + v6.x;                   \
        s5.x = v5.x - v7.y;                   \
        s5.y = v5.y + v7.x;                   \
        s6.x = v4.x + v6.y;                   \
        s6.y = v4.y - v6.x;                   \
        s7.x = v5.x + v7.y;                   \
        s7.y = v5.y - v7.x;                   \
        t0.x = -s3.y;                         \
        t0.y = s3.x;                          \
        t1.x = M_SQRT1_2_F * (s5.x - s5.y);   \
        t1.y = M_SQRT1_2_F * (s5.x + s5.y);   \
        t2.x = -M_SQRT1_2_F * (s7.x + s7.y);  \
        t2.y = M_SQRT1_2_F * (s7.x - s7.y);   \
        c0 = s0 + s1;                         \
        c1 = s6 - t2;                         \
        c2 = s2 - t0;                         \
        c3 = s4 - t1;                         \
        c4 = s0 - s1;                         \
        c5 = s6 + t2;                         \
        c6 = s2 + t0;                         \
        c7 = s4 + t1;                         \
    }
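// Structure of DFT_8: v0..v3 / v4..v7 are the sums / differences of inputs
// four apart (the even/odd split), s0..s3 feed the even outputs as a radix-4
// pass, s4..s7 pre-rotate the odd half by +-i, and t0..t2 apply the remaining
// eighth-root-of-unity rotations (the 1/sqrt(2) factors) before the final
// +- combines.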
/** Computes the first stage of a radix-2 DFT on axis 0. */
__kernel void fft_radix_2_first_stage_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load two complex input values
    VEC_DATA_TYPE(DATA_TYPE, 4)
    data = vload4(0, (__global DATA_TYPE *)input.ptr);

    // Compute DFT N = 2
    DFT_2(data.s01, data.s23);

    // Store two complex output values
    vstore4(data, 0, (__global DATA_TYPE *)output.ptr);
}
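// All *_first_stage_* kernels follow the pattern above: each work-item loads
// N consecutive complex values (interleaved real/imaginary floats) along the
// transform axis, applies the matching DFT_N butterfly and stores the N
// results. The first stage has butterfly span Nx = 1, so every twiddle factor
// equals 1 and no twiddle multiplication is needed.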
/** Computes the first stage of a radix-2 DFT on axis 1. */
__kernel void fft_radix_2_first_stage_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load two complex input values, one per row
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));

    // Compute DFT N = 2
    DFT_2(data1, data2);

    // Store two complex output values
    vstore2(data1, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
}
/** Computes the first stage of a radix-3 DFT on axis 0. */
__kernel void fft_radix_3_first_stage_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load three complex input values
    VEC_DATA_TYPE(DATA_TYPE, 4)
    data0 = vload4(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0));

    // Compute DFT N = 3
    DFT_3(data0.s01, data0.s23, data1.s01);

    // Store three complex output values
    vstore4(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0));
}
/** Computes the first stage of a radix-3 DFT on axis 1. */
__kernel void fft_radix_3_first_stage_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load three complex input values, one per row
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));

    // Compute DFT N = 3
    DFT_3(data0, data1, data2);

    // Store three complex output values
    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
}
/** Computes the first stage of a radix-4 DFT on axis 0. */
__kernel void fft_radix_4_first_stage_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load four complex input values
    VEC_DATA_TYPE(DATA_TYPE, 8)
    data = vload8(0, (__global DATA_TYPE *)input.ptr);

    // Compute DFT N = 4
    DFT_4(data.s01, data.s23, data.s45, data.s67);

    // Store four complex output values
    vstore8(data, 0, (__global DATA_TYPE *)output.ptr);
}
/** Computes the first stage of a radix-4 DFT on axis 1. */
__kernel void fft_radix_4_first_stage_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load four complex input values, one per row
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));

    // Compute DFT N = 4
    DFT_4(data0, data1, data2, data3);

    // Store four complex output values
    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
}
/** Computes the first stage of a radix-5 DFT on axis 0. */
__kernel void fft_radix_5_first_stage_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load five complex input values
    VEC_DATA_TYPE(DATA_TYPE, 8)
    data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));

    // Compute DFT N = 5
    DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);

    // Store five complex output values
    vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
}
/** Computes the first stage of a radix-5 DFT on axis 1. */
__kernel void fft_radix_5_first_stage_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load five complex input values, one per row
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));

    // Compute DFT N = 5
    DFT_5(data0, data1, data2, data3, data4);

    // Store five complex output values
    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
    vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
}
/** Computes the first stage of a radix-7 DFT on axis 0. */
__kernel void fft_radix_7_first_stage_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load seven complex input values
    VEC_DATA_TYPE(DATA_TYPE, 8)
    data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0));

    // Compute DFT N = 7
    DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);

    // Store seven complex output values
    vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0));
}
/** Computes the first stage of a radix-7 DFT on axis 1. */
__kernel void fft_radix_7_first_stage_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load seven complex input values, one per row
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));

    // Compute DFT N = 7
    DFT_7(data0, data1, data2, data3, data4, data5, data6);

    // Store seven complex output values
    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
    vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
    vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
    vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
}
/** Computes the first stage of a radix-8 DFT on axis 0. */
__kernel void fft_radix_8_first_stage_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load eight complex input values
    VEC_DATA_TYPE(DATA_TYPE, 16)
    data = vload16(0, (__global DATA_TYPE *)input.ptr);

    // Compute DFT N = 8
    DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);

    // Store eight complex output values
    vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
}
/** Computes the first stage of a radix-8 DFT on axis 1. */
__kernel void fft_radix_8_first_stage_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
{
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);

    // Load eight complex input values, one per row
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0));

    // Compute DFT N = 8
    DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);

    // Store eight complex output values
    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
    vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
    vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
    vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
    vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0));
}
/** Computes a radix-2 stage of a DFT on axis 0. */
__kernel void fft_radix_2_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-2 butterfly
    uint kx = get_global_id(0);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

    // Load two complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factor
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

    // Compute DFT N = 2
    DFT_2(c0, c1);

    // Store two complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
}
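// Indexing for the generic stage kernels: Nx is the butterfly span produced by
// the previous stages, Ni = Nx * radix is the span after this stage, kx
// enumerates butterflies along the transform axis, nx = kx % Nx is the
// position inside the current group and n = nx + (kx / Nx) * Ni is the offset
// of the butterfly's first element. The host is expected to pass
// exp_const = -2 * pi / Ni for a forward transform (positive for inverse), so
// phi = nx * exp_const is the base twiddle angle and element j is rotated by
// j * phi. Example: Nx = 2, radix 2 (Ni = 4) and kx = 3 give nx = 1, n = 5,
// pairing elements 5 and 7 with twiddle angle -pi / 2.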
/** Computes a radix-2 stage of a DFT on axis 1. */
__kernel void fft_radix_2_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-2 butterfly
    uint kx = get_global_id(1);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

    // Load two complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factor
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);

    // Compute DFT N = 2
    DFT_2(c0, c1);

    // Store two complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
}
/** Computes a radix-3 stage of a DFT on axis 0. */
__kernel void fft_radix_3_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-3 butterfly
    uint kx = get_global_id(0);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

    // Load three complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

    // Compute DFT N = 3
    DFT_3(c0, c1, c2);

    // Store three complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
}
/** Computes a radix-3 stage of a DFT on axis 1. */
__kernel void fft_radix_3_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-3 butterfly
    uint kx = get_global_id(1);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

    // Load three complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);

    // Compute DFT N = 3
    DFT_3(c0, c1, c2);

    // Store three complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
}
/** Computes a radix-4 stage of a DFT on axis 0. */
__kernel void fft_radix_4_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-4 butterfly
    uint kx = get_global_id(0);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

    // Load four complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

    // Compute DFT N = 4
    DFT_4(c0, c1, c2, c3);

    // Store four complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
}
/** Computes a radix-4 stage of a DFT on axis 1. */
__kernel void fft_radix_4_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-4 butterfly
    uint kx = get_global_id(1);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

    // Load four complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);

    // Compute DFT N = 4
    DFT_4(c0, c1, c2, c3);

    // Store four complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
}
/** Computes a radix-5 stage of a DFT on axis 0. */
__kernel void fft_radix_5_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-5 butterfly
    uint kx = get_global_id(0);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

    // Load five complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

    // Compute DFT N = 5
    DFT_5(c0, c1, c2, c3, c4);

    // Store five complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
}
/** Computes a radix-5 stage of a DFT on axis 1. */
__kernel void fft_radix_5_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-5 butterfly
    uint kx = get_global_id(1);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

    // Load five complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);

    // Compute DFT N = 5
    DFT_5(c0, c1, c2, c3, c4);

    // Store five complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
}
/** Computes a radix-7 stage of a DFT on axis 0. */
__kernel void fft_radix_7_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-7 butterfly
    uint kx = get_global_id(0);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

    // Load seven complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

    // Compute DFT N = 7
    DFT_7(c0, c1, c2, c3, c4, c5, c6);

    // Store seven complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
}
/** Computes a radix-7 stage of a DFT on axis 1. */
__kernel void fft_radix_7_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-7 butterfly
    uint kx = get_global_id(1);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

    // Load seven complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);

    // Compute DFT N = 7
    DFT_7(c0, c1, c2, c3, c4, c5, c6);

    // Store seven complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
}
/** Computes a radix-8 stage of a DFT on axis 0. */
__kernel void fft_radix_8_axis_0(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-8 butterfly
    uint kx = get_global_id(0);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;

    // Load eight complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);

    // Compute DFT N = 8
    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);

    // Store eight complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
    vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0));
}
/** Computes a radix-8 stage of a DFT on axis 1. */
__kernel void fft_radix_8_axis_1(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), uint Nx, uint Ni, float exp_const)
{
    // Each work-item computes a single radix-8 butterfly
    uint kx = get_global_id(1);

    // Compute nx
    uint nx = kx % Nx;

    // Compute n index
    uint n = nx + (kx / Nx) * Ni;

    // Get tensor pointers
    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;

    // Load eight complex input values
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0));

    // Compute phi
    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;

    // Multiply by twiddle factors
    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);

    // Compute DFT N = 8
    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);

    // Store eight complex output values
    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
    vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0));
}
#endif // defined(DATA_TYPE)