/* NOTE(review): garbled extraction of the complex-DFT helper macros of an OpenCL
 * FFT file. Original source line numbers are fused into the text and many macro
 * lines are elided (e.g. the bodies of TWIDDLE_FACTOR_MULTIPLICATION's exp/cos
 * computation, DFT_2, DFT_4, and most of DFT_8), and the multi-line macros have
 * been collapsed onto two physical lines, so the `\` continuations no longer
 * terminate lines — NOT compilable as shown. Visible semantics: complex values
 * are VEC_DATA_TYPE(DATA_TYPE, 2) pairs (.x real, .y imaginary);
 * TWIDDLE_FACTOR_MULTIPLICATION rotates `input` by a twiddle factor `w`
 * (complex multiply, lines 38-39); DFT_3/5/7 implement radix-3/5/7 butterflies
 * using the SQRT3DIV2 and W5_*/W7_* cos/sin constants; DFT_8 builds s*/t*
 * intermediates with M_SQRT1_2_F factors. Restore the full source before
 * editing any of these macros. */
26 #if defined(DATA_TYPE) 32 #define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \ 34 VEC_DATA_TYPE(DATA_TYPE, 2) \ 38 tmp.x = (w.x * input.x) - (w.y * input.y); \ 39 tmp.y = (w.x * input.y) + (w.y * input.x); \ 48 #define DFT_2(c0, c1) \ 50 VEC_DATA_TYPE(DATA_TYPE, 2) \ 58 #define SQRT3DIV2 0.86602540378443f 66 #define DFT_3(c0, c1, c2) \ 68 VEC_DATA_TYPE(DATA_TYPE, 2) \ 70 VEC_DATA_TYPE(DATA_TYPE, 2) \ 72 c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \ 73 c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \ 74 c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \ 75 c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \ 86 #define DFT_4(c0, c1, c2, c3) \ 88 VEC_DATA_TYPE(DATA_TYPE, 2) \ 102 #define W5_A (DATA_TYPE)0.30901699437494f 103 #define W5_B (DATA_TYPE)0.95105651629515f 104 #define W5_C (DATA_TYPE)0.80901699437494f 105 #define W5_D (DATA_TYPE)0.58778525229247f 115 #define DFT_5(c0, c1, c2, c3, c4) \ 117 VEC_DATA_TYPE(DATA_TYPE, 2) \ 118 v0, v1, v2, v3, v4; \ 120 v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \ 121 v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \ 122 v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \ 123 v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \ 124 c0 = v0 + c1 + c2 + c3 + c4; \ 125 c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \ 126 c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \ 127 c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \ 128 c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \ 132 #define W7_A (DATA_TYPE)0.62348980185873f 133 #define W7_B (DATA_TYPE)0.78183148246802f 134 #define W7_C (DATA_TYPE)0.22252093395631f 135 #define W7_D (DATA_TYPE)0.97492791218182f 136 #define W7_E (DATA_TYPE)0.90096886790241f 137 #define W7_F (DATA_TYPE)0.43388373911755f 149 #define DFT_7(c0, c1, c2, c3, c4, c5, c6) \ 151 VEC_DATA_TYPE(DATA_TYPE, 2) \ 152 v0, v1, v2, v3, v4, v5, v6; \ 154 v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \ 155 v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \ 156 v3 = W7_E * (c1 + 
c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \ 157 v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \ 158 v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \ 159 v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \ 160 c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \ 161 c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \ 162 c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x); \ 163 c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x); \ 164 c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x); \ 165 c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x); \ 166 c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \ 180 #define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \ 182 VEC_DATA_TYPE(DATA_TYPE, 2) \ 183 v0, v1, v2, v3, v4, v5, v6, v7; \ 184 VEC_DATA_TYPE(DATA_TYPE, 2) \ 185 s0, s1, s2, s3, s4, s5, s6, s7; \ 186 VEC_DATA_TYPE(DATA_TYPE, 2) \ 200 s4.x = v4.x - v6.y; \ 201 s4.y = v4.y + v6.x; \ 202 s5.x = v5.x - v7.y; \ 203 s5.y = v5.y + v7.x; \ 204 s6.x = v4.x + v6.y; \ 205 s6.y = v4.y - v6.x; \ 206 s7.x = v5.x + v7.y; \ 207 s7.y = v5.y - v7.x; \ 210 t1.x = M_SQRT1_2_F * (s5.x - s5.y); \ 211 t1.y = M_SQRT1_2_F * (s5.x + s5.y); \ 212 t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \ 213 t2.y = M_SQRT1_2_F * (s7.x - s7.y); \ 245 __kernel
/* NOTE(review): garbled/elided extract (source line numbers fused into the
 * text; parameter list and Tensor3D setup missing) of the first-stage radix-2
 * FFT kernel along axis 0. Visible: vload4 reads two complex pairs from
 * input.ptr, DFT_2 butterflies .s01/.s23 in place, vstore4 writes to
 * output.ptr. Not compilable as shown — restore the full source first. */
void fft_radix_2_first_stage_axis_0(
263 data = vload4(0, (__global
DATA_TYPE *)input.ptr);
266 DFT_2(data.s01, data.s23);
269 vstore4(data, 0, (__global
DATA_TYPE *)output.ptr);
/* NOTE(review): garbled/elided extract of the first-stage radix-2 FFT kernel
 * along axis 1. Visible: two complex vload2 reads at y-offsets 0 and 1 and the
 * matching vstore2 writes; the DFT_2(data1, data2) call itself is elided in
 * this dump (original line ~316 missing). Not compilable as shown. */
293 __kernel
void fft_radix_2_first_stage_axis_1(
311 data1 = vload2(0, (__global
DATA_TYPE *)input.ptr);
313 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 1, 0));
319 vstore2(data1, 0, (__global DATA_TYPE *)output.ptr);
320 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 1, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-3 FFT kernel
 * along axis 0. Visible: vload4 (two complex pairs) plus vload2 at x-offset 2
 * (third pair), in-place DFT_3 on the three pairs, mirrored stores. Signature
 * and Tensor3D setup elided; not compilable as shown. */
344 __kernel
void fft_radix_3_first_stage_axis_0(
362 data0 = vload4(0, (__global DATA_TYPE *)input.ptr);
364 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 2, 0, 0));
367 DFT_3(data0.s01, data0.s23, data1.s01);
370 vstore4(data0, 0, (__global DATA_TYPE *)output.ptr);
371 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 2, 0, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-3 FFT kernel
 * along axis 1. Visible: three complex vload2 reads at y-offsets 0..2, in-place
 * DFT_3, mirrored vstore2 writes. Signature and Tensor3D setup elided; not
 * compilable as shown. */
395 __kernel
void fft_radix_3_first_stage_axis_1(
413 data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
415 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 1, 0));
417 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2, 0));
420 DFT_3(data0, data1, data2);
423 vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
424 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 1, 0));
425 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-4 FFT kernel
 * along axis 0. Visible: one vload8 (four complex pairs), in-place DFT_4 on
 * .s01/.s23/.s45/.s67, one vstore8. Signature and setup elided; not compilable
 * as shown. */
449 __kernel
void fft_radix_4_first_stage_axis_0(
467 data = vload8(0, (__global DATA_TYPE *)input.ptr);
470 DFT_4(data.s01, data.s23, data.s45, data.s67);
473 vstore8(data, 0, (__global DATA_TYPE *)output.ptr);
/* NOTE(review): garbled/elided extract of the first-stage radix-4 FFT kernel
 * along axis 1. Visible: four complex vload2 reads at y-offsets 0..3, in-place
 * DFT_4, mirrored vstore2 writes. Signature and setup elided; not compilable
 * as shown. */
497 __kernel
void fft_radix_4_first_stage_axis_1(
515 data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
517 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 1, 0));
519 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2, 0));
521 data3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3, 0));
524 DFT_4(data0, data1, data2, data3);
527 vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
528 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 1, 0));
529 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2, 0));
530 vstore2(data3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-5 FFT kernel
 * along axis 0. Visible: vload8 (four complex pairs) plus vload2 at x-offset 4
 * (fifth pair), in-place DFT_5, mirrored stores. Signature and setup elided;
 * not compilable as shown. */
554 __kernel
void fft_radix_5_first_stage_axis_0(
572 data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
574 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 4, 0, 0));
577 DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
580 vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
581 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 4, 0, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-5 FFT kernel
 * along axis 1. Visible: five complex vload2 reads at y-offsets 0..4, in-place
 * DFT_5, mirrored vstore2 writes. Signature and setup elided; not compilable
 * as shown. */
605 __kernel
void fft_radix_5_first_stage_axis_1(
623 data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
625 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 1, 0));
627 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2, 0));
629 data3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3, 0));
631 data4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 4, 0));
634 DFT_5(data0, data1, data2, data3, data4);
637 vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
638 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 1, 0));
639 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2, 0));
640 vstore2(data3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3, 0));
641 vstore2(data4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 4, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-7 FFT kernel
 * along axis 0. Visible: vload8 + vload4 (x-offset 4) + vload2 (x-offset 6)
 * gather seven complex pairs, in-place DFT_7, mirrored stores. Signature and
 * setup elided; not compilable as shown. */
665 __kernel
void fft_radix_7_first_stage_axis_0(
683 data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
685 data1 = vload4(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 4, 0, 0));
687 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 6, 0, 0));
690 DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
693 vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
694 vstore4(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 4, 0, 0));
695 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 6, 0, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-7 FFT kernel
 * along axis 1. Visible: seven complex vload2 reads at y-offsets 0..6, in-place
 * DFT_7, mirrored vstore2 writes. Signature and setup elided; not compilable
 * as shown. */
719 __kernel
void fft_radix_7_first_stage_axis_1(
737 data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
739 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 1, 0));
741 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2, 0));
743 data3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3, 0));
745 data4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 4, 0));
747 data5 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 5, 0));
749 data6 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 6, 0));
752 DFT_7(data0, data1, data2, data3, data4, data5, data6);
755 vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
756 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 1, 0));
757 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2, 0));
758 vstore2(data3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3, 0));
759 vstore2(data4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 4, 0));
760 vstore2(data5, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 5, 0));
761 vstore2(data6, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 6, 0));
/* NOTE(review): garbled/elided extract of the first-stage radix-8 FFT kernel
 * along axis 0. Visible: one vload16 (eight complex pairs), in-place DFT_8 on
 * .s01...sEF, one vstore16. Signature and setup elided; not compilable as
 * shown. */
785 __kernel
void fft_radix_8_first_stage_axis_0(
803 data = vload16(0, (__global DATA_TYPE *)input.ptr);
806 DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
809 vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
/* NOTE(review): garbled/elided extract of the first-stage radix-8 FFT kernel
 * along axis 1. Visible: eight complex vload2 reads at y-offsets 0..7, in-place
 * DFT_8, mirrored vstore2 writes. Signature and setup elided; not compilable
 * as shown. */
833 __kernel
void fft_radix_8_first_stage_axis_1(
851 data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
853 data1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 1, 0));
855 data2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2, 0));
857 data3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3, 0));
859 data4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 4, 0));
861 data5 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 5, 0));
863 data6 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 6, 0));
865 data7 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 7, 0));
868 DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
871 vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
872 vstore2(data1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 1, 0));
873 vstore2(data2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2, 0));
874 vstore2(data3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3, 0));
875 vstore2(data4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 4, 0));
876 vstore2(data5, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 5, 0));
877 vstore2(data6, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 6, 0));
878 vstore2(data7, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 7, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-2 FFT kernel
 * along axis 0 (Nx = span of this stage, Ni = radix * Nx, exp_const = twiddle
 * angle scale per the parameter list). Visible: per-item index
 * n = nx + (kx / Nx) * Ni (the `uint nx = ...` line is elided), pointer offset
 * into (x=n, y, z), two complex vload2 reads Nx apart in x, twiddle rotation of
 * c1 by phi = nx * exp_const, vstore2 writes; the DFT_2(c0, c1) call itself is
 * elided in this dump. Not compilable as shown. */
905 __kernel
void fft_radix_2_axis_0(
912 uint Nx, uint Ni,
float exp_const)
915 uint kx = get_global_id(0);
921 uint n = nx + (kx / Nx) * Ni;
925 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
935 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
937 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, Nx, 0, 0));
940 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
943 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
949 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
950 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, Nx, 0, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-2 FFT kernel
 * along axis 1 (kx comes from get_global_id(1); n indexes y instead of x).
 * Visible: two complex vload2 reads Nx apart in y, twiddle rotation of c1 by
 * phi = nx * exp_const, vstore2 writes; the `uint nx = ...` line, output setup
 * and the DFT_2(c0, c1) call are elided in this dump. Not compilable as
 * shown. */
977 __kernel
void fft_radix_2_axis_1(
978 TENSOR3D_DECLARATION(input)
984 uint Nx, uint Ni,
float exp_const)
987 uint kx = get_global_id(1);
993 uint n = nx + (kx / Nx) * Ni;
997 input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
1007 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1009 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, Nx, 0));
1012 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1015 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1021 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1022 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, Nx, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-3 FFT kernel
 * along axis 0. Visible: three complex vload2 reads spaced Nx apart in x,
 * twiddle rotations of c1/c2 by phi and 2*phi, vstore2 writes; the
 * `uint nx = ...` line and the DFT_3(c0, c1, c2) call are elided in this dump.
 * Not compilable as shown. */
1049 __kernel
void fft_radix_3_axis_0(
1050 TENSOR3D_DECLARATION(input)
1056 uint Nx, uint Ni,
float exp_const)
1059 uint kx = get_global_id(0);
1065 uint n = nx + (kx / Nx) * Ni;
1069 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
1079 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1081 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, Nx, 0, 0));
1083 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 2 * Nx, 0, 0));
1086 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1089 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1090 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1096 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1097 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, Nx, 0, 0));
1098 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 2 * Nx, 0, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-3 FFT kernel
 * along axis 1 (same structure as the axis-0 variant with x/y roles swapped).
 * Visible: three complex vload2 reads spaced Nx apart in y, twiddle rotations
 * of c1/c2, vstore2 writes; the `uint nx = ...` line and the DFT_3 call are
 * elided in this dump. Not compilable as shown. */
1125 __kernel
void fft_radix_3_axis_1(
1126 TENSOR3D_DECLARATION(input)
1132 uint Nx, uint Ni,
float exp_const)
1135 uint kx = get_global_id(1);
1141 uint n = nx + (kx / Nx) * Ni;
1145 input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
1155 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1157 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, Nx, 0));
1159 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2 * Nx, 0));
1162 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1165 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1166 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1172 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1173 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, Nx, 0));
1174 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2 * Nx, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-4 FFT kernel
 * along axis 0. Visible: four complex vload2 reads spaced Nx apart in x,
 * twiddle rotations of c1..c3 by phi..3*phi, in-place DFT_4, vstore2 writes;
 * the `uint nx = ...` line and output setup are elided. Not compilable as
 * shown. */
1201 __kernel
void fft_radix_4_axis_0(
1202 TENSOR3D_DECLARATION(input)
1208 uint Nx, uint Ni,
float exp_const)
1211 uint kx = get_global_id(0);
1217 uint n = nx + (kx / Nx) * Ni;
1221 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
1231 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1233 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, Nx, 0, 0));
1235 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 2 * Nx, 0, 0));
1237 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 3 * Nx, 0, 0));
1240 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1243 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1244 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1245 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1248 DFT_4(c0, c1, c2, c3);
1251 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1252 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, Nx, 0, 0));
1253 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 2 * Nx, 0, 0));
1254 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 3 * Nx, 0, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-4 FFT kernel
 * along axis 1. Visible: four complex vload2 reads spaced Nx apart in y,
 * twiddle rotations of c1..c3, in-place DFT_4, vstore2 writes; the
 * `uint nx = ...` line and output setup are elided. Not compilable as shown. */
1281 __kernel
void fft_radix_4_axis_1(
1282 TENSOR3D_DECLARATION(input)
1288 uint Nx, uint Ni,
float exp_const)
1291 uint kx = get_global_id(1);
1297 uint n = nx + (kx / Nx) * Ni;
1301 input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
1311 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1313 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, Nx, 0));
1315 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2 * Nx, 0));
1317 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3 * Nx, 0));
1320 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1323 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1324 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1325 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1328 DFT_4(c0, c1, c2, c3);
1331 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1332 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, Nx, 0));
1333 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2 * Nx, 0));
1334 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3 * Nx, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-5 FFT kernel
 * along axis 0. Visible: five complex vload2 reads spaced Nx apart in x,
 * twiddle rotations of c1..c4 by phi..4*phi, in-place DFT_5, vstore2 writes;
 * the `uint nx = ...` line and output setup are elided. Not compilable as
 * shown. */
1361 __kernel
void fft_radix_5_axis_0(
1362 TENSOR3D_DECLARATION(input)
1368 uint Nx, uint Ni,
float exp_const)
1371 uint kx = get_global_id(0);
1377 uint n = nx + (kx / Nx) * Ni;
1381 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
1391 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1393 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, Nx, 0, 0));
1395 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 2 * Nx, 0, 0));
1397 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 3 * Nx, 0, 0));
1399 c4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 4 * Nx, 0, 0));
1402 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1405 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1406 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1407 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1408 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
1411 DFT_5(c0, c1, c2, c3, c4);
1414 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1415 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, Nx, 0, 0));
1416 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 2 * Nx, 0, 0));
1417 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 3 * Nx, 0, 0));
1418 vstore2(c4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 4 * Nx, 0, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-5 FFT kernel
 * along axis 1. Visible: five complex vload2 reads spaced Nx apart in y,
 * twiddle rotations of c1..c4, in-place DFT_5, vstore2 writes; the
 * `uint nx = ...` line and output setup are elided. Not compilable as shown. */
1445 __kernel
void fft_radix_5_axis_1(
1446 TENSOR3D_DECLARATION(input)
1452 uint Nx, uint Ni,
float exp_const)
1455 uint kx = get_global_id(1);
1461 uint n = nx + (kx / Nx) * Ni;
1465 input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
1475 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1477 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, Nx, 0));
1479 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2 * Nx, 0));
1481 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3 * Nx, 0));
1483 c4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 4 * Nx, 0));
1486 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1489 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1490 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1491 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1492 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
1495 DFT_5(c0, c1, c2, c3, c4);
1498 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1499 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, Nx, 0));
1500 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2 * Nx, 0));
1501 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3 * Nx, 0));
1502 vstore2(c4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 4 * Nx, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-7 FFT kernel
 * along axis 0. Visible: seven complex vload2 reads spaced Nx apart in x,
 * twiddle rotations of c1..c6 by phi..6*phi, in-place DFT_7, vstore2 writes;
 * the `uint nx = ...` line and output setup are elided. Not compilable as
 * shown. */
1529 __kernel
void fft_radix_7_axis_0(
1530 TENSOR3D_DECLARATION(input)
1536 uint Nx, uint Ni,
float exp_const)
1539 uint kx = get_global_id(0);
1545 uint n = nx + (kx / Nx) * Ni;
1549 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
1559 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1561 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, Nx, 0, 0));
1563 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 2 * Nx, 0, 0));
1565 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 3 * Nx, 0, 0));
1567 c4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 4 * Nx, 0, 0));
1569 c5 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 5 * Nx, 0, 0));
1571 c6 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 6 * Nx, 0, 0));
1574 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1577 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1578 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1579 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1580 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
1581 TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
1582 TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
1585 DFT_7(c0, c1, c2, c3, c4, c5, c6);
1588 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1589 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, Nx, 0, 0));
1590 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 2 * Nx, 0, 0));
1591 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 3 * Nx, 0, 0));
1592 vstore2(c4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 4 * Nx, 0, 0));
1593 vstore2(c5, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 5 * Nx, 0, 0));
1594 vstore2(c6, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 6 * Nx, 0, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-7 FFT kernel
 * along axis 1. Visible: seven complex vload2 reads spaced Nx apart in y,
 * twiddle rotations of c1..c6, in-place DFT_7, vstore2 writes; the
 * `uint nx = ...` line and output setup are elided. Not compilable as shown. */
1621 __kernel
void fft_radix_7_axis_1(
1622 TENSOR3D_DECLARATION(input)
1628 uint Nx, uint Ni,
float exp_const)
1631 uint kx = get_global_id(1);
1637 uint n = nx + (kx / Nx) * Ni;
1641 input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
1651 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1653 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, Nx, 0));
1655 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2 * Nx, 0));
1657 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3 * Nx, 0));
1659 c4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 4 * Nx, 0));
1661 c5 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 5 * Nx, 0));
1663 c6 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 6 * Nx, 0));
1666 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1669 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1670 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1671 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1672 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
1673 TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
1674 TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
1677 DFT_7(c0, c1, c2, c3, c4, c5, c6);
1680 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1681 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, Nx, 0));
1682 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2 * Nx, 0));
1683 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3 * Nx, 0));
1684 vstore2(c4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 4 * Nx, 0));
1685 vstore2(c5, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 5 * Nx, 0));
1686 vstore2(c6, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 6 * Nx, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-8 FFT kernel
 * along axis 0. Visible: eight complex vload2 reads spaced Nx apart in x,
 * twiddle rotations of c1..c7 by phi..7*phi, in-place DFT_8, vstore2 writes;
 * the `uint nx = ...` line and output setup are elided. Not compilable as
 * shown. */
1713 __kernel
void fft_radix_8_axis_0(
1714 TENSOR3D_DECLARATION(input)
1720 uint Nx, uint Ni,
float exp_const)
1723 uint kx = get_global_id(0);
1729 uint n = nx + (kx / Nx) * Ni;
1733 input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
1743 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1745 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, Nx, 0, 0));
1747 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 2 * Nx, 0, 0));
1749 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 3 * Nx, 0, 0));
1751 c4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 4 * Nx, 0, 0));
1753 c5 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 5 * Nx, 0, 0));
1755 c6 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 6 * Nx, 0, 0));
1757 c7 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 7 * Nx, 0, 0));
1760 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1763 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1764 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1765 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1766 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
1767 TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
1768 TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
1769 TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
1772 DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
1775 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1776 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, Nx, 0, 0));
1777 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 2 * Nx, 0, 0));
1778 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 3 * Nx, 0, 0));
1779 vstore2(c4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 4 * Nx, 0, 0));
1780 vstore2(c5, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 5 * Nx, 0, 0));
1781 vstore2(c6, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 6 * Nx, 0, 0));
1782 vstore2(c7, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 7 * Nx, 0, 0));
/* NOTE(review): garbled/elided extract of the generic-stage radix-8 FFT kernel
 * along axis 1, followed by the closing #endif of the DATA_TYPE guard opened at
 * the top of the file. Visible: eight complex vload2 reads spaced Nx apart in
 * y, twiddle rotations of c1..c7, in-place DFT_8, vstore2 writes; the
 * `uint nx = ...` line and output setup are elided. Not compilable as shown. */
1809 __kernel
void fft_radix_8_axis_1(
1810 TENSOR3D_DECLARATION(input)
1816 uint Nx, uint Ni,
float exp_const)
1819 uint kx = get_global_id(1);
1825 uint n = nx + (kx / Nx) * Ni;
1829 input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
1839 c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
1841 c1 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, Nx, 0));
1843 c2 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 2 * Nx, 0));
1845 c3 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 3 * Nx, 0));
1847 c4 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 4 * Nx, 0));
1849 c5 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 5 * Nx, 0));
1851 c6 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 6 * Nx, 0));
1853 c7 = vload2(0, (__global DATA_TYPE *)
tensor3D_offset(&input, 0, 7 * Nx, 0));
1856 DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
1859 TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
1860 TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
1861 TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
1862 TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
1863 TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
1864 TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
1865 TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
1868 DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
1871 vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
1872 vstore2(c1, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, Nx, 0));
1873 vstore2(c2, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 2 * Nx, 0));
1874 vstore2(c3, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 3 * Nx, 0));
1875 vstore2(c4, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 4 * Nx, 0));
1876 vstore2(c5, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 5 * Nx, 0));
1877 vstore2(c6, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 6 * Nx, 0));
1878 vstore2(c7, 0, (__global DATA_TYPE *)
tensor3D_offset(&output, 0, 7 * Nx, 0));
1880 #endif // defined(DATA_TYPE)
int stride_z
Stride of the image in Z dimension (in bytes)
int stride_x
Stride of the image in X dimension (in bytes)
Structure to hold 3D tensor information.
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
#define CONVERT_TO_TENSOR3D_STRUCT(name)
__global uchar * ptr
Pointer to the starting position of the buffer.
int stride_y
Stride of the image in Y dimension (in bytes)
#define TENSOR3D_DECLARATION(name)
__global const uchar * tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
Get the pointer position of a Tensor3D.
#define VEC_DATA_TYPE(type, size)