25 #ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
26 #define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
/** Load one row of a 3x3 convolution matrix.
 *
 * Each of the three consecutive coefficients is broadcast (vld1q_dup_f32)
 * across a full 4-lane vector, so a single vmlaq_f32 can apply one
 * coefficient to four output pixels at once.
 *
 * NOTE(review): enclosing signature reconstructed from a truncated chunk —
 * only the initializer line was visible; confirm the name against upstream.
 *
 * @param[in] ptr Pointer to the first of the row's three coefficients.
 *
 * @return The three broadcast coefficient vectors.
 */
inline float32x4x3_t load_matrix_row(const float *ptr)
{
    const float32x4x3_t r =
    {
        {
            vld1q_dup_f32(ptr),
            vld1q_dup_f32(1 + ptr),
            vld1q_dup_f32(2 + ptr)
        }
    };
    return r;
}
/** Perform a 3x3 convolution over three input rows, producing up to eight
 * float32 results (layout depends on the stride specialization).
 *
 * NOTE(review): the row-pointer parameters were inferred from the
 * specializations' call sites (convolve_3x3<1>(in_top, in_mid, in_low, ...)).
 *
 * @param[in] in_top Pointer to the top input row.
 * @param[in] in_mid Pointer to the middle input row.
 * @param[in] in_low Pointer to the bottom input row.
 * @param[in] m0     First (top) row of the convolution matrix, one broadcast vector per coefficient.
 * @param[in] m1     Second (middle) row of the convolution matrix.
 * @param[in] m2     Third (bottom) row of the convolution matrix.
 *
 * @return The convolved values.
 */
template <unsigned int stridex>
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low,
                           const float32x4x3_t &m0,
                           const float32x4x3_t &m1,
                           const float32x4x3_t &m2);
/** 3x3 convolution specialized for stride 1: computes eight consecutive
 * outputs, four in out.val[0] and four in out.val[1].
 *
 * @param[in] in_top Pointer to the top input row (at least 12 readable floats).
 * @param[in] in_mid Pointer to the middle input row (at least 12 readable floats).
 * @param[in] in_low Pointer to the bottom input row (at least 12 readable floats).
 * @param[in] m0     Broadcast coefficients of the matrix's top row.
 * @param[in] m1     Broadcast coefficients of the matrix's middle row.
 * @param[in] m2     Broadcast coefficients of the matrix's bottom row.
 *
 * @return Eight convolved values.
 */
template <>
inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low,
                                     const float32x4x3_t &m0,
                                     const float32x4x3_t &m1,
                                     const float32x4x3_t &m2)
{
    // Load 12 consecutive values from each row; the third vector only feeds
    // the vextq_f32 shifted taps of the second accumulator.
    const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
    const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
    const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};

    // Initialise both accumulators with the first top-row tap.
    float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}};

    // Outputs 0-3: shifted top-row taps, then the middle and bottom rows.
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);

    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);

    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);

    // Outputs 4-7: same taps, shifted one vector to the right.
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);

    return out;
}
/** 3x3 convolution specialized for stride 2: reuses the stride-1 kernel and
 * compacts the even-indexed results (lanes 0, 2, 4, 6 of the eight stride-1
 * outputs) into out.val[0]. Only out.val[0] holds valid results.
 *
 * @param[in] in_top Pointer to the top input row.
 * @param[in] in_mid Pointer to the middle input row.
 * @param[in] in_low Pointer to the bottom input row.
 * @param[in] m0     Broadcast coefficients of the matrix's top row.
 * @param[in] m1     Broadcast coefficients of the matrix's middle row.
 * @param[in] m2     Broadcast coefficients of the matrix's bottom row.
 *
 * @return Four convolved values in out.val[0].
 */
template <>
inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low,
                                     const float32x4x3_t &m0,
                                     const float32x4x3_t &m1,
                                     const float32x4x3_t &m2)
{
    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
    // Gather stride-1 outputs 2, 4 and 6 into lanes 1-3 of the first vector.
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    return out;
}
/** 3x3 convolution specialized for stride 3: reuses the stride-1 kernel and
 * compacts results 0 and 3 into the low half of out.val[0]. Only the first
 * two lanes of out.val[0] hold valid results.
 *
 * @param[in] in_top Pointer to the top input row.
 * @param[in] in_mid Pointer to the middle input row.
 * @param[in] in_low Pointer to the bottom input row.
 * @param[in] m0     Broadcast coefficients of the matrix's top row.
 * @param[in] m1     Broadcast coefficients of the matrix's middle row.
 * @param[in] m2     Broadcast coefficients of the matrix's bottom row.
 *
 * @return Two convolved values in the low half of out.val[0].
 */
template <>
inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low,
                                     const float32x4x3_t &m0,
                                     const float32x4x3_t &m1,
                                     const float32x4x3_t &m2)
{
    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
    // Move stride-1 output 3 next to output 0.
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    return out;
}
/** Store the results computed by convolve_3x3; the number of values written
 * depends on the stride specialization (8, 4 or 2).
 *
 * @param[out] buffer Output buffer to write to.
 * @param[in]  values Convolved values to store.
 */
template <unsigned int stridex>
void store_results(float *buffer, const float32x4x2_t &values);
/** Stride-1 store: writes all eight results (both vectors) contiguously.
 *
 * @param[out] buffer Output buffer (at least 8 writable floats).
 * @param[in]  values Eight convolved values.
 */
template <>
inline void store_results<1>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, values.val[0]);
    vst1q_f32(buffer + 4, values.val[1]);
}
/** Stride-2 store: writes the four valid results held in values.val[0]
 * (convolve_3x3<2> compacts its outputs into the first vector).
 *
 * @param[out] buffer Output buffer (at least 4 writable floats).
 * @param[in]  values Convolved values; only val[0] is stored.
 */
template <>
inline void store_results<2>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, values.val[0]);
}
/** Stride-3 store: writes the two valid results held in the low half of
 * values.val[0] (convolve_3x3<3> compacts its outputs into lanes 0 and 1).
 *
 * @param[out] buffer Output buffer (at least 2 writable floats).
 * @param[in]  values Convolved values; only the low half of val[0] is stored.
 */
template <>
inline void store_results<3>(float *buffer, const float32x4x2_t &values)
{
    vst1_f32(buffer, vget_low_f32(values.val[0]));
}
/** Number of input elements consumed per iteration for a given stride, as a
 * function of the number of output elements written per iteration: the input
 * pointer advances stridex elements per output element.
 *
 * NOTE(review): only the three return statements were visible in this chunk;
 * the function name and signatures are reconstructed — verify against upstream.
 *
 * @param[in] num_elems_written_per_iteration Output elements written per iteration.
 *
 * @return Input elements processed per iteration.
 */
template <unsigned int stridex>
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);

template <>
inline int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
{
    // Stride 1: one input element per output element.
    return num_elems_written_per_iteration;
}

template <>
inline int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
{
    // Stride 2: two input elements per output element.
    return num_elems_written_per_iteration << 1;
}

template <>
inline int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
{
    // Stride 3: three input elements per output element.
    return num_elems_written_per_iteration * 3;
}