25 #ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H 26 #define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H 36 const float32x4x3_t r =
40 vld1q_dup_f32(1 + ptr),
41 vld1q_dup_f32(2 + ptr)
47 template <
unsigned int str
idex>
48 float32x4x2_t
convolve_3x3(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2);
51 inline float32x4x2_t
convolve_3x3<1>(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2)
53 const float32x4x3_t vtop =
57 vld1q_f32(in_top + 4),
61 const float32x4x3_t vmid =
65 vld1q_f32(in_mid + 4),
69 const float32x4x3_t vlow =
73 vld1q_f32(in_low + 4),
80 vmulq_f32(vtop.val[0], m0.val[0]),
81 vmulq_f32(vtop.val[1], m0.val[0])
84 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
85 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
87 out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
88 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
89 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
91 out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
92 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
93 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
95 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
96 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
98 out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
99 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
100 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
102 out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
103 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
104 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
109 inline float32x4x2_t
convolve_3x3<2>(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2)
111 float32x4x2_t out =
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
112 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
113 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
114 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
119 inline float32x4x2_t
convolve_3x3<3>(
const float *in_top,
const float *in_mid,
const float *in_low,
const float32x4x3_t &m0,
const float32x4x3_t &m1,
const float32x4x3_t &m2)
121 float32x4x2_t out =
convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
122 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
126 template <
unsigned int str
idex>
127 void store_results(
float *buffer,
const float32x4x2_t &values);
132 vst1q_f32(buffer, values.val[0]);
133 vst1q_f32(buffer + 4, values.val[1]);
139 vst1q_f32(buffer, values.val[0]);
145 vst1_f32(buffer, vget_low_f32(values.val[0]));
/** Maps a per-iteration written-element count to a per-iteration input-element count.
 *
 * Declared only; each explicit specialization below scales the argument by its template value.
 *
 * @tparam stridex Selects the specialization (1, 2 or 3).
 *
 * @param[in] num_elems_written_per_iteration Number of elements written per iteration.
 *
 * @return num_elems_written_per_iteration multiplied by stridex.
 */
template <unsigned int stridex>
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);

// NOTE(review): the specialization headers are missing from the listing and were rebuilt from its
// symbol index. They are marked `inline` so that the header can be included from several
// translation units without multiple-definition errors.

/** Returns the count unchanged. */
template <>
inline int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
{
    return num_elems_written_per_iteration;
}

/** Returns twice the count. */
template <>
inline int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
{
    return num_elems_written_per_iteration << 1;
}

/** Returns three times the count. */
template <>
inline int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
{
    return num_elems_written_per_iteration * 3;
}
float32x4x2_t convolve_3x3< 2 >(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration)
Copyright (c) 2017-2022 Arm Limited.
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
int get_input_num_elems_processed< 2 >(unsigned int num_elems_written_per_iteration)
void store_results(float *buffer, const float32x4x2_t &values)
Stores a float32x4x2_t array into a memory location.
float32x4x2_t convolve_3x3< 3 >(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
void store_results< 3 >(float *buffer, const float32x4x2_t &values)
void store_results< 1 >(float *buffer, const float32x4x2_t &values)
int get_input_num_elems_processed< 3 >(unsigned int num_elems_written_per_iteration)
float32x4x3_t load_matrix_row(const float *ptr)
int get_input_num_elems_processed< 1 >(unsigned int num_elems_written_per_iteration)
void store_results< 2 >(float *buffer, const float32x4x2_t &values)
float32x4x2_t convolve_3x3< 1 >(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)