47 constexpr
float SCALE_FACTOR = 0.7111111111111111f;
48 constexpr
float PI = 3.141592653589793f;
49 constexpr
float SCALE_180 = 180.0f / PI;
50 constexpr
float SCALE_360 = SCALE_180 * SCALE_FACTOR;
51 constexpr
float PI_4 = 0.7853981633974483f;
52 constexpr
float COEFF1 = 0.0663f;
53 constexpr
float COEFF2 = 0.2447f;
58 inline float32x4_t inv(float32x4_t x)
60 float32x4_t result = vrecpeq_f32(x);
61 result = vmulq_f32(vrecpsq_f32(x, result), result);
65 inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
67 const float32x4_t zero = vdupq_n_f32(0.0f);
68 const float32x4_t
epsilon = vdupq_n_f32(1e-9f);
69 const float32x4_t piover4 = vdupq_n_f32(PI_4);
70 const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
71 const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
72 const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
73 const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
74 const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
75 const float32x4_t
scale = vdupq_n_f32(SCALE_360);
77 float32x4_t abs_gx = vabsq_f32(gx);
78 float32x4_t abs_gy = vabsq_f32(gy);
79 float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
80 float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
81 float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
82 float32x4_t absz = vabsq_f32(z);
83 float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
86 float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
87 result = vmulq_f32(result, term);
88 result = vmlaq_f32(result, piover4, z);
91 result = vmulq_f32(result, scale);
94 result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
97 result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
98 result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
103 inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
105 const float32x4_t zero = vdupq_n_f32(0.0f);
106 const float32x4_t epsilon = vdupq_n_f32(1e-9f);
107 const float32x4_t piover4 = vdupq_n_f32(PI_4);
108 const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
109 const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
110 const float32x4_t ninety = vdupq_n_f32(90.0f);
111 const float32x4_t oneeighty = vdupq_n_f32(180.0f);
112 const float32x4_t threesixty = vdupq_n_f32(360.0f);
113 const float32x4_t scale = vdupq_n_f32(SCALE_180);
115 float32x4_t abs_gx = vabsq_f32(gx);
116 float32x4_t abs_gy = vabsq_f32(gy);
117 float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
118 float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
119 float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
120 float32x4_t absz = vabsq_f32(z);
123 float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
124 float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
125 result = vmulq_f32(result, term);
126 result = vmlaq_f32(result, piover4, z);
129 result = vmulq_f32(result, scale);
132 result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
135 result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
136 result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
137 result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
142 inline float32x4_t invsqrtv(float32x4_t x)
144 float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
146 sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
148 sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
151 return sqrt_reciprocal;
154 inline float32x4_t sqrtv(float32x4_t x)
156 float32x4_t res = vdupq_n_f32(0.5f);
157 return vmlaq_f32(res, x, invsqrtv(x));
160 inline int16x8_t
magnitude_l2(int16x8_t input1, int16x8_t input2)
162 const int32x4x2_t square_x =
165 vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
166 vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
170 const int32x4x2_t square_y =
173 vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
174 vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
178 const uint32x4x2_t
sum =
181 vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
182 vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
186 const float32x4x2_t res =
189 sqrtv(vcvtq_f32_u32(sum.val[0])),
190 sqrtv(vcvtq_f32_u32(sum.val[1]))
194 return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
195 vqmovn_s32(vcvtq_s32_f32(res.val[1])));
198 inline int16x8_t
magnitude_l1(int16x8_t input1, int16x8_t input2)
201 return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
204 inline uint8x8_t
phase_signed(int16x8_t input1, int16x8_t input2)
206 const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
208 float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
209 float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
210 float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
211 float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
214 float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
215 float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
217 angle_high = vaddq_f32(angle_high, zeropointfive);
218 angle_low = vaddq_f32(angle_low, zeropointfive);
220 return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
221 vqmovun_s32(vcvtq_s32_f32(angle_high))));
224 inline uint8x8_t
phase_unsigned(int16x8_t input1, int16x8_t input2)
226 const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
228 float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
229 float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
230 float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
231 float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
234 float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
235 float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
237 angle_high = vaddq_f32(angle_high, zeropointfive);
238 angle_low = vaddq_f32(angle_low, zeropointfive);
240 return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
241 vqmovun_s32(vcvtq_s32_f32(angle_high))));
245 template <MagnitudeType mag_type, PhaseType phase_type>
247 : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
251 template <MagnitudeType mag_type, PhaseType phase_type>
258 const bool run_mag = magnitude !=
nullptr;
259 const bool run_phase = phase !=
nullptr;
276 if(run_mag && run_phase)
315 magnitude_access.set_valid_region(win, valid_region);
316 phase_access.set_valid_region(win, valid_region);
318 INEKernel::configure(win);
321 template <MagnitudeType mag_type, PhaseType phase_type>
330 const int16x8x2_t input1 =
333 vld1q_s16(reinterpret_cast<int16_t *>(gx.
ptr())),
334 vld1q_s16(reinterpret_cast<int16_t *>(gx.
ptr()) + 8)
338 const int16x8x2_t input2 =
341 vld1q_s16(reinterpret_cast<int16_t *>(gy.
ptr())),
342 vld1q_s16(reinterpret_cast<int16_t *>(gy.
ptr()) + 8)
347 int16x8x2_t mag{ {} };
351 mag.val[0] =
magnitude_l2(input1.val[0], input2.val[0]);
352 mag.val[1] =
magnitude_l2(input1.val[1], input2.val[1]);
356 mag.val[0] =
magnitude_l1(input1.val[0], input2.val[0]);
357 mag.val[1] =
magnitude_l1(input1.val[1], input2.val[1]);
361 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.
ptr()), mag.val[0]);
362 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.
ptr()) + 8, mag.val[1]);
367 template <MagnitudeType mag_type, PhaseType phase_type>
376 const int16x8x2_t input1 =
379 vld1q_s16(reinterpret_cast<int16_t *>(gx.
ptr())),
380 vld1q_s16(reinterpret_cast<int16_t *>(gx.
ptr()) + 8)
384 const int16x8x2_t input2 =
387 vld1q_s16(reinterpret_cast<int16_t *>(gy.
ptr())),
388 vld1q_s16(reinterpret_cast<int16_t *>(gy.
ptr()) + 8)
393 uint8x8x2_t vphase{ {} };
397 vphase.val[0] =
phase_signed(input1.val[0], input2.val[0]);
398 vphase.val[1] =
phase_signed(input1.val[1], input2.val[1]);
407 vst1q_u8(phase.
ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
412 template <MagnitudeType mag_type, PhaseType phase_type>
422 const int16x8x2_t input1 =
425 vld1q_s16(reinterpret_cast<int16_t *>(gx.
ptr())),
426 vld1q_s16(reinterpret_cast<int16_t *>(gx.
ptr()) + 8)
430 const int16x8x2_t input2 =
433 vld1q_s16(reinterpret_cast<int16_t *>(gy.
ptr())),
434 vld1q_s16(reinterpret_cast<int16_t *>(gy.
ptr()) + 8)
439 int16x8x2_t mag{ {} };
443 mag.val[0] =
magnitude_l2(input1.val[0], input2.val[0]);
444 mag.val[1] =
magnitude_l2(input1.val[1], input2.val[1]);
448 mag.val[0] =
magnitude_l1(input1.val[0], input2.val[0]);
449 mag.val[1] =
magnitude_l1(input1.val[1], input2.val[1]);
453 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.
ptr()), mag.val[0]);
454 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.
ptr()) + 8, mag.val[1]);
457 uint8x8x2_t vphase{ {} };
461 vphase.val[0] =
phase_signed(input1.val[0], input2.val[0]);
462 vphase.val[1] =
phase_signed(input1.val[1], input2.val[1]);
471 vst1q_u8(phase.
ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
476 template <MagnitudeType mag_type, PhaseType phase_type>
484 (this->*_func)(window);
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
NEMagnitudePhaseKernel()
Default constructor.
const Window & window() const
The maximum window the kernel can be executed on.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
1 channel, 1 U8 per channel
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
SimpleTensor< uint8_t > phase(const SimpleTensor< T > &gx, const SimpleTensor< T > &gy, PhaseType phase_type)
uchar16 phase_signed(DATA_TYPE16 a, DATA_TYPE16 b)
Calculates signed phase between two inputs.
DATA_TYPE16 magnitude_l2(int16 a, int16 b)
Calculates L2 normalization between two inputs.
Template interface for the kernel to compute magnitude and phase.
const ValidRegion valid_region
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
uchar16 phase_unsigned(DATA_TYPE16 a, DATA_TYPE16 b)
Calculates unsigned phase between two inputs.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
ValidRegion intersect_valid_regions(const Ts &... regions)
Intersect multiple valid regions.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Information about executing thread and CPU.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
__kernel void magnitude_phase(__global uchar *gx_ptr, uint gx_stride_x, uint gx_step_x, uint gx_stride_y, uint gx_step_y, uint gx_offset_first_element_in_bytes, __global uchar *gy_ptr, uint gy_stride_x, uint gy_step_x, uint gy_stride_y, uint gy_step_y, uint gy_offset_first_element_in_bytes, __global uchar *magnitude_ptr, uint magnitude_stride_x, uint magnitude_step_x, uint magnitude_stride_y, uint magnitude_step_y, uint magnitude_offset_first_element_in_bytes, __global uchar *phase_ptr, uint phase_stride_x, uint phase_step_x, uint phase_stride_y, uint phase_step_y, uint phase_offset_first_element_in_bytes)
Calculate the magnitude and phase of given the gradients of an image.
Container for valid region of a window.
void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
Initialise the kernel's input, output.
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
SimpleTensor< T > magnitude(const SimpleTensor< T > &gx, const SimpleTensor< T > &gy, MagnitudeType magnitude_type)
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
DATA_TYPE16 magnitude_l1(DATA_TYPE16 a, DATA_TYPE16 b)
Calculates L1 normalization between two inputs.