// NOTE(review): this chunk is a garbled doxygen extraction — the numbers fused
// into the lines (44, 51, ...) are the original file's line numbers, and the
// function bodies are mostly missing. Code left byte-identical; comments only.
//
// Overload family: load 4 contiguous elements of the given type from `ptr`
// and return them converted to a float32x4_t NEON vector. Narrower integer
// types are presumably widened before conversion (only the uint8_t body is
// visible below) — TODO confirm the other bodies against the full source.
44 inline float32x4_t load_as_f32(T *ptr)
51 inline float32x4_t load_as_f32(
float *ptr)
57 inline float32x4_t load_as_f32(int32_t *ptr)
63 inline float32x4_t load_as_f32(uint32_t *ptr)
69 inline float32x4_t load_as_f32(int16_t *ptr)
75 inline float32x4_t load_as_f32(uint16_t *ptr)
81 inline float32x4_t load_as_f32(uint8_t *ptr)
// uint8_t path: vload reads 8 bytes; vmovl_u8 widens u8 -> u16, vget_low_u16
// keeps the first 4 lanes, vmovl_u16 widens u16 -> u32, vcvtq_f32_u32
// converts to f32. Net effect: first 4 bytes at `ptr` become 4 floats.
83 return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(
wrapper::vload(ptr)))));
// FP16 overload exists only when the target supports FP16 vector arithmetic.
86 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 88 inline float32x4_t load_as_f32(float16_t *ptr)
// Copies the in-bounds part of one output row of the crop window from `input`
// into `output_ptr`, converting elements of type T to float.
// Three visible paths: (1) single-channel input with width flipped (walks the
// input backwards via a negative offset), (2) multi-channel generic path
// (per-pixel loop over channels), (3) contiguous path — plain memcpy when T is
// already float, otherwise vectorized convert-and-store with a scalar tail.
// NOTE(review): extraction has dropped interior lines/braces, so the exact
// branch structure between these paths is not fully visible here.
95 inline void in_bounds_crop_window(
const ITensor *
input,
const ITensor *output,
float *output_ptr, Coordinates input_offset,
96 int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit,
bool input_has_single_channel,
bool is_width_flipped)
102 if(input_has_single_channel)
// Flipped-width, single-channel path: bias the x-offset so that a forward
// vector load reads the `window_step_x` elements ending at input_offset[1].
104 int32_t x = output_width_start;
105 Coordinates negative_offset(input_offset);
106 negative_offset.set(1, negative_offset[1] - window_step_x + 1);
// Vector main loop: the offset moves backwards through the input as x advances.
107 for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
109 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
// Scalar tail: undo the vector bias, then copy one element at a time backwards.
116 input_offset[1] = negative_offset[1] + window_step_x - 1;
117 for(; x < output_width_limit; ++x, --input_offset[1])
119 *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
// Multi-channel generic path: per output pixel, iterate all channels
// (dimension 0), vectorized over channels with a scalar channel tail.
124 for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
126 input_offset.set(0, 0);
128 for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
130 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
131 wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
133 for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
135 *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
// Contiguous (non-flipped) path: when T is already float the whole row span
// is one memcpy; element_size() here is the output's (float) element size.
143 if(std::is_same<T, float>::value)
145 memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
146 reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
147 (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
// Otherwise: flat vectorized convert over width * channels, plus scalar tail.
152 int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
153 float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
154 for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
156 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
159 for(; x < limit; ++x, ++input_offset[0])
161 *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
// Fills the out-of-bounds span [output_width_start, output_width_limit) of the
// output (all channels, flattened) with `extrapolation_value`: a splatted
// 128-bit vector for the main loop, then a scalar tail.
167 inline void out_of_bounds_crop_window(
const ITensor *output,
float *output_ptr,
float extrapolation_value,
168 int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
// Broadcast the fill value into all four f32 lanes.
170 auto in =
wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
// `limit` counts floats, not pixels: width span times channel count (dim 0).
172 int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
173 float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
174 for(; x <= limit - window_step_x; x += window_step_x)
// Scalar tail for the remaining (limit % window_step_x) elements.
178 for(; x < limit; ++x)
180 *(output_start_ptr + x) = extrapolation_value;
// Executes one crop: fills the out-of-bounds rows above and below the crop box
// with `extrapolation_value`, and for each in-bounds row fills the
// out-of-bounds columns on either side and dispatches the in-bounds span to
// the type-specialized crop function selected at configure() time.
// NOTE(review): extraction has dropped the rows_out_of_bounds /
// cols_out_of_bounds / in_bounds_crop_function parameters from the visible
// signature — they are clearly used below; confirm against the full source.
184 inline void execute_window(
const ITensor *input,
const ITensor *output, Coordinates input_offset,
float extrapolation_value,
186 bool is_height_flipped,
bool has_cols_in_bounds,
bool has_cols_out_of_bounds_before,
bool has_cols_out_of_bounds_after,
bool input_has_single_channel,
bool is_width_flipped)
// 16 bytes per 128-bit store / sizeof(float) = 4 floats per vector step.
189 const int window_step_x = 16 /
sizeof(float);
190 auto *output_ptr =
reinterpret_cast<float *
>(output->buffer());
// Fill the leading out-of-bounds rows, then advance past them.
207 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
208 output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
// In-bounds rows: input row index walks backwards when height is flipped.
210 for(uint32_t row = rows_out_of_bounds[0];
static_cast<int32_t
>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
211 ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
// Left out-of-bounds columns of this row.
215 if(has_cols_out_of_bounds_before)
217 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
// In-bounds columns: delegate to the data-type-specific crop function.
220 if(has_cols_in_bounds)
222 (*in_bounds_crop_function)(
input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
223 output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
// Right out-of-bounds columns of this row.
226 if(has_cols_out_of_bounds_after)
228 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
// Advance to the next output row (dim1 = width, dim0 = channels).
230 output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
// Fill the trailing out-of-bounds rows.
233 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
// Default constructor: null tensor pointers, zeroed state, no crop function
// selected yet (configure() must run before the kernel is usable).
238 : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
239 _in_bounds_crop_function(nullptr)
// configure(): store the crop parameters...
249 _crop_boxes = crop_boxes;
252 _crop_box_ind = crop_box_ind;
253 _extrapolation_value = extrapolation_value;
// ...then select the in-bounds crop specialization matching the input's data
// type (presumably inside a switch on data_type() — the switch labels are not
// visible in this extraction; confirm against the full source).
258 _in_bounds_crop_function = &in_bounds_crop_window<float>;
// F16 case only exists when FP16 vector arithmetic is available.
260 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 262 _in_bounds_crop_function = &in_bounds_crop_window<float16_t>;
266 _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>;
269 _in_bounds_crop_function = &in_bounds_crop_window<int32_t>;
272 _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>;
275 _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
278 _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
// Computes how many rows/columns of the requested crop box fall outside the
// input tensor (separately for the leading and trailing edge, accounting for
// flipped boxes), then launches execute_window over the in-bounds region.
// Coordinate convention visible here: dim 1 = width, dim 2 = height.
// A box is "flipped" along an axis when end < start on that axis.
322 bool is_width_flipped = _end[0] < _start[0];
323 bool is_height_flipped = _end[1] < _start[1];
324 if(is_height_flipped)
// Flipped height: the leading edge is _start (top of the walk); rows are
// out of bounds above the input when _start >= height, below when _end < 0.
// NOTE(review): the std::min second arguments (the clamp limits) were lost
// in extraction — presumably clamped to the output height; confirm.
326 _rows_out_of_bounds[0] = _start[1] >=
static_cast<int32_t
>(_input->
info()->
dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->
info()->
dimension(2) + 1),
329 _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
// Normal height: out of bounds before the input when _start < 0, after it
// when _end >= height.
335 _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
338 _rows_out_of_bounds[1] = _end[1] >=
static_cast<int32_t
>(_input->
info()->
dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->
info()->
dimension(2) + 1),
// Same computation for columns (width, dim 1), flipped case first.
344 _cols_out_of_bounds[0] = _start[0] >=
static_cast<int32_t
>(_input->
info()->
dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->
info()->
dimension(1) + 1),
347 _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
353 _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
356 _cols_out_of_bounds[1] = _end[0] >=
static_cast<int32_t
>(_input->
info()->
dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->
info()->
dimension(1) + 1),
// First in-bounds input coordinate: step inwards from _start by the leading
// out-of-bounds count, in the walk direction (minus when flipped).
374 Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
375 _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
// has_cols_in_bounds is true iff the out-of-bounds columns don't cover the
// whole output width; the last two args are single-channel and width-flip
// flags consumed by in_bounds_crop_window.
376 execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1],
377 _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->
info()->
dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
378 _start[0] <= _end[0], _end[0] < _start[0]);
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (its rank).
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(t,...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool) InBoundsCropFunction
Function to use for in bounds crop for the particular tensor types passed to configure() ...
virtual ITensorInfo & set_tensor_shape(const TensorShape &shape)=0
Set the shape of an already initialized tensor.
1 channel, 1 U8 per channel
uint8x16_t vloadq(const uint8_t *ptr)
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
1 channel, 1 U16 per channel
void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind=0, float extrapolation_value=0)
Configure kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
uint8x8_t vgetlow(const uint8x16_t val)
void configure_output_shape()
Configure output tensor's shape as this can only be determined at runtime.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind=0, float extrapolation_value=0)
Static function to check if the given info will lead to a valid configuration of NECropKernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
uint8x8_t vgethigh(const uint8x16_t val)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
unsigned int num_dimensions() const
Returns the effective dimensionality of the tensor.
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
uint8x8_t vrev64(const uint8x8_t &a)
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Includes all wrapper headers at once.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(t,...)
Describe a multidimensional execution window.
virtual bool has_padding() const =0
Checks if the tensor has been allocated with padding or not.
NECropKernel()
Default constructor.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)