// NOTE(review): this file is a partial doxygen-style extract; the leading
// integer on each line is the ORIGINAL source line number and jumps in those
// numbers mark elided lines. Comments below annotate only what is visible.
//
// Overload set: load 4 elements of the given type and return them widened /
// converted to a float32x4_t NEON vector.
43 inline float32x4_t load_as_f32(T *ptr)
// float overload — source is already f32 (body elided in this extract).
50 inline float32x4_t load_as_f32(
float *ptr)
// Integer overloads; bodies elided except for uint8_t below. Presumably each
// widens (vmovl) then converts (vcvtq_f32_*) — confirm against the full source.
56 inline float32x4_t load_as_f32(int32_t *ptr)
62 inline float32x4_t load_as_f32(uint32_t *ptr)
68 inline float32x4_t load_as_f32(int16_t *ptr)
74 inline float32x4_t load_as_f32(uint16_t *ptr)
80 inline float32x4_t load_as_f32(uint8_t *ptr)
// uint8 path: load 8 bytes, widen u8->u16, take the low 4 lanes, widen to u32,
// convert to f32.
82 return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(
wrapper::vload(ptr)))));
// FP16 overload compiled only when ACLE FP16 vector arithmetic is available.
// (This extracted line fuses two physical source lines, 85 and 87.)
85 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 87 inline float32x4_t load_as_f32(float16_t *ptr)
// in_bounds_crop_window<T>: copy a run of output columns whose input
// coordinates are in bounds, converting T elements to float on the way.
// Partial extract — branch headers and some closing braces are elided; gaps in
// the embedded line numbers mark missing lines.
94 inline void in_bounds_crop_window(
const ITensor *
input,
const ITensor *output,
float *output_ptr, Coordinates input_offset,
95 int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit,
bool input_has_single_channel,
bool is_width_flipped)
// Single-channel fast path: step along dimension 1 (width) only.
101 if(input_has_single_channel)
103 int32_t x = output_width_start;
// negative_offset walks the width dimension BACKWARDS (decremented per step),
// so this visible fragment presumably belongs to the width-flipped read
// order — confirm against the full source (the is_width_flipped check is elided).
104 Coordinates negative_offset(input_offset);
105 negative_offset.set(1, negative_offset[1] - window_step_x + 1);
// Vector loop: window_step_x elements per iteration, loaded as f32.
106 for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
108 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
// Scalar tail: rewind to the last unprocessed column, then copy one-by-one.
115 input_offset[1] = negative_offset[1] + window_step_x - 1;
116 for(; x < output_width_limit; ++x, --input_offset[1])
118 *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
// Multi-channel path: per output column (width decremented, i.e. flipped read),
// vectorize over channels (dimension 0) with a scalar tail.
123 for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
125 input_offset.set(0, 0)
127 for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
129 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
130 wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
132 for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
134 *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
// Non-flipped contiguous path (branch header elided): when T is already float
// the whole strip is one memcpy; byte count = columns * channels * element size.
142 if(std::is_same<T, float>::value)
144 memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
145 reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
146 (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
// Otherwise convert element-wise over (columns * channels) contiguous
// elements: vector loop plus scalar tail.
151 int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
152 float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
153 for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
155 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
158 for(; x < limit; ++x, ++input_offset[0])
160 *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
// out_of_bounds_crop_window: fill a run of output elements with the
// extrapolation value — vector splat loop plus scalar tail. Partial extract;
// the vector store inside the first loop is elided.
166 inline void out_of_bounds_crop_window(
const ITensor *output,
float *output_ptr,
float extrapolation_value,
167 int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
// Splat the fill value across a 128-bit vector.
169 auto in =
wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
// limit counts ELEMENTS: columns * channels (output dimension 0).
171 int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
172 float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
// Vector loop (store elided in this extract), then scalar tail.
173 for(; x <= limit - window_step_x; x += window_step_x)
177 for(; x < limit; ++x)
179 *(output_start_ptr + x) = extrapolation_value;
// execute_window: drive one batch of the crop — fill fully out-of-bounds rows
// with the extrapolation value, then for each in-bounds row fill out-of-bounds
// columns and dispatch the typed in-bounds copy. Partial extract: the
// rows_out_of_bounds / cols_out_of_bounds and in_bounds_crop_function
// parameter declarations are elided from the signature shown here.
183 inline void execute_window(
const ITensor *input,
const ITensor *output, Coordinates input_offset,
float extrapolation_value,
185 bool is_height_flipped,
bool has_cols_in_bounds,
bool has_cols_out_of_bounds_before,
bool has_cols_out_of_bounds_after,
bool input_has_single_channel,
bool is_width_flipped)
// 16-byte NEON register / sizeof(float) = 4 lanes per vector step.
188 const int window_step_x = 16 /
sizeof(float);
189 auto *output_ptr =
reinterpret_cast<float *
>(output->buffer());
// Leading rows that lie entirely outside the input: extrapolation fill, then
// advance the output pointer past them (rows * width * channels floats).
206 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
207 output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
// In-bounds rows: walk the input row coordinate up or down per the height flip.
209 for(uint32_t row = rows_out_of_bounds[0];
static_cast<int32_t
>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
210 ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
// Columns before the input's left edge.
214 if(has_cols_out_of_bounds_before)
216 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
// Middle columns that map inside the input: typed copy via function pointer.
219 if(has_cols_in_bounds)
221 (*in_bounds_crop_function)(
input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
222 output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
// Columns past the input's right edge.
225 if(has_cols_out_of_bounds_after)
227 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
// Advance one full output row (width * channels floats).
229 output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
// Trailing rows entirely outside the input.
232 out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
// NECropKernel default-constructor member-initializer list plus configure()
// fragments: store the configuration and select the typed in-bounds crop
// routine. Partial extract — the data-type dispatch (presumably a switch on
// input->info()->data_type(); confirm against full source) is elided between
// the assignments below.
237 : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds(),
238 _in_bounds_crop_function(nullptr)
// configure(): remember the crop boxes, which box this kernel handles, and the
// value used for out-of-bounds pixels.
248 _crop_boxes = crop_boxes;
251 _crop_box_ind = crop_box_ind;
252 _extrapolation_value = extrapolation_value;
// One instantiation of in_bounds_crop_window per supported element type;
// FP16 only when ACLE FP16 vector arithmetic is available (this extracted
// line fuses physical source lines 259 and 261).
257 _in_bounds_crop_function = &in_bounds_crop_window<float>;
259 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 261 _in_bounds_crop_function = &in_bounds_crop_window<float16_t>;
265 _in_bounds_crop_function = &in_bounds_crop_window<uint32_t>;
268 _in_bounds_crop_function = &in_bounds_crop_window<int32_t>;
271 _in_bounds_crop_function = &in_bounds_crop_window<uint16_t>;
274 _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
277 _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
// Fragment computing the crop-window bounds and launching execute_window for
// one batch. Partial extract: every std::min(...) below is missing its second
// argument (the clamp limit) and closing punctuation — those lines are elided.
// A flipped axis means _end < _start, i.e. the box is read in reverse order.
321 bool is_width_flipped = _end[0] < _start[0];
322 bool is_height_flipped = _end[1] < _start[1];
// Height flipped: rows are read from _start[1] downwards, so the "before"
// out-of-bounds count comes from _start[1] exceeding the input height
// (input dimension 2) and the "after" count from _end[1] dropping below zero.
323 if(is_height_flipped)
325 _rows_out_of_bounds[0] = _start[1] >=
static_cast<int32_t
>(_input->
info()->
dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->
info()->
dimension(2) + 1),
328 _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
// Not flipped: mirror of the above.
334 _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
337 _rows_out_of_bounds[1] = _end[1] >=
static_cast<int32_t
>(_input->
info()->
dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->
info()->
dimension(2) + 1),
// Same computation for columns, against the input width (dimension 1).
343 _cols_out_of_bounds[0] = _start[0] >=
static_cast<int32_t
>(_input->
info()->
dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->
info()->
dimension(1) + 1),
346 _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
352 _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
355 _cols_out_of_bounds[1] = _end[0] >=
static_cast<int32_t
>(_input->
info()->
dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->
info()->
dimension(1) + 1),
// First in-bounds input coordinate for this batch: _start offset by the
// leading out-of-bounds count, direction depending on the flip; layout is
// (channel, width, height, batch).
373 Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
374 _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
// Flip and has-cols flags are recomputed inline as arguments here rather than
// reusing the locals above.
375 execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, _in_bounds_crop_function, _end[1] < _start[1],
376 _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->
info()->
dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
377 _start[0] <= _end[0], _end[0] < _start[0]);
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(t,...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool) InBoundsCropFunction
Function to use for in bounds crop for the particular tensor types passed to configure() ...
virtual ITensorInfo & set_tensor_shape(const TensorShape &shape)=0
Set the shape of an already initialized tensor.
1 channel, 1 U8 per channel
uint8x16_t vloadq(const uint8_t *ptr)
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
1 channel, 1 U16 per channel
void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind=0, float extrapolation_value=0)
Configure kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
uint8x8_t vgetlow(const uint8x16_t val)
void configure_output_shape()
Configure output tensor's shape as this can only be determined at runtime.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind=0, float extrapolation_value=0)
Static function to check if given info will lead to a valid configuration of CLStridedSliceKernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
uint8x8_t vgethigh(const uint8x16_t val)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
unsigned int num_dimensions() const
Returns the effective dimensionality of the tensor.
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
uint8x8_t vrev64(const uint8x8_t &a)
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Includes all wrapper headers at once.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(t,...)
Describe a multidimensional execution window.
virtual bool has_padding() const =0
Checks if the tensor has been allocated with padding or not.
NECropKernel()
Default constructor.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)