Compute Library
 21.02
CpuSoftmaxKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
32 #include "src/core/CPP/Validate.h"
35 
39 
40 namespace arm_compute
41 {
42 namespace cpu
43 {
44 namespace kernels
45 {
46 namespace
47 {
48 struct SoftmaxSelectorData
49 {
51 };
55 
/** Registry entry for a softmax (logits 1D) micro-kernel implementation. */
struct SoftmaxLogits1DKernel
{
    const char              *name;        // Human-readable implementation identifier (for logging/tracing)
    const SoftmaxSelectorPtr is_selected; // Predicate: true when this implementation handles the selector data
    SoftmaxLogits1DKernelPtr ukernel;     // Function pointer to the micro-kernel (may be null if compiled out)
};
62 
/** Registry entry for a logits-1D row-max micro-kernel implementation. */
struct SoftmaxLogits1DMaxKernel
{
    const char                 *name;        // Human-readable implementation identifier
    const SoftmaxSelectorPtr    is_selected; // Predicate: true when this implementation handles the selector data
    SoftmaxLogits1DMaxKernelPtr ukernel;     // Function pointer to the micro-kernel (may be null if compiled out)
};
69 
/** Compile-time registry of softmax (logits 1D) micro-kernels.
 *
 * The first entry whose selector matches wins (see get_implementation_logits()).
 * SVE/SVE2 variants are compiled in preference to the NEON ones when the
 * corresponding architecture features are available at build time; the
 * REGISTER_* macros resolve to nullptr when a data type is compiled out.
 */
static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
{
#if defined(__ARM_FEATURE_SVE)
    {
        "sve_softmax_logits_1d_float",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
        REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
    },
    {
        "sve_softmax_logits_1d_float",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
        REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
    },
#else /* !defined(__ARM_FEATURE_SVE) */
    {
        "neon_softmax_logits_1d_float",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
        REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
    },
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    {
        "neon_softmax_logits_1d_float",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
        REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
    },
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
#endif /* defined(__ARM_FEATURE_SVE) */

// Quantized variants: SVE2 is required for the SVE quantized path.
#if defined(__ARM_FEATURE_SVE2)
    {
        "sve_softmax_logits_1d_quantized",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
        REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
    },
    {
        "sve_softmax_logits_1d_quantized",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
    },
#else /* !defined(__ARM_FEATURE_SVE2) */
    {
        "neon_softmax_logits_1d_quantized",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
    },
    {
        "neon_softmax_logits_1d_quantized",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
    },
#endif /* defined(__ARM_FEATURE_SVE2) */

};
123 
/** Compile-time registry of logits-1D row-max micro-kernels.
 *
 * Same selection rules as available_logits_1d_kernels: first matching entry
 * wins, SVE preferred over NEON when compiled in.
 */
static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
{
#if defined(__ARM_FEATURE_SVE)
    {
        "sve_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
        REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
    },
    {
        "sve_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
        REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
    },
    {
        "sve_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
        REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
    },
    {
        "sve_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
    },
#else /* !defined(__ARM_FEATURE_SVE) */
    {
        "neon_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
        REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
    },
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    {
        "neon_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
        REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
    },
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
    {
        "neon_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
    },
    {
        "neon_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
    },
#endif /* defined(__ARM_FEATURE_SVE) */
};
172 
173 const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
174 {
175  for(const auto &uk : available_logits_1d_kernels)
176  {
177  if(uk.is_selected({ data.dt }))
178  {
179  return &uk;
180  }
181  }
182  return nullptr;
183 }
184 
185 const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
186 {
187  for(const auto &uk : available_logits_1d_max_kernels)
188  {
189  if(uk.is_selected({ data.dt }))
190  {
191  return &uk;
192  }
193  }
194  return nullptr;
195 }
196 
197 Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
198 {
201 
202  // Validate in case of configured output
203  if(output.total_size() != 0)
204  {
207  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
208  }
209 
210  return Status{};
211 }
212 
213 } // namespace
214 
216 {
217 }
218 
220 {
222 
223  // Perform validation step
224  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
225 
226  // Softmax across the x dimension
227  const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
228  // Output auto initialization if not yet initialized
229  auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
230 
231  Window win = calculate_max_window(*src, Steps());
232  Coordinates coord;
233  coord.set_num_dimensions(dst->num_dimensions());
234  dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
235 
236  ICpuKernel::configure(win);
237 }
238 
240 {
242  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
243 
244  return Status{};
245 }
246 
247 void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
248 {
249  ARM_COMPUTE_UNUSED(info);
252 
253  const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
254  auto dst = tensors.get_tensor(TensorType::ACL_DST);
255 
256  const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type() });
257  uk->ukernel(src, dst, window);
258 }
259 
// Returns the kernel's display name (used for logging/profiling).
const char *CpuLogits1DMaxKernel::name() const
{
    return "CpuLogits1DMaxKernel";
}
264 
265 namespace
266 {
267 Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
268  const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
269 {
270  ARM_COMPUTE_UNUSED(beta);
271  // Check input
274 
275  const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
276 
277  // Check max
281 
282  // Check output if configured
283  if(dst.total_size() != 0)
284  {
285  const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
288  ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
289  }
290 
291  // Check tmp if configured
292  if(tmp.total_size() != 0)
293  {
294  const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
295  ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
296  // We could potentially reduce tmp memory if we could predict or make an assumption
297  // on the maximum number of threads that will run in parallel.
299  }
300 
301  return Status{};
302 }
303 } // namespace
304 
305 template <bool IS_LOG>
307  : _beta(1.0f)
308 {
309 }
310 
311 template <bool IS_LOG>
313 {
314  ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
315  ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
316  // Perform validation step
317  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
318 
319  _beta = beta;
320 
321  // Configure kernel window
322  const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
323 
324  // Output auto initialization if not yet initialized
325  const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
326  auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
327 
328  // Tmp auto initialization if not yet initialized
329  const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
330  auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
331 
332  // Configure kernel window
333  Window win = calculate_max_window(*max, Steps());
334  Coordinates coord;
335  coord.set_num_dimensions(dst->num_dimensions());
336  dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
337 
338  ICpuKernel::configure(win);
339 }
340 
341 template <bool IS_LOG>
343  const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
344 {
345  ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
346  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
347 
348  return Status{};
349 }
350 
351 template <bool IS_LOG>
353 {
354  ARM_COMPUTE_UNUSED(info);
357 
358  const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
359  auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
360  auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
361  auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
362 
363  const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
364  const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
365 
366  ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
367 
368  void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
369 
370  const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type() });
371  uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
372 }
373 
374 template <bool IS_LOG>
376 {
377  if(IS_LOG)
378  {
379  return "CpuLogits1DSoftmaxKernel";
380  }
381  else
382  {
383  return "CpuLogits1DLogSoftmaxKernel";
384  }
385 }
386 
// Explicit instantiations: log-softmax (true) and regular softmax (false).
template class CpuLogits1DSoftmaxKernel<true>;
template class CpuLogits1DSoftmaxKernel<false>;
389 
390 } // namespace kernels
391 } // namespace cpu
392 } // namespace arm_compute
const char * name() const override
Name of the kernel.
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
SoftmaxLogits1DKernelPtr ukernel
static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
Static function to check if given info will lead to a valid configuration of CpuLogits1DMaxKernel.
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
Set the input and output tensors.
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
#define REGISTER_FP16_NEON(func_name)
Definition: Registrars.h:42
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
Definition: Validate.h:610
const char * name
#define REGISTER_FP32_NEON(func_name)
Definition: Registrars.h:52
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define REGISTER_FP32_SVE(func_name)
Definition: Registrars.h:53
ITensorInfo & set_data_type(DataType data_type) override
Set the data type to the specified value.
Definition: TensorInfo.cpp:321
#define REGISTER_QASYMM8_SVE(func_name)
Definition: Registrars.h:73
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
#define REGISTER_QASYMM8_SIGNED_NEON(func_name)
Definition: Registrars.h:62
Store the tensor's metadata.
Definition: ITensorInfo.h:40
QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log)
Returns output quantization information for softmax layer.
Definition: Utils.cpp:462
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Interface for softmax computation for QASYMM8 with pre-computed max.
Status class.
Definition: Error.h:52
const char * name() const override
Name of the kernel.
virtual ITensorInfo & reset_padding()=0
Resets the padding settings of the tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
decltype(strategy::transforms) typedef type
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:288
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
1 channel, 1 F16 per channel
ITensorInfo & set_quantization_info(const QuantizationInfo &quantization_info) override
Set the quantization settings (scale and offset) of the tensor.
Definition: TensorInfo.cpp:380
#define REGISTER_QASYMM8_SIGNED_SVE(func_name)
Definition: Registrars.h:63
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:40
Quantization information.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
#define REGISTER_QASYMM8_NEON(func_name)
Definition: Registrars.h:72
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
void configure(const ITensorInfo *src, ITensorInfo *dst)
Set the input and output tensors.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1190
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get the tensor of a given id from the pack.
Definition: ITensorPack.cpp:50
Information about executing thread and CPU.
Definition: CPPTypes.h:235
const SoftmaxSelectorPtr is_selected
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443
#define REGISTER_FP16_SVE(func_name)
Definition: Registrars.h:43
static Status validate(const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
Static function to check if given info will lead to a valid configuration of CpuLogits1DSoftmaxKernel...
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
DataType dt
unsigned int num_elems_processed_per_iteration
Tensor packing service.
Definition: ITensorPack.h:37
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Store the tensor's metadata.
Definition: TensorInfo.h:45
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
Definition: Dimensions.h:149
quantized, asymmetric fixed-point 8-bit number signed
Container for valid region of a window.
Definition: Types.h:188
DataType
Available data types.
Definition: Types.h:77
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.