Compute Library
 21.11
CpuSoftmaxKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
32 #include "src/core/CPP/Validate.h"
35 
39 
40 namespace arm_compute
41 {
42 namespace cpu
43 {
44 namespace kernels
45 {
46 namespace
47 {
// Bundle of inputs consulted by the micro-kernel selector lambdas below:
// the tensor data type and the capabilities of the CPU we are running on.
struct SoftmaxSelectorData
{
    // NOTE(review): this doc-extracted view drops a `DataType dt;` member that
    // the selector lambdas below read (`data.dt`) — confirm against the full file.
    const CPUInfo &ci; // CPU feature info (e.g. SVE/SVE2 availability)
};
56 
// Describes one softmax (logits-1D) micro-kernel: a human-readable name,
// a predicate deciding whether the kernel applies to the given selector data,
// and the function pointer implementing it.
struct SoftmaxLogits1DKernel
{
    const char *name;                     // identifier appended to the kernel's reported name
    const SoftmaxSelectorPtr is_selected; // true when this kernel supports the data type / CPU
    SoftmaxLogits1DKernelPtr ukernel;     // implementation entry point
};
63 
// Describes one logits-1D-max micro-kernel (the per-row maximum pass that
// precedes softmax): name, applicability predicate, and implementation.
struct SoftmaxLogits1DMaxKernel
{
    const char *name;                        // identifier appended to the kernel's reported name
    const SoftmaxSelectorPtr is_selected;    // true when this kernel supports the data type / CPU
    SoftmaxLogits1DMaxKernelPtr ukernel;     // implementation entry point
};
70 
// Registry of softmax (logits-1D) micro-kernels, in priority order:
// SVE float kernels first, then NEON float, then SVE2 quantized, then the
// NEON quantized fallbacks. The first entry whose selector returns true wins
// (see get_implementation_logits below).
static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
    {
        "sve_fp32_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
        REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
    },
    {
        "sve_fp16_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
        REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
    },
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */

#if defined(ARM_COMPUTE_ENABLE_NEON)
    {
        "neon_fp32_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
        REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
    },
    // NEON FP16 requires hardware FP16 vector arithmetic support at build time
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    {
        "neon_fp16_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
        REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
    },
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */

    // Quantized kernels: SVE2 variants are additionally gated on a runtime has_sve2() check
#if defined(ARM_COMPUTE_ENABLE_SVE2)
    {
        "sve2_qu8_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); },
        REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
    },
    {
        "sve2_qs8_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); },
        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
    },
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
    // NEON fallbacks for quantized types (no CPU-feature predicate needed)
    {
        "neon_qu8_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
    },
    {
        "neon_qs8_softmax_logits_1d",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
    },
};
124 
// Registry of logits-1D-max micro-kernels (per-row maximum pass), in priority
// order: SVE (float and quantized) first, then NEON. The first entry whose
// selector returns true wins (see get_implementation_logits_max below).
static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
    {
        "sve_fp32_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
        REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
    },
    {
        "sve_fp16_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
        REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
    },
    {
        "sve_qu8_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); },
        REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
    },
    {
        "sve_qs8_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); },
        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
    },
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
    {
        "neon_fp32_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
        REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
    },
    // NEON FP16 requires hardware FP16 vector arithmetic support at build time
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    {
        "neon_fp16_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
        REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
    },
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
    {
        "neon_qu8_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
    },
    {
        "neon_qs8_logits_1d_max",
        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
    },
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
};
174 
175 const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
176 {
177  for(const auto &uk : available_logits_1d_kernels)
178  {
179  if(uk.is_selected({ data.dt, CPUInfo::get() }))
180  {
181  return &uk;
182  }
183  }
184  return nullptr;
185 }
186 
187 const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
188 {
189  for(const auto &uk : available_logits_1d_max_kernels)
190  {
191  if(uk.is_selected({ data.dt, CPUInfo::get() }))
192  {
193  return &uk;
194  }
195  }
196  return nullptr;
197 }
198 
/** Validate src/dst tensor infos for the logits-1D-max kernel.
 *
 * The output holds the per-row maximum, i.e. the input shape with
 * dimension 0 collapsed to 1.
 *
 * NOTE(review): this doc-extracted view omits data-type validation macros
 * that run before the shape check in the full file.
 *
 * @param[in] input  Source tensor info.
 * @param[in] output Destination tensor info (may be unconfigured).
 *
 * @return Empty Status on success, an error Status otherwise.
 */
Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
{
    // Validate in case of configured output
    if(output.total_size() != 0)
    {
        // dst must match src's shape except dimension 0, which is reduced to 1
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
    }

    return Status{};
}
214 
215 } // namespace
216 
{
    // NOTE(review): the signature line is not visible in this view — this is
    // CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst).
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));

    // Softmax across the x dimension
    const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());

    // Resolve the best micro-kernel for this data type on the current CPU
    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });

    _run_method = uk->ukernel;
    // Reported name combines the wrapper class and the selected micro-kernel
    _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);

    // No per-iteration step constraints: the largest possible window over src
    Window win = calculate_max_window(*src, Steps());
    ICpuKernel::configure(win);
}
236 
{
    // NOTE(review): the signature line is not visible in this view — this is
    // the static CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst).
    // Delegates all checks to the shared argument validator.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));

    return Status{};
}
244 
245 void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
246 {
247  ARM_COMPUTE_UNUSED(info);
250  ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
251 
252  const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
253  auto dst = tensors.get_tensor(TensorType::ACL_DST);
254 
255  _run_method(src, dst, window);
256 }
257 
// Human-readable kernel name ("CpuLogits1DMaxKernel/<selected ukernel name>"),
// composed in configure().
const char *CpuLogits1DMaxKernel::name() const
{
    return _name.c_str();
}
262 
263 namespace
264 {
/** Validate all tensor infos involved in a softmax logits pass.
 *
 * NOTE(review): this doc-extracted view omits several validation macros
 * (input/max data-type and shape checks) that the full file runs at the
 * "Check input" / "Check max" markers below.
 *
 * @param[in] src    Source logits.
 * @param[in] max    Pre-computed per-row maxima of @p src — TODO confirm against the max kernel.
 * @param[in] dst    Softmax output (may be unconfigured).
 * @param[in] beta   Scaling factor; unused in the checks visible here.
 * @param[in] tmp    Scratch tensor (F32 for quantized inputs, else src's type).
 * @param[in] is_log True when validating a log-softmax configuration.
 *
 * @return Empty Status on success, an error Status otherwise.
 */
Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
                                         const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
{
    ARM_COMPUTE_UNUSED(beta);
    // Check input

    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());

    // Check max

    // Check output if configured
    if(dst.total_size() != 0)
    {
        // Quantized softmax has a fixed, type-dependent output quantization;
        // float outputs keep whatever quantization info they carry.
        const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
        ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
    }

    // Check tmp if configured
    if(tmp.total_size() != 0)
    {
        // Quantized inputs are processed in F32 scratch space
        const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
        ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
        // We could potentially reduce tmp memory if we could predict or make an assumption
        // on the maximum number of threads that will run in parallel.
    }

    return Status{};
}
301 } // namespace
302 
// NOTE(review): the signature line is not visible in this view — this is
// CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src,
// const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp).
template <bool IS_LOG>
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));

    // Configure kernel window
    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());

    // Output auto initialization if not yet initialized
    // (quantized outputs get a fixed softmax quantization; float passes through)
    const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
    auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());

    // Tmp auto initialization if not yet initialized
    // (quantized inputs are de-quantized into F32 scratch space)
    const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
    auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());

    // Resolve the best micro-kernel for this data type on the current CPU
    const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });

    // Reported name distinguishes log-softmax from plain softmax
    std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");

    _beta = beta;
    _run_method = uk->ukernel;
    _name = kernel_name.append("/").append(uk->name);

    // Configure kernel window
    // Derived from `max` (src shape with dim 0 collapsed to 1): each window
    // point corresponds to one full softmax row.
    Window win = calculate_max_window(*max, Steps());

    ICpuKernel::configure(win);
}
334 
// NOTE(review): the first signature line is not visible in this view — this is
// the static CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src,
// const ITensorInfo *max, ...). It delegates to the shared argument validator.
template <bool IS_LOG>
 const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));

    return Status{};
}
344 
// NOTE(review): the signature line is not visible in this view — this is
// CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors,
// const Window &window, const ThreadInfo &info).
template <bool IS_LOG>
{
    // UNUSED guard: in release builds the ERROR_ON below compiles out,
    // leaving `info` otherwise unreferenced.
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON(_run_method == nullptr);

    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
    auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
    auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);

    // Each invocation needs scratch space for one full row of the source
    const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
    const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;

    // tmp must be big enough to give every worker thread its own slice
    ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));

    // Carve out this thread's private slice of the shared tmp buffer
    void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
    _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
}
366 
// NOTE(review): the signature line is not visible in this view — this is
// CpuLogits1DSoftmaxKernel<IS_LOG>::name(), returning the name composed
// in configure() (e.g. "CpuLogits1DSoftmaxKernel/<ukernel name>").
template <bool IS_LOG>
{
    return _name.c_str();
}
372 
// Explicit instantiations: log-softmax (true) and plain softmax (false).
template class CpuLogits1DSoftmaxKernel<true>;
template class CpuLogits1DSoftmaxKernel<false>;
375 
376 } // namespace kernels
377 } // namespace cpu
378 } // namespace arm_compute
const char * name() const override
Name of the kernel.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
SoftmaxLogits1DKernelPtr ukernel
static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
Static function to check if given info will lead to a valid configuration.
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
Set the input and output tensors.
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:115
#define REGISTER_FP16_NEON(func_name)
Definition: Registrars.h:42
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
Definition: Validate.h:606
const char * name
#define REGISTER_FP32_NEON(func_name)
Definition: Registrars.h:61
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define REGISTER_FP32_SVE(func_name)
Definition: Registrars.h:62
ITensorInfo & set_data_type(DataType data_type) override
Set the data type to the specified value.
Definition: TensorInfo.cpp:287
#define REGISTER_QASYMM8_SVE(func_name)
Definition: Registrars.h:91
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
#define REGISTER_QASYMM8_SIGNED_NEON(func_name)
Definition: Registrars.h:76
Store the tensor's metadata.
Definition: ITensorInfo.h:40
QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log)
Returns output quantization information for softmax layer.
Definition: Utils.cpp:467
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Interface for softmax computation for QASYMM8 with pre-computed max.
Status class.
Definition: Error.h:52
const char * name() const override
Name of the kernel.
virtual ITensorInfo & reset_padding()=0
Resets the padding settings of the tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
decltype(strategy::transforms) typedef type
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:284
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
ITensorInfo & set_quantization_info(const QuantizationInfo &quantization_info) override
Set the quantization settings (scale and offset) of the tensor.
Definition: TensorInfo.cpp:346
#define REGISTER_QASYMM8_SIGNED_SVE(func_name)
Definition: Registrars.h:77
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:54
Quantization information.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
#define REGISTER_QASYMM8_NEON(func_name)
Definition: Registrars.h:90
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
unsigned int num_elems_processed_per_iteration
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
void configure(const ITensorInfo *src, ITensorInfo *dst)
Set the input and output tensors.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1003
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:64
Information about executing thread and CPU.
Definition: CPPTypes.h:158
const SoftmaxSelectorPtr is_selected
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:439
#define REGISTER_FP16_SVE(func_name)
Definition: Registrars.h:43
static Status validate(const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
Static function to check if given info will lead to a valid configuration.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
const CPUInfo & ci
DataType dt
Tensor packing service.
Definition: ITensorPack.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
Store the tensor's metadata.
Definition: TensorInfo.h:43
quantized, asymmetric fixed-point 8-bit number signed
static CPUInfo & get()
Access the CPUInfo singleton.
Definition: CPPTypes.cpp:39
std::string kernel_name
DataType
Available data types.
Definition: Types.h:79
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:201
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.