37 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS 43 constexpr std::array<uint8_t, 256> b64_invtab =
45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
48 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
49 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
50 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
51 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
52 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 std::string decode_base64(
const std::string &
str)
71 constexpr
const char pad_char =
'=';
87 std::size_t padding = (
str.rbegin()[0] == pad_char) + (
str.rbegin()[1] == pad_char);
88 const int str_len =
str.size();
93 dec_b64.reserve(((str_len / 4) * 3));
97 const int end = str_len - 4 - padding;
98 for(; c <=
end; c += 4)
100 const int byte0 = b64_invtab[
str[c]];
101 const int byte1 = b64_invtab[
str[c + 1]];
102 const int byte2 = b64_invtab[
str[c + 2]];
103 const int byte3 = b64_invtab[
str[c + 3]];
105 dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
106 dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
107 dec_b64.push_back((byte2 << 6) | (byte3));
113 const int byte0 = b64_invtab[
str[c]];
114 const int byte1 = b64_invtab[
str[c + 1]];
115 const int byte2 = b64_invtab[
str[c + 2]];
117 dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
118 dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
120 else if(padding == 2)
122 const int byte0 = b64_invtab[
str[c]];
123 const int byte1 = b64_invtab[
str[c + 1]];
125 dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
137 std::string decompress_zlib(
const std::string &
str)
141 if(inflateInit(&ds) != Z_OK)
143 return std::string();
145 ds.avail_in =
str.size();
146 ds.next_in = (Bytef *)
str.data();
150 char roll_buff[16384];
151 std::string inflated_str;
154 ds.avail_out =
sizeof(roll_buff);
155 ds.next_out = reinterpret_cast<Bytef *>(roll_buff);
157 status = inflate(&ds, 0);
158 if(inflated_str.size() < ds.total_out)
160 inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
163 while(status == Z_OK);
167 if(status != Z_STREAM_END)
169 return std::string();
178 const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
180 {
"activation_layer",
"activation_layer.cl" },
181 {
"activation_layer_quant",
"activation_layer_quant.cl" },
182 {
"activation_layer_quant_f32",
"activation_layer_quant.cl" },
183 {
"arg_min_max_x",
"arg_min_max.cl" },
184 {
"arg_min_max_y",
"arg_min_max.cl" },
185 {
"arg_min_max_z",
"arg_min_max.cl" },
186 {
"arg_min_max_w",
"arg_min_max.cl" },
187 {
"batch_to_space_nchw",
"batch_to_space.cl" },
188 {
"batch_to_space_static_nchw",
"batch_to_space.cl" },
189 {
"batch_to_space_nhwc",
"batch_to_space.cl" },
190 {
"batch_to_space_static_nhwc",
"batch_to_space.cl" },
191 {
"batchnormalization_layer_nchw",
"batchnormalization_layer.cl" },
192 {
"batchnormalization_layer_nhwc",
"batchnormalization_layer.cl" },
193 {
"bitwise_or",
"bitwise_op.cl" },
194 {
"bitwise_and",
"bitwise_op.cl" },
195 {
"bitwise_xor",
"bitwise_op.cl" },
196 {
"bitwise_not",
"bitwise_op.cl" },
197 {
"bounding_box_transform",
"bounding_box_transform.cl" },
198 {
"bounding_box_transform_quantized",
"bounding_box_transform_quantized.cl" },
199 {
"channel_shuffle_nchw",
"channel_shuffle.cl" },
200 {
"channel_shuffle_nhwc",
"channel_shuffle.cl" },
201 {
"compare_equal",
"comparisons.cl" },
202 {
"compare_equal_quantized",
"comparisons.cl" },
203 {
"compare_notequal",
"comparisons.cl" },
204 {
"compare_notequal_quantized",
"comparisons.cl" },
205 {
"compare_greater",
"comparisons.cl" },
206 {
"compare_greater_quantized",
"comparisons.cl" },
207 {
"compare_greaterequal",
"comparisons.cl" },
208 {
"compare_greaterequal_quantized",
"comparisons.cl" },
209 {
"compare_less",
"comparisons.cl" },
210 {
"compare_less_quantized",
"comparisons.cl" },
211 {
"compare_lessequal",
"comparisons.cl" },
212 {
"compare_lessequal_quantized",
"comparisons.cl" },
213 {
"concatenate",
"concatenate.cl" },
214 {
"concatenate_width",
"concatenate.cl" },
215 {
"concatenate_height",
"concatenate.cl" },
216 {
"concatenate_width_x2",
"concatenate.cl" },
217 {
"concatenate_width_x4",
"concatenate.cl" },
218 {
"col2im",
"col2im.cl" },
219 {
"convert_depth_down",
"depth_convert.cl" },
220 {
"convert_depth_up",
"depth_convert.cl" },
221 {
"convert_fc_weights",
"convert_fc_weights.cl" },
222 {
"copy_tensor",
"copy_tensor.cl" },
223 {
"crop_tensor",
"crop_tensor.cl" },
224 {
"deconvolution_reshape",
"deconvolution_layer.cl" },
225 {
"deconvolution_upsample",
"deconvolution_layer.cl" },
226 {
"depthwise_convolution_3x3",
"depthwise_convolution.cl" },
227 {
"depthwise_convolution_3x3_f16",
"depthwise_convolution.cl" },
228 {
"depthwise_convolution_3x3_nhwc",
"depthwise_convolution.cl" },
229 {
"depthwise_convolution_3x3_nhwc_stride1",
"depthwise_convolution.cl" },
230 {
"dwc_MxN_native_fp_nhwc",
"depthwise_convolution.cl" },
231 {
"dwc_MxN_native_quantized8_nhwc",
"depthwise_convolution_quantized.cl" },
232 {
"dwc_3x3_native_quantized8_nchw",
"depthwise_convolution_quantized.cl" },
233 {
"dwc_3x3_native_quantized8_dot8_nchw",
"depthwise_convolution_quantized.cl" },
234 {
"depth_to_space_nchw",
"depth_to_space.cl" },
235 {
"depth_to_space_nhwc",
"depth_to_space.cl" },
236 {
"depthwise_convolution_3x3_stridex1_stridey1_f16",
"depthwise_convolution.cl" },
237 {
"depthwise_convolution_3x3_stridex2_stridey2_f16",
"depthwise_convolution.cl" },
238 {
"depthwise_convolution_3x3_stridex1_stridey1_f32",
"depthwise_convolution.cl" },
239 {
"depthwise_convolution_3x3_stridex2_stridey2_f32",
"depthwise_convolution.cl" },
240 {
"dequantization_layer",
"dequantization_layer.cl" },
241 {
"dequantization_layer_per_channel_nhwc",
"dequantization_layer.cl" },
242 {
"dequantization_layer_per_channel_nchw",
"dequantization_layer.cl" },
243 {
"direct_convolution_nhwc",
"direct_convolution.cl" },
244 {
"direct_convolution1x1",
"direct_convolution1x1.cl" },
245 {
"direct_convolution1x1_f32_bifrost",
"direct_convolution1x1.cl" },
246 {
"direct_convolution3x3",
"direct_convolution3x3.cl" },
247 {
"direct_convolution3x3_f32_bifrost",
"direct_convolution3x3.cl" },
248 {
"direct_convolution5x5",
"direct_convolution5x5.cl" },
249 {
"direct_convolution5x5_f32_bifrost",
"direct_convolution5x5.cl" },
250 {
"direct_convolution_quantized",
"direct_convolution_quantized.cl" },
251 {
"elementwise_operation_ADD",
"elementwise_operation.cl" },
252 {
"elementwise_operation_SUB",
"elementwise_operation.cl" },
253 {
"elementwise_operation_MAX",
"elementwise_operation.cl" },
254 {
"elementwise_operation_MIN",
"elementwise_operation.cl" },
255 {
"elementwise_operation_DIV",
"elementwise_operation.cl" },
256 {
"elementwise_operation_SQUARED_DIFF",
"elementwise_operation.cl" },
257 {
"elementwise_operation_POWER",
"elementwise_operation.cl" },
258 {
"elementwise_operation_PRELU",
"elementwise_operation.cl" },
259 {
"elementwise_operation_AND",
"elementwise_operation.cl" },
260 {
"elementwise_operation_OR",
"elementwise_operation.cl" },
261 {
"elementwise_operation_ADD_quantized",
"elementwise_operation_quantized.cl" },
262 {
"elementwise_operation_SUB_quantized",
"elementwise_operation_quantized.cl" },
263 {
"elementwise_operation_MAX_quantized",
"elementwise_operation_quantized.cl" },
264 {
"elementwise_operation_MIN_quantized",
"elementwise_operation_quantized.cl" },
265 {
"elementwise_operation_DIV_quantized",
"elementwise_operation_quantized.cl" },
266 {
"elementwise_operation_SQUARED_DIFF_quantized",
"elementwise_operation_quantized.cl" },
267 {
"elementwise_operation_PRELU_quantized",
"elementwise_operation_quantized.cl" },
268 {
"elementwise_unary",
"elementwise_unary.cl" },
269 {
"fft_digit_reverse_axis_0",
"fft_digit_reverse.cl" },
270 {
"fft_digit_reverse_axis_1",
"fft_digit_reverse.cl" },
271 {
"fft_radix_2_first_stage_axis_0",
"fft.cl" },
272 {
"fft_radix_2_first_stage_axis_1",
"fft.cl" },
273 {
"fft_radix_2_axis_0",
"fft.cl" },
274 {
"fft_radix_2_axis_1",
"fft.cl" },
275 {
"fft_radix_3_first_stage_axis_0",
"fft.cl" },
276 {
"fft_radix_3_first_stage_axis_1",
"fft.cl" },
277 {
"fft_radix_3_axis_0",
"fft.cl" },
278 {
"fft_radix_3_axis_1",
"fft.cl" },
279 {
"fft_radix_4_first_stage_axis_0",
"fft.cl" },
280 {
"fft_radix_4_first_stage_axis_1",
"fft.cl" },
281 {
"fft_radix_4_axis_0",
"fft.cl" },
282 {
"fft_radix_4_axis_1",
"fft.cl" },
283 {
"fft_radix_5_first_stage_axis_0",
"fft.cl" },
284 {
"fft_radix_5_first_stage_axis_1",
"fft.cl" },
285 {
"fft_radix_5_axis_0",
"fft.cl" },
286 {
"fft_radix_5_axis_1",
"fft.cl" },
287 {
"fft_radix_7_first_stage_axis_0",
"fft.cl" },
288 {
"fft_radix_7_first_stage_axis_1",
"fft.cl" },
289 {
"fft_radix_7_axis_0",
"fft.cl" },
290 {
"fft_radix_7_axis_1",
"fft.cl" },
291 {
"fft_radix_8_first_stage_axis_0",
"fft.cl" },
292 {
"fft_radix_8_first_stage_axis_1",
"fft.cl" },
293 {
"fft_radix_8_axis_0",
"fft.cl" },
294 {
"fft_radix_8_axis_1",
"fft.cl" },
295 {
"fft_scale_conj",
"fft_scale.cl" },
296 {
"fill_image_borders_constant",
"fill_border.cl" },
297 {
"fill_image_borders_replicate",
"fill_border.cl" },
298 {
"floor_layer",
"floor.cl" },
299 {
"fuse_batchnormalization_layer",
"batchnormalization_layer.cl" },
300 {
"gather",
"gather.cl" },
301 {
"gemm_ma_f16",
"gemm.cl" },
302 {
"gemm_ma_f32",
"gemm.cl" },
303 {
"gemm_mv",
"gemv.cl" },
304 {
"gemm_mv_quantized",
"gemv.cl" },
305 {
"gemm_mm_interleaved_transposed_f16",
"gemm_v1.cl" },
306 {
"gemm_mm_interleaved_transposed_f16_acc32",
"gemm_v1.cl" },
307 {
"gemm_mm_interleaved_transposed_f16_bifrost",
"gemm_v1.cl" },
308 {
"gemm_mm_interleaved_transposed_f32",
"gemm_v1.cl" },
309 {
"gemm_mm_interleaved_transposed_f32_bifrost",
"gemm_v1.cl" },
310 {
"gemm_mm_floating_point",
"gemm_v1.cl" },
311 {
"gemm_mm_floating_point_f16_bifrost",
"gemm_v1.cl" },
312 {
"gemm_mm_floating_point_f16_bifrost_acc32",
"gemm_v1.cl" },
313 {
"gemm_mm_floating_point_f32_bifrost",
"gemm_v1.cl" },
314 {
"gemm_mm_floating_point_f32_bifrost_1000",
"gemm_v1.cl" },
315 {
"gemm_mm_native",
"gemm.cl" },
316 {
"gemm_mm_reshaped_lhs_nt_rhs_t",
"gemm.cl" },
317 {
"gemm_mm_reshaped_lhs_nt_rhs_t_texture",
"gemm.cl" },
318 {
"gemm_mm_reshaped_lhs_t_rhs_nt",
"gemm.cl" },
319 {
"gemm_mm_reshaped_lhs_t_rhs_nt_texture",
"gemm.cl" },
320 {
"gemm_mm_reshaped_only_rhs_nt",
"gemm.cl" },
321 {
"gemm_mm_reshaped_only_rhs_nt_texture",
"gemm.cl" },
322 {
"gemm_mm_reshaped_only_rhs_t",
"gemm.cl" },
323 {
"gemm_mm_reshaped_only_rhs_t_texture",
"gemm.cl" },
324 {
"gemm_lc_vm_f32",
"gemm.cl" },
325 {
"gemm_reshape_lhs_matrix_nt",
"gemm.cl" },
326 {
"gemm_reshape_lhs_matrix_t",
"gemm.cl" },
327 {
"gemm_reshape_rhs_matrix_nt",
"gemm.cl" },
328 {
"gemm_reshape_rhs_matrix_t",
"gemm.cl" },
329 {
"gemmlowp_matrix_a_reduction",
"gemmlowp.cl" },
330 {
"gemmlowp_matrix_a_reduction_dot8",
"gemmlowp.cl" },
331 {
"gemmlowp_matrix_b_reduction",
"gemmlowp.cl" },
332 {
"gemmlowp_mm_native",
"gemmlowp.cl" },
333 {
"gemmlowp_mm_reshaped_lhs_nt_rhs_t",
"gemmlowp.cl" },
334 {
"gemmlowp_mm_reshaped_only_rhs_t",
"gemmlowp.cl" },
335 {
"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint",
"gemmlowp.cl" },
336 {
"gemmlowp_offset_contribution",
"gemmlowp.cl" },
337 {
"gemmlowp_offset_contribution_quantize_down",
"gemmlowp.cl" },
338 {
"gemmlowp_offset_contribution_quantize_down_fixedpoint",
"gemmlowp.cl" },
339 {
"gemmlowp_output_stage_quantize_down",
"gemmlowp.cl" },
340 {
"gemmlowp_output_stage_quantize_down_fixedpoint",
"gemmlowp.cl" },
341 {
"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16",
"gemmlowp.cl" },
342 {
"gemmlowp_output_stage_quantize_down_float",
"gemmlowp.cl" },
343 {
"generate_proposals_compute_all_anchors",
"generate_proposals.cl" },
344 {
"generate_proposals_compute_all_anchors_quantized",
"generate_proposals_quantized.cl" },
345 {
"im2col1x1_stridex1_nchw",
"im2col.cl" },
346 {
"im2col3x3_nchw",
"im2col.cl" },
347 {
"im2col5x5_nchw",
"im2col.cl" },
348 {
"im2col11x11_padx0_pady0_nchw",
"im2col.cl" },
349 {
"im2col_generic_nchw",
"im2col.cl" },
350 {
"im2col_generic_padx0_pady0_nchw",
"im2col.cl" },
351 {
"im2col3x3_nhwc",
"im2col.cl" },
352 {
"im2col9x9_nhwc",
"im2col.cl" },
353 {
"im2col_generic_nhwc",
"im2col.cl" },
354 {
"instance_normalization",
"instance_normalization.cl" },
355 {
"compute_mean_var",
"instance_normalization.cl" },
356 {
"l2_normalize_x",
"l2_normalize.cl" },
357 {
"l2_normalize_y",
"l2_normalize.cl" },
358 {
"l2_normalize_z",
"l2_normalize.cl" },
359 {
"max_unpooling_layer_2",
"unpooling_layer.cl" },
360 {
"mean_stddev_normalization",
"mean_stddev_normalization.cl" },
361 {
"memset",
"memset.cl" },
362 {
"minmax_layer",
"minmax_layer.cl" },
363 {
"non_max_suppression",
"nonmax.cl" },
364 {
"normalization_layer_cross_map_nchw",
"normalization_layer.cl" },
365 {
"normalization_layer_cross_map_nhwc",
"normalization_layer.cl" },
366 {
"normalization_layer_in_map_nchw",
"normalization_layer.cl" },
367 {
"normalization_layer_in_map_nhwc",
"normalization_layer.cl" },
368 {
"normalize_planar_yuv_layer_nchw",
"normalize_planar_yuv_layer.cl" },
369 {
"normalize_planar_yuv_layer_nhwc",
"normalize_planar_yuv_layer.cl" },
370 {
"normalize_planar_yuv_layer_q8_nchw",
"normalize_planar_yuv_layer_quantized.cl" },
371 {
"normalize_planar_yuv_layer_q8_nhwc",
"normalize_planar_yuv_layer_quantized.cl" },
372 {
"pad_layer_constant",
"pad_layer.cl" },
373 {
"pad_layer_symmetric_reflect",
"pad_layer.cl" },
374 {
"permute",
"permute.cl" },
375 {
"pixelwise_mul_complex",
"pixelwise_mul_float.cl" },
376 {
"pixelwise_mul_float",
"pixelwise_mul_float.cl" },
377 {
"pixelwise_mul_int",
"pixelwise_mul_int.cl" },
378 {
"pixelwise_mul_quantized",
"pixelwise_mul_int.cl" },
379 {
"pooling_layer_2",
"pooling_layer.cl" },
380 {
"pooling_layer_3",
"pooling_layer.cl" },
381 {
"pooling_layer_optimized_3",
"pooling_layer.cl" },
382 {
"pooling_layer_7",
"pooling_layer.cl" },
383 {
"pooling_layer_MxN_nchw",
"pooling_layer.cl" },
384 {
"pooling_layer_MxN_nhwc",
"pooling_layer.cl" },
385 {
"pooling_layer_2x2_nhwc",
"pooling_layer.cl" },
386 {
"pooling_layer_2_nchw_indices_fp32",
"pooling_layer.cl" },
387 {
"pooling_layer_2_nchw_indices_fp16",
"pooling_layer.cl" },
388 {
"pooling_layer_MxN_quantized_nhwc",
"pooling_layer_quantized.cl" },
389 {
"pooling_layer_MxN_quantized_nchw",
"pooling_layer_quantized.cl" },
390 {
"prior_box_layer_nchw",
"prior_box_layer.cl" },
391 {
"qlstm_layer_normalization",
"qlstm_layer_normalization.cl" },
392 {
"quantization_layer",
"quantization_layer.cl" },
393 {
"range",
"range.cl" },
394 {
"range_quantized",
"range.cl" },
395 {
"reduction_operation_x",
"reduction_operation.cl" },
396 {
"reduction_operation_non_parallel_x",
"reduction_operation.cl" },
397 {
"reduction_operation_y",
"reduction_operation.cl" },
398 {
"reduction_operation_z",
"reduction_operation.cl" },
399 {
"reduction_operation_w",
"reduction_operation.cl" },
400 {
"remap_nearest_neighbour",
"remap.cl" },
401 {
"remap_bilinear",
"remap.cl" },
402 {
"reorg_layer_nchw",
"reorg_layer.cl" },
403 {
"reorg_layer_nhwc",
"reorg_layer.cl" },
404 {
"reshape_layer",
"reshape_layer.cl" },
405 {
"reshape_to_columns",
"convolution_layer.cl" },
406 {
"reverse",
"reverse.cl" },
407 {
"roi_align_layer",
"roi_align_layer.cl" },
408 {
"roi_align_layer_quantized",
"roi_align_layer_quantized.cl" },
409 {
"roi_pooling_layer",
"roi_pooling_layer.cl" },
410 {
"scale_nearest_neighbour_nchw",
"scale.cl" },
411 {
"scale_nearest_neighbour_nhwc",
"scale.cl" },
412 {
"scale_bilinear_nchw",
"scale.cl" },
413 {
"scale_bilinear_nhwc",
"scale.cl" },
414 {
"scale_bilinear_quantized_nchw",
"scale_quantized.cl" },
415 {
"scale_bilinear_quantized_nhwc",
"scale_quantized.cl" },
416 {
"select_same_rank",
"select.cl" },
417 {
"select_different_rank_2",
"select.cl" },
418 {
"select_different_rank_n",
"select.cl" },
419 {
"softmax_layer_norm",
"softmax_layer.cl" },
420 {
"softmax_layer_norm_quantized",
"softmax_layer_quantized.cl" },
421 {
"softmax_layer_max_shift_exp_sum_quantized_serial",
"softmax_layer_quantized.cl" },
422 {
"softmax_layer_max_shift_exp_sum_quantized_parallel",
"softmax_layer_quantized.cl" },
423 {
"softmax_layer_max_shift_exp_sum_serial",
"softmax_layer.cl" },
424 {
"space_to_batch_nchw",
"space_to_batch.cl" },
425 {
"space_to_batch_static_nchw",
"space_to_batch.cl" },
426 {
"space_to_batch_nhwc",
"space_to_batch.cl" },
427 {
"space_to_batch_static_nhwc",
"space_to_batch.cl" },
428 {
"space_to_depth_nchw",
"space_to_depth.cl" },
429 {
"space_to_depth_nhwc",
"space_to_depth.cl" },
430 {
"softmax_layer_max_shift_exp_sum_parallel",
"softmax_layer.cl" },
431 {
"stack_layer",
"stack_layer.cl" },
432 {
"strided_slice",
"slice_ops.cl" },
433 {
"tile",
"tile.cl" },
434 {
"transpose",
"transpose.cl" },
435 {
"upsample_layer_nchw",
"upsample_layer.cl" },
436 {
"upsample_layer_nhwc",
"upsample_layer.cl" },
437 {
"winograd_filter_transform_2x2_3x3_nchw",
"winograd_filter_transform.cl" },
438 {
"winograd_filter_transform_2x1_3x1_nchw",
"winograd_filter_transform.cl" },
439 {
"winograd_filter_transform_1x2_1x3_nchw",
"winograd_filter_transform.cl" },
440 {
"winograd_filter_transform_4x4_3x3_nchw",
"winograd_filter_transform.cl" },
441 {
"winograd_filter_transform_4x1_3x1_nchw",
"winograd_filter_transform.cl" },
442 {
"winograd_filter_transform_1x4_1x3_nchw",
"winograd_filter_transform.cl" },
443 {
"winograd_filter_transform_4x4_5x5_nchw",
"winograd_filter_transform.cl" },
444 {
"winograd_filter_transform_4x1_5x1_nchw",
"winograd_filter_transform.cl" },
445 {
"winograd_filter_transform_1x4_1x5_nchw",
"winograd_filter_transform.cl" },
446 {
"winograd_filter_transform_4x1_3x1_nhwc",
"winograd_filter_transform.cl" },
447 {
"winograd_filter_transform_1x4_1x3_nhwc",
"winograd_filter_transform.cl" },
448 {
"winograd_filter_transform_4x4_3x3_nhwc",
"winograd_filter_transform.cl" },
449 {
"winograd_filter_transform_4x4_5x5_nhwc",
"winograd_filter_transform.cl" },
450 {
"winograd_filter_transform_4x1_5x1_nhwc",
"winograd_filter_transform.cl" },
451 {
"winograd_filter_transform_1x4_1x5_nhwc",
"winograd_filter_transform.cl" },
452 {
"winograd_filter_transform_2x2_7x7_nhwc",
"winograd_filter_transform.cl" },
453 {
"winograd_filter_transform_2x1_7x1_nhwc",
"winograd_filter_transform.cl" },
454 {
"winograd_filter_transform_1x2_1x7_nhwc",
"winograd_filter_transform.cl" },
455 {
"winograd_input_transform_2x2_3x3_stepz1_nchw",
"winograd_input_transform.cl" },
456 {
"winograd_input_transform_2x2_3x3_stepz2_nchw",
"winograd_input_transform.cl" },
457 {
"winograd_input_transform_2x1_3x1_stepz1_nchw",
"winograd_input_transform.cl" },
458 {
"winograd_input_transform_2x1_3x1_stepz2_nchw",
"winograd_input_transform.cl" },
459 {
"winograd_input_transform_1x2_1x3_stepz1_nchw",
"winograd_input_transform.cl" },
460 {
"winograd_input_transform_1x2_1x3_stepz2_nchw",
"winograd_input_transform.cl" },
461 {
"winograd_input_transform_4x4_3x3_stepz1_nchw",
"winograd_input_transform.cl" },
462 {
"winograd_input_transform_4x1_3x1_stepz1_nchw",
"winograd_input_transform.cl" },
463 {
"winograd_input_transform_1x4_1x3_stepz1_nchw",
"winograd_input_transform.cl" },
464 {
"winograd_input_transform_4x4_5x5_stepz1_nchw",
"winograd_input_transform.cl" },
465 {
"winograd_input_transform_4x1_5x1_stepz1_nchw",
"winograd_input_transform.cl" },
466 {
"winograd_input_transform_1x4_1x5_stepz1_nchw",
"winograd_input_transform.cl" },
467 {
"winograd_input_transform_4x1_3x1_stepz1_nhwc",
"winograd_input_transform.cl" },
468 {
"winograd_input_transform_1x4_1x3_stepz1_nhwc",
"winograd_input_transform.cl" },
469 {
"winograd_input_transform_4x4_3x3_stepz1_nhwc",
"winograd_input_transform.cl" },
470 {
"winograd_input_transform_4x4_5x5_stepz1_nhwc",
"winograd_input_transform.cl" },
471 {
"winograd_input_transform_4x1_5x1_stepz1_nhwc",
"winograd_input_transform.cl" },
472 {
"winograd_input_transform_1x4_1x5_stepz1_nhwc",
"winograd_input_transform.cl" },
473 {
"winograd_input_transform_2x2_7x7_stepz1_nhwc",
"winograd_input_transform.cl" },
474 {
"winograd_input_transform_2x1_7x1_stepz1_nhwc",
"winograd_input_transform.cl" },
475 {
"winograd_input_transform_1x2_1x7_stepz1_nhwc",
"winograd_input_transform.cl" },
476 {
"winograd_output_transform_2x2_3x3_nchw",
"winograd_output_transform.cl" },
477 {
"winograd_output_transform_2x1_3x1_nchw",
"winograd_output_transform.cl" },
478 {
"winograd_output_transform_1x2_1x3_nchw",
"winograd_output_transform.cl" },
479 {
"winograd_output_transform_4x4_3x3_nchw",
"winograd_output_transform.cl" },
480 {
"winograd_output_transform_4x1_3x1_nchw",
"winograd_output_transform.cl" },
481 {
"winograd_output_transform_1x4_1x3_nchw",
"winograd_output_transform.cl" },
482 {
"winograd_output_transform_4x4_5x5_nchw",
"winograd_output_transform.cl" },
483 {
"winograd_output_transform_4x1_5x1_nchw",
"winograd_output_transform.cl" },
484 {
"winograd_output_transform_1x4_1x5_nchw",
"winograd_output_transform.cl" },
485 {
"winograd_output_transform_4x1_3x1_nhwc",
"winograd_output_transform.cl" },
486 {
"winograd_output_transform_1x4_1x3_nhwc",
"winograd_output_transform.cl" },
487 {
"winograd_output_transform_4x4_3x3_nhwc",
"winograd_output_transform.cl" },
488 {
"winograd_output_transform_4x4_5x5_nhwc",
"winograd_output_transform.cl" },
489 {
"winograd_output_transform_4x1_5x1_nhwc",
"winograd_output_transform.cl" },
490 {
"winograd_output_transform_1x4_1x5_nhwc",
"winograd_output_transform.cl" },
491 {
"winograd_output_transform_2x2_7x7_nhwc",
"winograd_output_transform.cl" },
492 {
"winograd_output_transform_2x1_7x1_nhwc",
"winograd_output_transform.cl" },
493 {
"winograd_output_transform_1x2_1x7_nhwc",
"winograd_output_transform.cl" },
496 const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
498 #ifdef EMBEDDED_KERNELS 500 "activation_layer.cl",
501 #include "./cl_kernels/activation_layer.clembed" 504 "activation_layer_quant.cl",
505 #include "./cl_kernels/activation_layer_quant.clembed" 509 #include "./cl_kernels/arg_min_max.clembed" 513 #include "./cl_kernels/batch_to_space.clembed" 517 #include "./cl_kernels/bitwise_op.clembed" 520 "bounding_box_transform.cl",
521 #include "./cl_kernels/bounding_box_transform.clembed" 524 "bounding_box_transform_quantized.cl",
525 #include "./cl_kernels/bounding_box_transform_quantized.clembed" 528 "channel_shuffle.cl",
529 #include "./cl_kernels/channel_shuffle.clembed" 533 #include "./cl_kernels/col2im.clembed" 537 #include "./cl_kernels/comparisons.clembed" 541 #include "./cl_kernels/concatenate.clembed" 544 "convert_fc_weights.cl",
545 #include "./cl_kernels/convert_fc_weights.clembed" 548 "convolution_layer.cl",
549 #include "./cl_kernels/convolution_layer.clembed" 553 #include "./cl_kernels/copy_tensor.clembed" 557 #include "./cl_kernels/crop_tensor.clembed" 561 #include "./cl_kernels/upsample_layer.clembed" 564 "deconvolution_layer.cl",
565 #include "./cl_kernels/deconvolution_layer.clembed" 569 #include "./cl_kernels/depth_convert.clembed" 573 #include "./cl_kernels/depth_to_space.clembed" 576 "depthwise_convolution.cl",
577 #include "./cl_kernels/depthwise_convolution.clembed" 580 "depthwise_convolution_quantized.cl",
581 #include "./cl_kernels/depthwise_convolution_quantized.clembed" 584 "dequantization_layer.cl",
585 #include "./cl_kernels/dequantization_layer.clembed" 588 "direct_convolution1x1.cl",
589 #include "./cl_kernels/direct_convolution1x1.clembed" 592 "direct_convolution3x3.cl",
593 #include "./cl_kernels/direct_convolution3x3.clembed" 596 "direct_convolution5x5.cl",
597 #include "./cl_kernels/direct_convolution5x5.clembed" 600 "direct_convolution_quantized.cl",
601 #include "./cl_kernels/direct_convolution_quantized.clembed" 604 "direct_convolution.cl",
605 #include "./cl_kernels/direct_convolution.clembed" 608 "elementwise_operation.cl",
609 #include "./cl_kernels/elementwise_operation.clembed" 612 "elementwise_operation_quantized.cl",
613 #include "./cl_kernels/elementwise_operation_quantized.clembed" 616 "elementwise_unary.cl",
617 #include "./cl_kernels/elementwise_unary.clembed" 621 #include "./cl_kernels/fft.clembed" 624 "fft_digit_reverse.cl",
625 #include "./cl_kernels/fft_digit_reverse.clembed" 629 #include "./cl_kernels/fft_scale.clembed" 633 #include "./cl_kernels/fill_border.clembed" 637 #include "./cl_kernels/floor.clembed" 641 #include "./cl_kernels/gather.clembed" 645 #include "./cl_kernels/gemm.clembed" 649 #include "./cl_kernels/gemm_v1.clembed" 653 #include "./cl_kernels/gemmlowp.clembed" 657 #include "./cl_kernels/gemv.clembed" 660 "generate_proposals.cl",
661 #include "./cl_kernels/generate_proposals.clembed" 664 "generate_proposals_quantized.cl",
665 #include "./cl_kernels/generate_proposals_quantized.clembed" 669 #include "./cl_kernels/helpers.hembed" 673 #include "./cl_kernels/helpers_asymm.hembed" 677 #include "./cl_kernels/im2col.clembed" 680 "instance_normalization.cl",
681 #include "./cl_kernels/instance_normalization.clembed" 685 #include "./cl_kernels/l2_normalize.clembed" 688 "mean_stddev_normalization.cl",
689 #include "./cl_kernels/mean_stddev_normalization.clembed" 693 #include "./cl_kernels/memset.clembed" 697 #include "./cl_kernels/minmax_layer.clembed" 701 #include "./cl_kernels/nonmax.clembed" 704 "normalization_layer.cl",
705 #include "./cl_kernels/normalization_layer.clembed" 708 "normalize_planar_yuv_layer.cl",
709 #include "./cl_kernels/normalize_planar_yuv_layer.clembed" 712 "normalize_planar_yuv_layer_quantized.cl",
713 #include "./cl_kernels/normalize_planar_yuv_layer_quantized.clembed" 716 "batchnormalization_layer.cl",
717 #include "./cl_kernels/batchnormalization_layer.clembed" 721 #include "./cl_kernels/pad_layer.clembed" 725 #include "./cl_kernels/permute.clembed" 728 "pixelwise_mul_float.cl",
729 #include "./cl_kernels/pixelwise_mul_float.clembed" 732 "pixelwise_mul_int.cl",
733 #include "./cl_kernels/pixelwise_mul_int.clembed" 737 #include "./cl_kernels/pooling_layer.clembed" 740 "pooling_layer_quantized.cl",
741 #include "./cl_kernels/pooling_layer_quantized.clembed" 744 "prior_box_layer.cl",
745 #include "./cl_kernels/prior_box_layer.clembed" 748 "qlstm_layer_normalization.cl",
749 #include "./cl_kernels/qlstm_layer_normalization.clembed" 752 "quantization_layer.cl",
753 #include "./cl_kernels/quantization_layer.clembed" 757 #include "./cl_kernels/range.clembed" 760 "reduction_operation.cl",
761 #include "./cl_kernels/reduction_operation.clembed" 765 #include "./cl_kernels/remap.clembed" 769 #include "./cl_kernels/reorg_layer.clembed" 773 #include "./cl_kernels/reshape_layer.clembed" 777 #include "./cl_kernels/reverse.clembed" 780 "roi_align_layer.cl",
781 #include "./cl_kernels/roi_align_layer.clembed" 784 "roi_align_layer_quantized.cl",
785 #include "./cl_kernels/roi_align_layer_quantized.clembed" 788 "roi_pooling_layer.cl",
789 #include "./cl_kernels/roi_pooling_layer.clembed" 793 #include "./cl_kernels/scale.clembed" 796 "scale_quantized.cl",
797 #include "./cl_kernels/scale_quantized.clembed" 801 #include "./cl_kernels/select.clembed" 805 #include "./cl_kernels/softmax_layer.clembed" 808 "softmax_layer_quantized.cl",
809 #include "./cl_kernels/softmax_layer_quantized.clembed" 813 #include "./cl_kernels/slice_ops.clembed" 817 #include "./cl_kernels/space_to_batch.clembed" 821 #include "./cl_kernels/space_to_depth.clembed" 825 #include "./cl_kernels/stack_layer.clembed" 829 #include "./cl_kernels/tile.clembed" 833 #include "./cl_kernels/transpose.clembed" 837 #include "./cl_kernels/types.hembed" 840 "unpooling_layer.cl",
841 #include "./cl_kernels/unpooling_layer.clembed" 844 "winograd_filter_transform.cl",
845 #include "./cl_kernels/winograd_filter_transform.clembed" 848 "winograd_input_transform.cl",
849 #include "./cl_kernels/winograd_input_transform.clembed" 852 "winograd_output_transform.cl",
853 #include "./cl_kernels/winograd_output_transform.clembed" 858 CLKernelLibrary::CLKernelLibrary()
859 : _compile_context(), _kernel_path(), _decompressed_source_map()
867 return _kernel_library;
875 return _compile_context.
create_kernel(
kernel_name, program_name, program.first, _kernel_path, build_options_set, program.second);
881 auto kernel_program_it = _kernel_program_map.find(
kernel_name);
883 if(_kernel_program_map.end() == kernel_program_it)
888 const std::string program_name = kernel_program_it->second;
896 _kernel_path = kernel_path +
"/";
901 _kernel_path = std::move(kernel_path);
907 return _compile_context.
context();
962 #ifdef EMBEDDED_KERNELS 963 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS 964 const auto inflatted_program_source_it = _decompressed_source_map.find(program_name);
965 if(inflatted_program_source_it != _decompressed_source_map.end())
967 return std::make_pair(inflatted_program_source_it->second,
false);
971 const auto program_source_it = _program_source_map.find(program_name);
972 if(program_source_it == _program_source_map.end())
976 std::string program_source = program_source_it->second;
978 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS 979 std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second));
981 _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source));
982 program_source = std::move(decompressed_program_source);
985 return std::make_pair(program_source,
false);
988 std::string source_name = _kernel_path + program_name;
989 std::string binary_name = source_name +
"bin";
990 std::string program_source{};
991 bool is_binary =
false;
993 if(std::ifstream(binary_name).is_open())
995 program_source =
read_file(binary_name,
true);
998 else if(std::ifstream(source_name).is_open())
1000 program_source =
read_file(source_name,
false);
1007 return std::make_pair(program_source, is_binary);
1033 return _compile_context;
void set_kernel_path(const std::string &kernel_path)
Sets the path that the kernels reside in.
void set_device(cl::Device device)
Sets the CL device for which the programs are created.
const cl::Device & get_device() const
Gets the CL device for which the programs are created.
cl::Context & context()
Accessor for the associated CL context.
bool is_wbsm_supported() const
void add_built_program(const std::string &built_program_name, const cl::Program &program)
Add a new built program to the cache.
std::string get_device_version()
Return the device version.
#define ARM_COMPUTE_ERROR_VAR(msg,...)
Print the given message then throw an std::runtime_error.
std::pair< std::string, bool > get_program(const std::string &program_name) const
Gets the source of the selected program.
CLCompileContext & get_compile_context()
Gets the compile context used.
void set_context(cl::Context context)
Sets the CL context used to create programs.
std::string get_device_version() const
Return the device version.
Copyright (c) 2017-2021 Arm Limited.
cl_uint get_num_compute_units()
Return the maximum number of compute units in the device.
size_t max_local_workgroup_size(const cl::Kernel &kernel) const
Find the maximum number of local work items in a workgroup can be supported for the kernel.
cl::NDRange default_ndrange() const
Return the default NDRange for the device.
void clear_programs_cache()
Clear the library's cache of binary programs.
std::string read_file(const std::string &filename, bool binary)
Load an entire file in memory.
void set_device(cl::Device device)
Sets the CL device for which the programs are created.
cl_uint get_num_compute_units() const
Return the maximum number of compute units in the device.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Kernel create_kernel(const std::string &kernel_name, const std::set< std::string > &build_options_set={}) const
Creates a kernel from the kernel library.
cl::NDRange default_ndrange() const
Return the default NDRange for the device.
void end(TokenStream &in, bool &valid)
void init(std::string kernel_path, cl::Context context, cl::Device device)
Initialises the kernel library.
Kernel create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const
Creates an OpenCL kernel.
std::string get_program_name(const std::string &kernel_name) const
Returns the program name given a kernel name.
void clear_programs_cache()
Clear the library's cache of binary programs.
bool int64_base_atomics_supported() const
Returns true if int64_base_atomics extension is supported by the CL device.
bool fp16_supported() const
Returns true if FP16 is supported by the CL device.
bool int64_base_atomics_supported() const
Returns true if int64_base_atomics extension is supported by the CL device.
const std::map< std::string, cl::Program > & get_built_programs() const
Access the cache of built OpenCL programs.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
bool fp16_supported() const
Returns true if FP16 is supported by the CL device.
size_t max_local_workgroup_size(const cl::Kernel &kernel) const
Find the maximum number of local work items in a workgroup can be supported for the kernel.
std::string get_kernel_path()
Gets the path that the kernels reside in.
void add_built_program(const std::string &built_program_name, const cl::Program &program) const
Add a new built program to the cache.
void set_context(cl::Context context)
Sets the CL context used to create programs.
const std::map< std::string, cl::Program > & get_built_programs() const
Access the cache of built OpenCL programs.
const cl::Device & get_device()
Gets the CL device for which the programs are created.
bool opencl_is_available()
Check if OpenCL is available.
cl::Context & context()
Accessor for the associated CL context.