// Fetch the device's CL_DEVICE_NAME string through the getInfo query.
// NOTE(review): the enclosing function and the use of device_name are outside this excerpt.
226 std::string device_name = device.getInfo<CL_DEVICE_NAME>();
// Fetch the device's CL_DEVICE_NAME string.
// NOTE(review): presumably used to derive gpu_target in lines not shown here - confirm.
243 std::string device_name = device.getInfo<CL_DEVICE_NAME>();
// The result is true when either condition holds:
//  - the device advertises the "cl_arm_integer_dot_product_int8" extension, or
//  - gpu_target has an entry in sw_workaround_issue.
248 return (
device_supports_extension(device,
"cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
// Read the device's CL_DEVICE_VERSION string and classify it by substring search.
// Candidates are tested in this order: "OpenCL 3", "OpenCL 2", "OpenCL 1.2",
// "OpenCL 1.1", "OpenCL 1.0". Because this is an if / else-if chain, the first
// substring found selects its branch and later candidates are not examined.
// NOTE(review): the branch bodies (what each match yields) are not visible in this excerpt.
258 std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
259 if(version_str.find(
"OpenCL 3") != std::string::npos)
263 else if(version_str.find(
"OpenCL 2") != std::string::npos)
267 else if(version_str.find(
"OpenCL 1.2") != std::string::npos)
271 else if(version_str.find(
"OpenCL 1.1") != std::string::npos)
275 else if(version_str.find(
"OpenCL 1.0") != std::string::npos)
// Read the device's CL_DEVICE_EXTENSIONS string.
285 std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
// Look for extension_name anywhere inside that string.
// NOTE(review): this is a plain substring search, so a name that is a prefix of a
// longer extension name would also be reported as present.
286 auto pos = extensions.find(extension_name);
// True when the substring was found.
287 return (pos != std::string::npos);
// A Winograd configuration is a pair of (int, int) pairs. Judging by the lookup key
// built further down, the first pair is the output tile (width, height) and the second
// pair is the kernel size (width, height).
294 using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
// Table of configurations searched by the first lookup below.
// NOTE(review): presumably the combinations accepted for the NCHW data layout - confirm.
296 std::vector<WinogradConfiguration> winograd_configs_nchw =
298 WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
299 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
300 WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)),
301 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
302 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
303 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
304 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
305 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
306 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
// Table of configurations searched by the second lookup below.
// NOTE(review): presumably the combinations accepted for the NHWC data layout - confirm.
309 std::vector<WinogradConfiguration> winograd_configs_nhwc =
311 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
312 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
313 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
314 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
315 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
316 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
317 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5)),
318 WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7)),
319 WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1)),
320 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)),
// Build the lookup key from the requested output tile and kernel size, in the same
// ((tile width, tile height), (kernel width, kernel height)) layout as the table entries.
323 auto p = std::make_pair(std::pair<int, int>(output_tile.
width, output_tile.
height),
324 std::pair<int, int>(kernel_size.
width, kernel_size.
height));
// True when the key is one of the entries of winograd_configs_nchw.
// NOTE(review): the condition that selects this return versus the next one is not
// visible in this excerpt.
329 return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end());
// True when the key is one of the entries of winograd_configs_nhwc.
333 return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
// Each return below forwards one of the device's CL_DEVICE_PREFERRED_VECTOR_WIDTH_* queries.
// NOTE(review): the conditions that select each return are not visible in this excerpt;
// presumably they dispatch on the element type - confirm.
// Preferred vector width for char elements.
347 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR>();
// Preferred vector width for short elements.
352 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT>();
// Preferred vector width for int elements.
355 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT>();
// Preferred vector width for float elements.
358 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>();
// Preferred vector width for long elements.
361 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG>();
// Holds the query result; starts at 0 so it has a defined value before the query runs.
381 cl_uint pixel_aligment = 0;
// Ask the device for CL_DEVICE_IMAGE_PITCH_ALIGNMENT through the C API. The returned-size
// output parameter is not needed, hence the trailing nullptr.
383 cl_int err =
clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
sizeof(cl_uint), &pixel_aligment,
nullptr);
// Only a successful query hands the queried value back to the caller.
// NOTE(review): what is returned on failure is not visible in this excerpt.
385 if(err == CL_SUCCESS)
387 return pixel_aligment;
// Defaults to CL_FALSE before the query runs.
397 cl_bool supported = CL_FALSE;
// Ask the device for CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT through the C API.
399 cl_int err =
clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
sizeof(cl_bool), &supported,
nullptr);
// True only when the query itself succeeded and the device answered CL_TRUE.
401 return (err == CL_SUCCESS && supported == CL_TRUE);
// Look up the program registered under program_name in klib. The result exposes
// .program and .is_binary, both of which are used below.
409 auto kernel_src = klib.
program(program_name);
// Kernel path reported by klib, forwarded to the build step.
410 const std::string kernel_path = klib.
kernel_path();
// Ask ctx to create the kernel from the gathered pieces and hand the result back
// converted to cl::Kernel.
412 return static_cast<cl::Kernel
>(ctx.
create_kernel(
kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary));
// Elements left over after splitting input_dimension into vector_size-wide chunks.
417 const unsigned int width_leftover = input_dimension % vector_size;
// Padding needed to round input_dimension up to the next multiple of vector_size
// (0 when it already is a multiple).
418 const unsigned int border_width = (width_leftover != 0) ? vector_size - width_leftover : 0;
// Padded dimension divided (integer division) by a fixed factor of 16.
// NOTE(review): the divisor is hard-coded to 16 rather than vector_size - confirm this is intended.
419 const unsigned int num_of_threads = ((input_dimension + border_width) / 16);
// Single-argument NDRange holding num_of_threads, capped at 8.
420 return cl::NDRange(std::min(8
U, num_of_threads));
// Capability bits reported by the device; stays 0 unless the query fills it in.
425 cl_bitfield capabilities = 0;
// Query CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM through the C API.
426 cl_int err =
clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM,
sizeof(cl_bitfield), &capabilities,
nullptr);
// Proceed only if the query succeeded and the capability mask has the
// CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM bit set.
// NOTE(review): the body of this branch is not visible in this excerpt.
427 if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
// Takes a kernel by reference and an integer hint (wbsm_hint).
// NOTE(review): only one argument line of the body is visible here; presumably the hint is
// applied to the kernel via the CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM
// execution-info parameter - confirm against the full body.
434 void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
437 CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
// Branch taken when dimension 0 of the tensor shape is not a multiple of 4.
// NOTE(review): the branch body is not visible in this excerpt.
446 if(
tensor->tensor_shape()[0] % 4 != 0)
// Image width: dimension 0 of the tensor shape divided by 4.
469 const size_t image_w =
tensor->tensor_shape()[0] / 4;
// Image height: total element count divided by dimension 0, i.e. the product of all
// remaining dimensions.
470 const size_t image_h =
tensor->tensor_shape().total_size() /
tensor->tensor_shape()[0];
// Branch taken when either computed dimension exceeds its limit (max_image_w / max_image_h).
// NOTE(review): where the limits come from and what the branch does are not visible here.
474 if(image_w > max_image_w || image_h > max_image_h)
// Walk every entry of values.
484 for(
const int value : values)
// An entry above max_manual_loop_unrolling triggers the option below.
486 if(value > max_manual_loop_unrolling)
// Add the -DUNROLL_WITH_PRAGMA define to the build options.
// NOTE(review): whether the loop stops after the first hit is not visible in this excerpt.
488 built_opts.
add_option(
"-DUNROLL_WITH_PRAGMA");