109 switch (element_size)
127 switch (element_size)
227 std::string device_name = device.getInfo<CL_DEVICE_NAME>();
244 std::string device_name = device.getInfo<CL_DEVICE_NAME>();
250 sw_workaround_issue.count(gpu_target) != 0);
260 std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
261 if (version_str.find(
"OpenCL 3") != std::string::npos)
265 else if (version_str.find(
"OpenCL 2") != std::string::npos)
269 else if (version_str.find(
"OpenCL 1.2") != std::string::npos)
273 else if (version_str.find(
"OpenCL 1.1") != std::string::npos)
277 else if (version_str.find(
"OpenCL 1.0") != std::string::npos)
287 std::string
extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
289 return (pos != std::string::npos);
293 const Size2D &kernel_size,
298 using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
300 std::vector<WinogradConfiguration> winograd_configs_nchw = {
301 WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
302 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
303 WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)),
304 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
305 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
306 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
307 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
308 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
309 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))};
311 std::vector<WinogradConfiguration> winograd_configs_nhwc = {
312 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
313 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
314 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
315 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
316 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
317 WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
318 WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5)),
319 WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7)),
320 WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1)),
321 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)),
324 auto p = std::make_pair(std::pair<int, int>(output_tile.
width, output_tile.
height),
325 std::pair<int, int>(kernel_size.
width, kernel_size.
height));
330 return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) !=
331 winograd_configs_nchw.end());
335 return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) !=
336 winograd_configs_nhwc.end());
350 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR>();
355 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT>();
358 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT>();
361 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>();
364 return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG>();
384 cl_uint pixel_aligment = 0;
386 cl_int err =
clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT,
sizeof(cl_uint), &pixel_aligment,
nullptr);
388 if (err == CL_SUCCESS)
390 return pixel_aligment;
400 cl_bool supported = CL_FALSE;
403 clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
sizeof(cl_bool), &supported,
nullptr);
405 return (err == CL_SUCCESS && supported == CL_TRUE);
414 auto kernel_src = klib.
program(program_name);
415 const std::string kernel_path = klib.
kernel_path();
418 build_opts, kernel_src.is_binary));
423 const unsigned int width_leftover = input_dimension %
vector_size;
424 const unsigned int border_width = (width_leftover != 0) ?
vector_size - width_leftover : 0;
425 const unsigned int num_of_threads = ((input_dimension + border_width) / 16);
426 return cl::NDRange(std::min(8
U, num_of_threads));
431 cl_bitfield capabilities = 0;
432 cl_int err =
clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM,
sizeof(cl_bitfield),
433 &capabilities,
nullptr);
434 if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
441 void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
443 cl_int err =
clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
444 sizeof(cl_int), &wbsm_hint);
451 if (
tensor->tensor_shape()[0] % 4 != 0)
474 const size_t image_w =
tensor->tensor_shape()[0] / 4;
475 const size_t image_h =
tensor->tensor_shape().total_size() /
tensor->tensor_shape()[0];
479 if (image_w > max_image_w || image_h > max_image_h)
489 for (
const int value : values)
491 if (value > max_manual_loop_unrolling)
493 built_opts.
add_option(
"-DUNROLL_WITH_PRAGMA");