34 #if !defined(BARE_METAL)
44 #include <unordered_map>
48 #if !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
49 #include <asm/hwcap.h>
51 #elif defined(__APPLE__) && defined(__aarch64__)
52 #include <sys/sysctl.h>
53 #include <sys/types.h>
57 #define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11)
58 #define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg : "=r"(var))
65 #if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
66 (defined(__arm__) || defined(__aarch64__))
// Collects MIDR values by reading, for each CPU index i in [0, max_num_cpus), the sysfs file
// /sys/devices/system/cpu/cpu<i>/regs/identification/midr_el1.
// NOTE(review): this listing shows only part of the function; how the line that was read is
// parsed and how results reach `cpus` is not visible here - confirm against the full source.
73 std::vector<uint32_t> midr_from_cpuid(uint32_t max_num_cpus)
75 std::vector<uint32_t> cpus;
76 for (
unsigned int i = 0; i < max_num_cpus; ++i)
// Assemble the per-CPU sysfs path through a string stream.
78 std::stringstream
str;
79 str <<
"/sys/devices/system/cpu/cpu" << i <<
"/regs/identification/midr_el1";
// Open the register file for reading.
80 std::ifstream file(
str.str(), std::ios::in);
// Continue only if one line could be read from the file.
84 if (
bool(getline(file, line)))
// Builds a list of MIDR-style values by scanning /proc/cpuinfo line by line with POSIX extended
// regular expressions (processor, CPU implementer, CPU variant, CPU part, CPU revision).
// NOTE(review): max_num_cpus is a signed int here, while the visible caller passes a uint32_t
// value - confirm the implicit conversion is intended.
// NOTE(review): several declarations and branches of this function are not visible in this
// listing; the comments below describe only the lines that are shown.
99 std::vector<uint32_t> midr_from_proc_cpuinfo(
int max_num_cpus)
101 std::vector<uint32_t> cpus;
// Zero every regex_t object before it is handed to regcomp.
109 memset(&proc_regex, 0,
sizeof(regex_t));
110 memset(&imp_regex, 0,
sizeof(regex_t));
111 memset(&var_regex, 0,
sizeof(regex_t));
112 memset(&part_regex, 0,
sizeof(regex_t));
113 memset(&rev_regex, 0,
sizeof(regex_t));
// Compile the five patterns; the return codes are OR-ed together so that ret_status ends up
// non-zero if any compilation reported an error.
// NOTE(review): in the processor and revision patterns the greedy .* precedes the digit group,
// so the capture presumably holds only the last digit of a multi-digit number - verify.
117 ret_status |= regcomp(&proc_regex, R
"(^processor.*([[:digit:]]+)$)", REG_EXTENDED);
118 ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED);
119 ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED);
120 ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED);
121 ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED);
125 std::ifstream file(
"/proc/cpuinfo", std::ios::in);
// Process the file one line at a time until getline fails.
132 while (
bool(getline(file, line)))
// match[0] receives the whole match, match[1] the single capture group.
134 std::array<regmatch_t, 2> match;
135 ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0);
// Text of capture group 1 on a processor line.
138 std::string
id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
// A CPU was already being tracked but no MIDR bits were gathered for it.
141 if (curcpu >= 0 && midr == 0)
// Store the accumulated value only while the tracked CPU index is within the expected range.
147 if (curcpu >= 0 && curcpu < max_num_cpus)
149 cpus.emplace_back(midr);
154 "Trying to populate a core id with id greater than the expected number of cores!");
// Implementer: two characters after 0x, merged into midr starting at bit 24.
163 ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0);
166 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
168 midr |= (impv << 24);
// Variant: one character after 0x, merged into midr starting at bit 20.
173 ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0);
176 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
178 midr |= (varv << 20);
// Part: three characters after 0x, merged into midr starting at bit 4.
183 ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0);
186 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
188 midr |= (partv << 4);
// Revision: the captured text is extracted here; how it is merged into midr is not visible.
193 ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0);
196 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
// Same range check and store as above, applied to the value still pending at this point
// (presumably after the read loop has ended - confirm).
205 if (curcpu >= 0 && curcpu < max_num_cpus)
207 cpus.emplace_back(midr);
212 "Trying to populate a core id with id greater than the expected number of cores!");
// Release compiled patterns (only these two regfree calls are visible in this listing).
217 regfree(&proc_regex);
220 regfree(&part_regex);
// Determines a CPU count from /sys/devices/system/cpu/present, with
// std::thread::hardware_concurrency() as an alternative source.
// NOTE(review): the enclosing function header is not visible in this listing (presumably the
// helper that the visible code calls as get_max_cpus()); the numeric conversion of the trimmed
// line and the exact condition guarding the fallback are not shown either.
233 std::ifstream CPUspresent;
234 CPUspresent.open(
"/sys/devices/system/cpu/present", std::ios::in);
235 bool success =
false;
237 if (CPUspresent.is_open())
// Only the first line of the file is read.
241 if (
bool(getline(CPUspresent, line)))
// Walk the characters of the line, looking for the range and list separators - and ,
// startfrom begins at the start of the line (its updates are not visible here).
248 auto startfrom = line.begin();
250 for (
auto i = line.begin(); i < line.end(); ++i)
252 if (*i ==
'-' || *i ==
',')
// Drop everything before startfrom so that only the tail of the line remains.
258 line.erase(line.begin(), startfrom);
// Alternative source for the count: the value reported by the C++ runtime.
268 max_cpus = std::thread::hardware_concurrency();
272 #elif defined(__aarch64__) && \
// Queries the sysctl entry named by cap through sysctlbyname and stores the value in result.
// The trailing NULL, 0 arguments mean no new value is written to the entry.
// NOTE(review): the return value of sysctlbyname is not checked on the visible line, and the
// declaration of result plus the return statement are not visible in this listing.
278 int get_hw_capability(
const std::string &cap)
// size tells sysctlbyname how many bytes the result buffer can hold.
281 size_t size =
sizeof(result);
282 sysctlbyname(cap.c_str(), &result, &size, NULL, 0);
287 #if defined(BARE_METAL) && defined(__aarch64__)
// Reads a 64-bit feature register through inline assembly.
// The instruction is emitted as a raw .inst word; per the annotation inside the asm string it
// encodes mrs x3, ID_AA64ZFR0_EL1 (presumably spelled as a raw encoding so that assemblers
// without knowledge of that register name still accept it - confirm).
// NOTE(review): the rest of the asm statement and the return are not visible in this listing.
288 uint64_t get_sve_feature_reg()
291 __asm __volatile(
".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n"
// Platform-specific gathering of CPU capability data, selected at compile time.
// NOTE(review): the enclosing function header and several statements are not visible in this
// listing; the comments below describe only the lines that are shown.
//
// Branch 1: Arm targets that are not Windows 64-bit, bare metal, Apple or OpenBSD.
307 #if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
308 (defined(__arm__) || defined(__aarch64__))
// Hardware capability words come from the auxiliary vector.
// NOTE(review): getauxval returns unsigned long; storing it in uint32_t keeps only the low
// 32 bits - confirm no higher capability bits are needed.
309 const uint32_t hwcaps = getauxval(AT_HWCAP);
310 const uint32_t hwcaps2 = getauxval(AT_HWCAP2);
311 const uint32_t max_cpus = get_max_cpus();
// MIDR discovery uses up to three sources in order: the sysfs reader (the condition guarding
// this first call is not visible), then /proc/cpuinfo if the list is still empty, and finally
// max_cpus zero entries if both produced nothing.
314 std::vector<uint32_t> cpus_midr;
317 cpus_midr = midr_from_cpuid(max_cpus);
319 if (cpus_midr.empty())
321 cpus_midr = midr_from_proc_cpuinfo(max_cpus);
323 if (cpus_midr.empty())
325 cpus_midr.resize(max_cpus, 0);
// Convert every MIDR value into a CpuModel entry, appended to cpus_model (the mapping callable
// passed to std::transform is not visible here).
332 std::vector<CpuModel> cpus_model;
333 std::transform(std::begin(cpus_midr),
std::end(cpus_midr), std::back_inserter(cpus_model),
// Branch 2: bare-metal builds. Feature register values are held in local 64-bit variables;
// how they are filled in is not visible in this listing.
339 #elif (BARE_METAL) && \
344 uint64_t isar0 = 0, isar1 = 0, pfr0 = 0, pfr1 = 0, svefr0 = 0, midr = 0;
// The SVE feature register is read only when bits [35:32] of pfr0 are non-zero
// (presumably the SVE field of the processor feature register - confirm).
350 if ((pfr0 >> 32) & 0xf)
352 svefr0 = get_sve_feature_reg();
// Branch 3: Apple AArch64. The CPU count and ISA flags are queried by sysctl name.
359 #elif defined(__aarch64__) && defined(__APPLE__)
360 int ncpus = get_hw_capability(
"hw.perflevel0.logicalcpu");
// One default-constructed CpuModel per reported CPU.
362 std::vector<CpuModel> cpus_model(ncpus);
363 isainfo.
neon = get_hw_capability(
"hw.optional.neon");
364 isainfo.
fp16 = get_hw_capability(
"hw.optional.neon_fp16");
365 isainfo.
dot = get_hw_capability(
"hw.optional.arm.FEAT_DotProd");
// Bounds check of cpuid against the number of entries in _cpus.
// NOTE(review): the enclosing function header and the body of this branch are not visible in
// this listing; CpuModel::GENERIC is presumably the fallback for an out-of-range cpuid - confirm.
376 if (cpuid < _cpus.size())
380 return CpuModel::GENERIC;
385 #if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || \
386 (!defined(__arm__) && !defined(__aarch64__))
// Computes num_threads_hint from the CPU part entries found in /proc/cpuinfo. This path is
// compiled only when none of BARE_METAL, _WIN64 and ARM_COMPUTE_DISABLE_THREADS_HINT is defined.
// NOTE(review): the enclosing function header and some statements are not visible in this
// listing; the comments below describe only the lines that are shown.
402 #if !defined(BARE_METAL) && !defined(_WIN64) && !defined(ARM_COMPUTE_DISABLE_THREADS_HINT)
// One string per matching CPU part line.
403 std::vector<std::string> cpus;
// Compile a POSIX extended regex whose single capture group is the alphanumeric token that
// follows the colon on a CPU part line.
// NOTE(review): the pattern escapes the colon with a backslash, which POSIX ERE does not
// define for ordinary characters - confirm it behaves as intended on all target C libraries.
407 regex_t cpu_part_rgx;
408 memset(&cpu_part_rgx, 0,
sizeof(regex_t));
409 int ret_status = regcomp(&cpu_part_rgx, R
"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED);
414 std::ifstream cpuinfo_file(
"/proc/cpuinfo", std::ios::in);
415 if (cpuinfo_file.is_open())
// Scan every line; regexec returns 0 on a match, in which case capture group 1 is stored.
418 while (
bool(getline(cpuinfo_file, line)))
420 std::array<regmatch_t, 2> match;
421 if (regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0)
423 cpus.emplace_back(line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)));
427 regfree(&cpu_part_rgx);
430 std::sort(std::begin(cpus),
std::end(cpus));
// Helper lambda: uses a map from part string to count and yields the smallest count
// (the loop body that fills the map is not visible in this listing).
431 auto least_frequent_cpu_occurences = [](
const std::vector<std::string> &cpus) -> uint32_t
433 std::unordered_map<std::string, uint32_t> cpus_freq;
434 for (
const auto &cpu : cpus)
// Start above any possible count so that the first real count always replaces it.
439 uint32_t
vmin = cpus.size() + 1;
440 for (
const auto &cpu_freq : cpus_freq)
442 vmin = std::min(
vmin, cpu_freq.second);
// With no CPU part entries the hint is the hardware concurrency reported by the runtime;
// otherwise it is the occurrence count of the least frequent CPU part.
448 num_threads_hint = cpus.empty() ? std::thread::hardware_concurrency() : least_frequent_cpu_occurences(cpus);