/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? "sockets" : "socket");
  case KMP_HW_DIE:
    return ((plural) ? "dice" : "die");
  case KMP_HW_MODULE:
    return ((plural) ? "modules" : "module");
  case KMP_HW_TILE:
    return ((plural) ? "tiles" : "tile");
  case KMP_HW_NUMA:
    return ((plural) ? "numa_domains" : "numa_domain");
  case KMP_HW_L3:
    return ((plural) ? "l3_caches" : "l3_cache");
  case KMP_HW_L2:
    return ((plural) ? "l2_caches" : "l2_cache");
  case KMP_HW_L1:
    return ((plural) ? "l1_caches" : "l1_cache");
  case KMP_HW_LLC:
    return ((plural) ? "ll_caches" : "ll_cache");
  case KMP_HW_CORE:
    return ((plural) ? "cores" : "core");
  case KMP_HW_THREAD:
    return ((plural) ? "threads" : "thread");
  case KMP_HW_PROC_GROUP:
    return ((plural) ? "proc_groups" : "proc_group");
  }
  return ((plural) ? "unknowns" : "unknown");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  for (int level = 0; level < depth; ++level) {
    if (ahwthread->ids[level] < bhwthread->ids[level])
      return -1;
    else if (ahwthread->ids[level] > bhwthread->ids[level])
      return 1;
  }
  if (ahwthread->os_id < bhwthread->os_id)
    return -1;
  else if (ahwthread->os_id > bhwthread->os_id)
    return 1;
  return 0;
}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
#endif
void kmp_hw_thread_t::print() const {
  int depth = __kmp_topology->get_depth();
  printf("%4d ", os_id);
  for (int i = 0; i < depth; ++i) {
    printf("%4d ", ids[i]);
  }
  printf("\n");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same.
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}

void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}

// Gather the count of each topology layer and the ratio
void kmp_topology_t::_gather_enumeration_information() {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.ids[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.ids[layer];
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}
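
// Illustrative note (assumption, not from the original source): after the
// gathering pass above, count[l] holds the total number of objects detected at
// level l and ratio[l] holds the maximum number of children per parent at that
// level. For a uniform 2-socket x 4-core x 2-thread machine with
// types = {socket, core, thread}:
//   count = {2, 8, 16}
//   ratio = {2, 4, 2}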

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  flags.uniform = (num == count[depth - 1]);
}

// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {
  int previous_id[KMP_HW_LAST];
  int sub_id[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    sub_id[i] = -1;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Setup the sub_id
    for (int j = 0; j < depth; ++j) {
      if (hw_thread.ids[j] != previous_id[j]) {
        sub_id[j]++;
        for (int k = j + 1; k < depth; ++k) {
          sub_id[k] = 0;
        }
        break;
      }
    }
    // Set previous_id
    for (int j = 0; j < depth; ++j) {
      previous_id[j] = hw_thread.ids[j];
    }
    // Set the sub_ids field
    for (int j = 0; j < depth; ++j) {
      hw_thread.sub_ids[j] = sub_id[j];
    }
  }
}

void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * ndepth * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + ndepth;
  retval->count = arr + 2 * ndepth;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}

bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}

void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}
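
// Illustrative note (assumption, not from the original source): for a uniform
// 2-socket x 8-core x 2-thread machine, the "quick topology" buffer built
// above ends up looking roughly like
//   "2 Sockets x 8 Cores/Socket x 2 Threads/Core"
// and is emitted through KMP_INFORM(TopologyGeneric, ...) together with the
// total core count (16 in this case).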

void kmp_topology_t::canonicalize() {
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

#if KMP_MIC_SUPPORTED
  // Manually Add L2 = Tile equivalence
  if (__kmp_mic_type == mic3) {
    if (get_level(KMP_HW_L2) != -1)
      set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
    else if (get_level(KMP_HW_TILE) != -1)
      set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
  }
#endif

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}

// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Apply the KMP_HW_SUBSET envirable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if layers are in order
    if (i + 1 < hw_subset_depth) {
      kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
      if (next_type == KMP_HW_UNKNOWN) {
        KMP_WARNING(
            AffHWSubsetNotExistGeneric,
            __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
        return false;
      }
      int next_topology_level = get_level(next_type);
      if (level > next_topology_level) {
        KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
                    __kmp_hw_get_catalog_string(next_type));
        return false;
      }
    }

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}
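
// Illustrative note (assumption, not from the original source): on a
// 2-socket x 8-core x 2-thread machine, KMP_HW_SUBSET=1s,4c,1t keeps only the
// hardware threads whose socket sub-id is 0, whose core sub-id is in [0, 4),
// and whose thread sub-id is 0 (4 of the 32 OS procs); the filtered procs are
// cleared from __kmp_affin_fullMask and __kmp_avail_proc is reduced to match.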

bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
  if (hw_level >= depth)
    return true;
  bool retval = true;
  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
  for (int i = 0; i < (depth - hw_level); ++i) {
    if (t1.ids[i] != t2.ids[i])
      return false;
  }
  return retval;
}

////////////////////////////////////////////////////////////////////////////////

#if KMP_AFFINITY_SUPPORTED
class kmp_affinity_raii_t {
  kmp_affin_mask_t *mask;
  bool restored;

public:
  kmp_affinity_raii_t() : restored(false) {
    KMP_CPU_ALLOC(mask);
    KMP_ASSERT(mask != NULL);
    __kmp_get_system_affinity(mask, TRUE);
  }
  void restore() {
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE(mask);
    restored = true;
  }
  ~kmp_affinity_raii_t() {
    if (!restored) {
      __kmp_set_system_affinity(mask, TRUE);
      KMP_CPU_FREE(mask);
    }
  }
};

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN
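
// Illustrative note (assumption, not from the original source): a mask with
// bits {0, 1, 2, 5, 7, 8, 9} set is rendered by the routine above as
// "0-2,5,7-9", and a mask with no bits set is rendered as "{<empty>}".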

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it. Each routine
// returns a boolean on whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // Store sub_id + 1 so that 0 is distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity_verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
  }

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    return true;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
    avail_ct++;
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }
  return true;
}

#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
  }
  return true;
}
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}
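
// Illustrative note (assumption, not from the original source): the template
// above keeps the inclusive bit range [LSB, MSB] of v and shifts it down to
// bit 0, e.g. __kmp_extract_bits<0, 4>(0x78) == 0x18 and
// __kmp_extract_bits<8, 15>(0x1234) == 0x12.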

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

class kmp_cache_info_t {
public:
  struct info_t {
    unsigned level, mask;
  };
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth; }
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    switch (level) {
    case 1:
      return KMP_HW_L1;
    case 2:
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

private:
  static const int MAX_CACHE_LEVEL = 3;

  size_t depth;
  info_t table[MAX_CACHE_LEVEL];

  void get_leaf4_levels() {
    unsigned level = 0;
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      kmp_cpuid buf2;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      if (!cache_type)
        break;
      // Skip instruction caches
      if (cache_type == 2) {
        level++;
        continue;
      }
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      table[depth].mask = ((-1) << cache_mask_width);
      depth++;
      level++;
    }
  }
};
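
// Illustrative note (assumption, not from the original source): for a data
// cache at level 1 shared by 2 logical processors, leaf 4 reports
// max_threads_sharing == 2, so cache_mask_width == 1 and the stored mask is
// ~0x1; APIC ids that agree in all bits above bit 0 share that cache.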

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
  }

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return false;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    return true;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return false;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return false;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }
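
  // Illustrative note (assumption, not from the original source): with
  // maxThreadsPerPkg == 16 and maxCoresPerPkg == 8, widthCT == 4, widthC == 3
  // and widthT == 1, so an apicId of 0x2d (binary 101101) decodes to
  // pkgId == 2, coreId == 6, threadId == 1 in the loop above.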

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  previous_affinity.restore();

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return false;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return false;
    }
  }
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;
  __kmp_ncores = nCores;
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int idx = 0;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;
  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2 : 1); 1744 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1745 kmp_hw_t types[3]; 1746 if (pkgLevel >= 0) 1747 types[idx++] = KMP_HW_SOCKET; 1748 if (coreLevel >= 0) 1749 types[idx++] = KMP_HW_CORE; 1750 if (threadLevel >= 0) 1751 types[idx++] = KMP_HW_THREAD; 1752 1753 KMP_ASSERT(depth > 0); 1754 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 1755 1756 for (i = 0; i < nApics; ++i) { 1757 idx = 0; 1758 unsigned os = threadInfo[i].osId; 1759 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 1760 hw_thread.clear(); 1761 1762 if (pkgLevel >= 0) { 1763 hw_thread.ids[idx++] = threadInfo[i].pkgId; 1764 } 1765 if (coreLevel >= 0) { 1766 hw_thread.ids[idx++] = threadInfo[i].coreId; 1767 } 1768 if (threadLevel >= 0) { 1769 hw_thread.ids[idx++] = threadInfo[i].threadId; 1770 } 1771 hw_thread.os_id = os; 1772 } 1773 1774 __kmp_free(threadInfo); 1775 __kmp_topology->sort_ids(); 1776 if (!__kmp_topology->check_ids()) { 1777 kmp_topology_t::deallocate(__kmp_topology); 1778 __kmp_topology = nullptr; 1779 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1780 return false; 1781 } 1782 return true; 1783 } 1784 1785 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1786 // architectures support a newer interface for specifying the x2APIC Ids, 1787 // based on CPUID.B or CPUID.1F 1788 /* 1789 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 1790 Bits Bits Bits Bits 1791 31-16 15-8 7-4 4-0 1792 ---+-----------+--------------+-------------+-----------------+ 1793 EAX| reserved | reserved | reserved | Bits to Shift | 1794 ---+-----------|--------------+-------------+-----------------| 1795 EBX| reserved | Num logical processors at level (16 bits) | 1796 ---+-----------|--------------+-------------------------------| 1797 ECX| reserved | Level Type | Level Number (8 bits) | 1798 ---+-----------+--------------+-------------------------------| 1799 EDX| X2APIC ID (32 bits) | 1800 ---+----------------------------------------------------------+ 1801 */ 1802 1803 enum { 1804 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 1805 INTEL_LEVEL_TYPE_SMT = 1, 1806 INTEL_LEVEL_TYPE_CORE = 2, 1807 INTEL_LEVEL_TYPE_TILE = 3, 1808 INTEL_LEVEL_TYPE_MODULE = 4, 1809 INTEL_LEVEL_TYPE_DIE = 5, 1810 INTEL_LEVEL_TYPE_LAST = 6, 1811 }; 1812 1813 struct cpuid_level_info_t { 1814 unsigned level_type, mask, mask_width, nitems, cache_mask; 1815 }; 1816 1817 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 1818 switch (intel_type) { 1819 case INTEL_LEVEL_TYPE_INVALID: 1820 return KMP_HW_SOCKET; 1821 case INTEL_LEVEL_TYPE_SMT: 1822 return KMP_HW_THREAD; 1823 case INTEL_LEVEL_TYPE_CORE: 1824 return KMP_HW_CORE; 1825 case INTEL_LEVEL_TYPE_TILE: 1826 return KMP_HW_TILE; 1827 case INTEL_LEVEL_TYPE_MODULE: 1828 return KMP_HW_MODULE; 1829 case INTEL_LEVEL_TYPE_DIE: 1830 return KMP_HW_DIE; 1831 } 1832 return KMP_HW_UNKNOWN; 1833 } 1834 1835 // This function takes the topology leaf, a levels array to store the levels 1836 // detected and a bitmap of the known levels. 1837 // Returns the number of levels in the topology 1838 static unsigned 1839 __kmp_x2apicid_get_levels(int leaf, 1840 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 1841 kmp_uint64 known_levels) { 1842 unsigned level, levels_index; 1843 unsigned level_type, mask_width, nitems; 1844 kmp_cpuid buf; 1845 1846 // New algorithm has known topology layers act as highest unknown topology 1847 // layers when unknown topology layers exist. 
1848 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 1849 // are unknown topology layers, Then SMT will take the characteristics of 1850 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 1851 // This eliminates unknown portions of the topology while still keeping the 1852 // correct structure. 1853 level = levels_index = 0; 1854 do { 1855 __kmp_x86_cpuid(leaf, level, &buf); 1856 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 1857 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 1858 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 1859 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 1860 return 0; 1861 1862 if (known_levels & (1ull << level_type)) { 1863 // Add a new level to the topology 1864 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 1865 levels[levels_index].level_type = level_type; 1866 levels[levels_index].mask_width = mask_width; 1867 levels[levels_index].nitems = nitems; 1868 levels_index++; 1869 } else { 1870 // If it is an unknown level, then logically move the previous layer up 1871 if (levels_index > 0) { 1872 levels[levels_index - 1].mask_width = mask_width; 1873 levels[levels_index - 1].nitems = nitems; 1874 } 1875 } 1876 level++; 1877 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 1878 1879 // Set the masks to & with apicid 1880 for (unsigned i = 0; i < levels_index; ++i) { 1881 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 1882 levels[i].mask = ~((-1) << levels[i].mask_width); 1883 levels[i].cache_mask = (-1) << levels[i].mask_width; 1884 for (unsigned j = 0; j < i; ++j) 1885 levels[i].mask ^= levels[j].mask; 1886 } else { 1887 KMP_DEBUG_ASSERT(levels_index > 0); 1888 levels[i].mask = (-1) << levels[i - 1].mask_width; 1889 levels[i].cache_mask = 0; 1890 } 1891 } 1892 return levels_index; 1893 } 1894 1895 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 1896 1897 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 1898 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 1899 unsigned levels_index; 1900 kmp_cpuid buf; 1901 kmp_uint64 known_levels; 1902 int topology_leaf, highest_leaf, apic_id; 1903 int num_leaves; 1904 static int leaves[] = {0, 0}; 1905 1906 kmp_i18n_id_t leaf_message_id; 1907 1908 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 1909 1910 *msg_id = kmp_i18n_null; 1911 if (__kmp_affinity_verbose) { 1912 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 1913 } 1914 1915 // Figure out the known topology levels 1916 known_levels = 0ull; 1917 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 1918 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 1919 known_levels |= (1ull << i); 1920 } 1921 } 1922 1923 // Get the highest cpuid leaf supported 1924 __kmp_x86_cpuid(0, 0, &buf); 1925 highest_leaf = buf.eax; 1926 1927 // If a specific topology method was requested, only allow that specific leaf 1928 // otherwise, try both leaves 31 and 11 in that order 1929 num_leaves = 0; 1930 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 1931 num_leaves = 1; 1932 leaves[0] = 11; 1933 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1934 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 1935 num_leaves = 1; 1936 leaves[0] = 31; 1937 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 1938 } else { 1939 num_leaves = 2; 1940 leaves[0] = 31; 1941 leaves[1] = 11; 1942 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1943 } 1944 1945 // Check to see if cpuid leaf 31 or 11 is supported. 
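// Once a supported leaf is found below, __kmp_x2apicid_get_levels() fills in
// the levels array. For illustration only (hypothetical widths): an SMT level
// with mask_width = 1 and a CORE level with mask_width = 5 yield
//   levels[0].mask = 0x1    (thread bits)
//   levels[1].mask = 0x1e   (core bits, with the SMT bits XORed out)
//   levels[2].mask = ~0x1f  (remaining bits form the package id)
// and each thread's x2APIC id is later ANDed with these masks (and shifted by
// the preceding level's mask_width) to produce its per-level ids.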
1946 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1947 topology_leaf = -1; 1948 for (int i = 0; i < num_leaves; ++i) { 1949 int leaf = leaves[i]; 1950 if (highest_leaf < leaf) 1951 continue; 1952 __kmp_x86_cpuid(leaf, 0, &buf); 1953 if (buf.ebx == 0) 1954 continue; 1955 topology_leaf = leaf; 1956 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 1957 if (levels_index == 0) 1958 continue; 1959 break; 1960 } 1961 if (topology_leaf == -1 || levels_index == 0) { 1962 *msg_id = leaf_message_id; 1963 return false; 1964 } 1965 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 1966 1967 // The algorithm used starts by setting the affinity to each available thread 1968 // and retrieving info from the cpuid instruction, so if we are not capable of 1969 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then 1970 // we need to do something else - use the defaults that we calculated from 1971 // issuing cpuid without binding to each proc. 1972 if (!KMP_AFFINITY_CAPABLE()) { 1973 // Hack to try and infer the machine topology using only the data 1974 // available from cpuid on the current thread, and __kmp_xproc. 1975 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1976 for (unsigned i = 0; i < levels_index; ++i) { 1977 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 1978 __kmp_nThreadsPerCore = levels[i].nitems; 1979 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 1980 nCoresPerPkg = levels[i].nitems; 1981 } 1982 } 1983 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1984 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1985 return true; 1986 } 1987 1988 // Allocate the data structure to be returned. 1989 int depth = levels_index; 1990 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 1991 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 1992 __kmp_topology = 1993 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 1994 1995 // Insert equivalent cache types if they exist 1996 kmp_cache_info_t cache_info; 1997 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 1998 const kmp_cache_info_t::info_t &info = cache_info[i]; 1999 unsigned cache_mask = info.mask; 2000 unsigned cache_level = info.level; 2001 for (unsigned j = 0; j < levels_index; ++j) { 2002 unsigned hw_cache_mask = levels[j].cache_mask; 2003 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 2004 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 2005 kmp_hw_t type = 2006 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 2007 __kmp_topology->set_equivalent_type(cache_type, type); 2008 } 2009 } 2010 } 2011 2012 // From here on, we can assume that it is safe to call 2013 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2014 // __kmp_affinity_type = affinity_none. 2015 2016 // Save the affinity mask for the current thread. 2017 kmp_affinity_raii_t previous_affinity; 2018 2019 // Run through each of the available contexts, binding the current thread 2020 // to it, and obtaining the pertinent information using the cpuid instr. 2021 unsigned int proc; 2022 int hw_thread_index = 0; 2023 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2024 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2025 unsigned my_levels_index; 2026 2027 // Skip this proc if it is not included in the machine model.
2028 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2029 continue; 2030 } 2031 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2032 2033 __kmp_affinity_dispatch->bind_thread(proc); 2034 2035 // New algorithm 2036 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2037 apic_id = buf.edx; 2038 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2039 my_levels_index = 2040 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2041 if (my_levels_index == 0 || my_levels_index != levels_index) { 2042 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2043 return false; 2044 } 2045 hw_thread.clear(); 2046 hw_thread.os_id = proc; 2047 // Put in topology information 2048 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2049 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2050 if (j > 0) { 2051 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2052 } 2053 } 2054 hw_thread_index++; 2055 } 2056 KMP_ASSERT(hw_thread_index > 0); 2057 __kmp_topology->sort_ids(); 2058 if (!__kmp_topology->check_ids()) { 2059 kmp_topology_t::deallocate(__kmp_topology); 2060 __kmp_topology = nullptr; 2061 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2062 return false; 2063 } 2064 return true; 2065 } 2066 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2067 2068 #define osIdIndex 0 2069 #define threadIdIndex 1 2070 #define coreIdIndex 2 2071 #define pkgIdIndex 3 2072 #define nodeIdIndex 4 2073 2074 typedef unsigned *ProcCpuInfo; 2075 static unsigned maxIndex = pkgIdIndex; 2076 2077 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2078 const void *b) { 2079 unsigned i; 2080 const unsigned *aa = *(unsigned *const *)a; 2081 const unsigned *bb = *(unsigned *const *)b; 2082 for (i = maxIndex;; i--) { 2083 if (aa[i] < bb[i]) 2084 return -1; 2085 if (aa[i] > bb[i]) 2086 return 1; 2087 if (i == osIdIndex) 2088 break; 2089 } 2090 return 0; 2091 } 2092 2093 #if KMP_USE_HIER_SCHED 2094 // Set the array sizes for the hierarchy layers 2095 static void __kmp_dispatch_set_hierarchy_values() { 2096 // Set the maximum number of L1's to number of cores 2097 // Set the maximum number of L2's to to either number of cores / 2 for 2098 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2099 // Or the number of cores for Intel(R) Xeon(R) processors 2100 // Set the maximum number of NUMA nodes and L3's to number of packages 2101 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2102 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2103 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2104 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2105 KMP_MIC_SUPPORTED 2106 if (__kmp_mic_type >= mic3) 2107 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2108 else 2109 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2110 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2111 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2112 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2113 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2114 // Set the number of threads per unit 2115 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2116 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2117 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2118 __kmp_nThreadsPerCore; 2119 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2120 KMP_MIC_SUPPORTED 2121 if (__kmp_mic_type >= mic3) 
2122 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2123 2 * __kmp_nThreadsPerCore; 2124 else 2125 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2126 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2127 __kmp_nThreadsPerCore; 2128 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2129 nCoresPerPkg * __kmp_nThreadsPerCore; 2130 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2131 nCoresPerPkg * __kmp_nThreadsPerCore; 2132 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2133 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2134 } 2135 2136 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2137 // i.e., this thread's L1 or this thread's L2, etc. 2138 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2139 int index = type + 1; 2140 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2141 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2142 if (type == kmp_hier_layer_e::LAYER_THREAD) 2143 return tid; 2144 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2145 return 0; 2146 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2147 if (tid >= num_hw_threads) 2148 tid = tid % num_hw_threads; 2149 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2150 } 2151 2152 // Return the number of t1's per t2 2153 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2154 int i1 = t1 + 1; 2155 int i2 = t2 + 1; 2156 KMP_DEBUG_ASSERT(i1 <= i2); 2157 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2158 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2159 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2160 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2161 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2162 } 2163 #endif // KMP_USE_HIER_SCHED 2164 2165 static inline const char *__kmp_cpuinfo_get_filename() { 2166 const char *filename; 2167 if (__kmp_cpuinfo_file != nullptr) 2168 filename = __kmp_cpuinfo_file; 2169 else 2170 filename = "/proc/cpuinfo"; 2171 return filename; 2172 } 2173 2174 static inline const char *__kmp_cpuinfo_get_envvar() { 2175 const char *envvar = nullptr; 2176 if (__kmp_cpuinfo_file != nullptr) 2177 envvar = "KMP_CPUINFO_FILE"; 2178 return envvar; 2179 } 2180 2181 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2182 // affinity map. 2183 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2184 kmp_i18n_id_t *const msg_id) { 2185 const char *filename = __kmp_cpuinfo_get_filename(); 2186 const char *envvar = __kmp_cpuinfo_get_envvar(); 2187 *msg_id = kmp_i18n_null; 2188 2189 if (__kmp_affinity_verbose) { 2190 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2191 } 2192 2193 kmp_safe_raii_file_t f(filename, "r", envvar); 2194 2195 // Scan of the file, and count the number of "processor" (osId) fields, 2196 // and find the highest value of <n> for a node_<n> field. 
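// For reference, the records being parsed typically look like the following
// (field names as recognized below; the values are illustrative only):
//   processor       : 12
//   physical id     : 1
//   core id         : 6
// with a blank line terminating each processor record.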
2197 char buf[256]; 2198 unsigned num_records = 0; 2199 while (!feof(f)) { 2200 buf[sizeof(buf) - 1] = 1; 2201 if (!fgets(buf, sizeof(buf), f)) { 2202 // Read errors presumably because of EOF 2203 break; 2204 } 2205 2206 char s1[] = "processor"; 2207 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2208 num_records++; 2209 continue; 2210 } 2211 2212 // FIXME - this will match "node_<n> <garbage>" 2213 unsigned level; 2214 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2215 // validate the input first: 2216 if (level > (unsigned)__kmp_xproc) { // level is too big 2217 level = __kmp_xproc; 2218 } 2219 if (nodeIdIndex + level >= maxIndex) { 2220 maxIndex = nodeIdIndex + level; 2221 } 2222 continue; 2223 } 2224 } 2225 2226 // Check for empty file / no valid processor records, or too many. The number 2227 // of records can't exceed the number of valid bits in the affinity mask. 2228 if (num_records == 0) { 2229 *msg_id = kmp_i18n_str_NoProcRecords; 2230 return false; 2231 } 2232 if (num_records > (unsigned)__kmp_xproc) { 2233 *msg_id = kmp_i18n_str_TooManyProcRecords; 2234 return false; 2235 } 2236 2237 // Set the file pointer back to the beginning, so that we can scan the file 2238 // again, this time performing a full parse of the data. Allocate a vector of 2239 // ProcCpuInfo objects, where we will place the data. Adding an extra element 2240 // at the end allows us to remove a lot of extra checks for termination 2241 // conditions. 2242 if (fseek(f, 0, SEEK_SET) != 0) { 2243 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2244 return false; 2245 } 2246 2247 // Allocate the array of records to store the proc info in. The dummy 2248 // element at the end makes the logic in filling them out easier to code. 2249 unsigned **threadInfo = 2250 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2251 unsigned i; 2252 for (i = 0; i <= num_records; i++) { 2253 threadInfo[i] = 2254 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2255 } 2256 2257 #define CLEANUP_THREAD_INFO \ 2258 for (i = 0; i <= num_records; i++) { \ 2259 __kmp_free(threadInfo[i]); \ 2260 } \ 2261 __kmp_free(threadInfo); 2262 2263 // A value of UINT_MAX means that we didn't find the field 2264 unsigned __index; 2265 2266 #define INIT_PROC_INFO(p) \ 2267 for (__index = 0; __index <= maxIndex; __index++) { \ 2268 (p)[__index] = UINT_MAX; \ 2269 } 2270 2271 for (i = 0; i <= num_records; i++) { 2272 INIT_PROC_INFO(threadInfo[i]); 2273 } 2274 2275 unsigned num_avail = 0; 2276 *line = 0; 2277 while (!feof(f)) { 2278 // Create an inner scoping level, so that all the goto targets at the end of 2279 // the loop appear in an outer scoping level. This avoids warnings about 2280 // jumping past an initialization to a target in the same block. 2281 { 2282 buf[sizeof(buf) - 1] = 1; 2283 bool long_line = false; 2284 if (!fgets(buf, sizeof(buf), f)) { 2285 // Read errors presumably because of EOF 2286 // If there is valid data in threadInfo[num_avail], then fake 2287 // a blank line to ensure that the last address gets parsed. 2288 bool valid = false; 2289 for (i = 0; i <= maxIndex; i++) { 2290 if (threadInfo[num_avail][i] != UINT_MAX) { 2291 valid = true; 2292 } 2293 } 2294 if (!valid) { 2295 break; 2296 } 2297 buf[0] = 0; 2298 } else if (!buf[sizeof(buf) - 1]) { 2299 // The line is longer than the buffer. Set a flag and don't 2300 // emit an error if we were going to ignore the line, anyway.
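// (The sentinel written at buf[sizeof(buf) - 1] above makes this detectable:
// fgets() NUL-terminates whatever it reads, so only a line long enough to
// fill the entire buffer overwrites that last byte with '\0'. A cleared
// sentinel therefore means the line was truncated.)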
2301 long_line = true; 2302 2303 #define CHECK_LINE \ 2304 if (long_line) { \ 2305 CLEANUP_THREAD_INFO; \ 2306 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2307 return false; \ 2308 } 2309 } 2310 (*line)++; 2311 2312 char s1[] = "processor"; 2313 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2314 CHECK_LINE; 2315 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2316 unsigned val; 2317 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2318 goto no_val; 2319 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2320 #if KMP_ARCH_AARCH64 2321 // Handle the old AArch64 /proc/cpuinfo layout differently, 2322 // it contains all of the 'processor' entries listed in a 2323 // single 'Processor' section, therefore the normal looking 2324 // for duplicates in that section will always fail. 2325 num_avail++; 2326 #else 2327 goto dup_field; 2328 #endif 2329 threadInfo[num_avail][osIdIndex] = val; 2330 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2331 char path[256]; 2332 KMP_SNPRINTF( 2333 path, sizeof(path), 2334 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2335 threadInfo[num_avail][osIdIndex]); 2336 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2337 2338 KMP_SNPRINTF(path, sizeof(path), 2339 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2340 threadInfo[num_avail][osIdIndex]); 2341 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2342 continue; 2343 #else 2344 } 2345 char s2[] = "physical id"; 2346 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2347 CHECK_LINE; 2348 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2349 unsigned val; 2350 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2351 goto no_val; 2352 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2353 goto dup_field; 2354 threadInfo[num_avail][pkgIdIndex] = val; 2355 continue; 2356 } 2357 char s3[] = "core id"; 2358 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2359 CHECK_LINE; 2360 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2361 unsigned val; 2362 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2363 goto no_val; 2364 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2365 goto dup_field; 2366 threadInfo[num_avail][coreIdIndex] = val; 2367 continue; 2368 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2369 } 2370 char s4[] = "thread id"; 2371 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2372 CHECK_LINE; 2373 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2374 unsigned val; 2375 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2376 goto no_val; 2377 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2378 goto dup_field; 2379 threadInfo[num_avail][threadIdIndex] = val; 2380 continue; 2381 } 2382 unsigned level; 2383 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2384 CHECK_LINE; 2385 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2386 unsigned val; 2387 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2388 goto no_val; 2389 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2390 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2391 goto dup_field; 2392 threadInfo[num_avail][nodeIdIndex + level] = val; 2393 continue; 2394 } 2395 2396 // We didn't recognize the leading token on the line. There are lots of 2397 // leading tokens that we don't recognize - if the line isn't empty, go on 2398 // to the next line. 2399 if ((*buf != 0) && (*buf != '\n')) { 2400 // If the line is longer than the buffer, read characters 2401 // until we find a newline. 
2402 if (long_line) { 2403 int ch; 2404 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2405 ; 2406 } 2407 continue; 2408 } 2409 2410 // A newline has signalled the end of the processor record. 2411 // Check that there aren't too many procs specified. 2412 if ((int)num_avail == __kmp_xproc) { 2413 CLEANUP_THREAD_INFO; 2414 *msg_id = kmp_i18n_str_TooManyEntries; 2415 return false; 2416 } 2417 2418 // Check for missing fields. The osId field must be there, and we 2419 // currently require that the physical id field is specified, also. 2420 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2421 CLEANUP_THREAD_INFO; 2422 *msg_id = kmp_i18n_str_MissingProcField; 2423 return false; 2424 } 2425 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2426 CLEANUP_THREAD_INFO; 2427 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2428 return false; 2429 } 2430 2431 // Skip this proc if it is not included in the machine model. 2432 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2433 __kmp_affin_fullMask)) { 2434 INIT_PROC_INFO(threadInfo[num_avail]); 2435 continue; 2436 } 2437 2438 // We have a successful parse of this proc's info. 2439 // Increment the counter, and prepare for the next proc. 2440 num_avail++; 2441 KMP_ASSERT(num_avail <= num_records); 2442 INIT_PROC_INFO(threadInfo[num_avail]); 2443 } 2444 continue; 2445 2446 no_val: 2447 CLEANUP_THREAD_INFO; 2448 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2449 return false; 2450 2451 dup_field: 2452 CLEANUP_THREAD_INFO; 2453 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2454 return false; 2455 } 2456 *line = 0; 2457 2458 #if KMP_MIC && REDUCE_TEAM_SIZE 2459 unsigned teamSize = 0; 2460 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2461 2462 // check for num_records == __kmp_xproc ??? 2463 2464 // If it is configured to omit the package level when there is only a single 2465 // package, the logic at the end of this routine won't work if there is only a 2466 // single thread 2467 KMP_ASSERT(num_avail > 0); 2468 KMP_ASSERT(num_avail <= num_records); 2469 2470 // Sort the threadInfo table by physical Id. 2471 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2472 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2473 2474 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2475 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2476 // the chips on a system. Although coreId's are usually assigned 2477 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2478 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2479 // 2480 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2481 // total # packages) are at this point - we want to determine that now. We 2482 // only have an upper bound on the first two figures. 2483 unsigned *counts = 2484 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2485 unsigned *maxCt = 2486 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2487 unsigned *totals = 2488 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2489 unsigned *lastId = 2490 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2491 2492 bool assign_thread_ids = false; 2493 unsigned threadIdCt; 2494 unsigned index; 2495 2496 restart_radix_check: 2497 threadIdCt = 0; 2498 2499 // Initialize the counter arrays with data from threadInfo[0]. 
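// (Note that this point can be reached twice: if the first pass below finds
// duplicate ids with no thread id field present, assign_thread_ids is set and
// control jumps back to restart_radix_check so that thread ids can be
// synthesized, restarting the counting from threadInfo[0].)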
2500 if (assign_thread_ids) { 2501 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2502 threadInfo[0][threadIdIndex] = threadIdCt++; 2503 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2504 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2505 } 2506 } 2507 for (index = 0; index <= maxIndex; index++) { 2508 counts[index] = 1; 2509 maxCt[index] = 1; 2510 totals[index] = 1; 2511 lastId[index] = threadInfo[0][index]; 2512 ; 2513 } 2514 2515 // Run through the rest of the OS procs. 2516 for (i = 1; i < num_avail; i++) { 2517 // Find the most significant index whose id differs from the id for the 2518 // previous OS proc. 2519 for (index = maxIndex; index >= threadIdIndex; index--) { 2520 if (assign_thread_ids && (index == threadIdIndex)) { 2521 // Auto-assign the thread id field if it wasn't specified. 2522 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2523 threadInfo[i][threadIdIndex] = threadIdCt++; 2524 } 2525 // Apparently the thread id field was specified for some entries and not 2526 // others. Start the thread id counter off at the next higher thread id. 2527 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2528 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2529 } 2530 } 2531 if (threadInfo[i][index] != lastId[index]) { 2532 // Run through all indices which are less significant, and reset the 2533 // counts to 1. At all levels up to and including index, we need to 2534 // increment the totals and record the last id. 2535 unsigned index2; 2536 for (index2 = threadIdIndex; index2 < index; index2++) { 2537 totals[index2]++; 2538 if (counts[index2] > maxCt[index2]) { 2539 maxCt[index2] = counts[index2]; 2540 } 2541 counts[index2] = 1; 2542 lastId[index2] = threadInfo[i][index2]; 2543 } 2544 counts[index]++; 2545 totals[index]++; 2546 lastId[index] = threadInfo[i][index]; 2547 2548 if (assign_thread_ids && (index > threadIdIndex)) { 2549 2550 #if KMP_MIC && REDUCE_TEAM_SIZE 2551 // The default team size is the total #threads in the machine 2552 // minus 1 thread for every core that has 3 or more threads. 2553 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2554 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2555 2556 // Restart the thread counter, as we are on a new core. 2557 threadIdCt = 0; 2558 2559 // Auto-assign the thread id field if it wasn't specified. 2560 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2561 threadInfo[i][threadIdIndex] = threadIdCt++; 2562 } 2563 2564 // Apparently the thread id field was specified for some entries and 2565 // not others. Start the thread id counter off at the next higher 2566 // thread id. 2567 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2568 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2569 } 2570 } 2571 break; 2572 } 2573 } 2574 if (index < threadIdIndex) { 2575 // If thread ids were specified, it is an error if they are not unique. 2576 // Also, check that we haven't already restarted the loop (to be safe - 2577 // shouldn't need to). 2578 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2579 __kmp_free(lastId); 2580 __kmp_free(totals); 2581 __kmp_free(maxCt); 2582 __kmp_free(counts); 2583 CLEANUP_THREAD_INFO; 2584 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2585 return false; 2586 } 2587 2588 // If the thread ids were not specified and we see entries that 2589 // are duplicates, start the loop over and assign the thread ids manually.
2590 assign_thread_ids = true; 2591 goto restart_radix_check; 2592 } 2593 } 2594 2595 #if KMP_MIC && REDUCE_TEAM_SIZE 2596 // The default team size is the total #threads in the machine 2597 // minus 1 thread for every core that has 3 or more threads. 2598 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2599 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2600 2601 for (index = threadIdIndex; index <= maxIndex; index++) { 2602 if (counts[index] > maxCt[index]) { 2603 maxCt[index] = counts[index]; 2604 } 2605 } 2606 2607 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2608 nCoresPerPkg = maxCt[coreIdIndex]; 2609 nPackages = totals[pkgIdIndex]; 2610 2611 // When affinity is off, this routine will still be called to set 2612 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2613 // Make sure all these vars are set correctly, and return now if affinity is 2614 // not enabled. 2615 __kmp_ncores = totals[coreIdIndex]; 2616 if (!KMP_AFFINITY_CAPABLE()) { 2617 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2618 return true; 2619 } 2620 2621 #if KMP_MIC && REDUCE_TEAM_SIZE 2622 // Set the default team size. 2623 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2624 __kmp_dflt_team_nth = teamSize; 2625 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2626 "__kmp_dflt_team_nth = %d\n", 2627 __kmp_dflt_team_nth)); 2628 } 2629 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2630 2631 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2632 2633 // Count the number of levels which have more nodes at that level than at the 2634 // parent's level (with there being an implicit root node of the top level). 2635 // This is equivalent to saying that there is at least one node at this level 2636 // which has a sibling. These levels are in the map, and the package level is 2637 // always in the map. 2638 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2639 for (index = threadIdIndex; index < maxIndex; index++) { 2640 KMP_ASSERT(totals[index] >= totals[index + 1]); 2641 inMap[index] = (totals[index] > totals[index + 1]); 2642 } 2643 inMap[maxIndex] = (totals[maxIndex] > 1); 2644 inMap[pkgIdIndex] = true; 2645 inMap[coreIdIndex] = true; 2646 inMap[threadIdIndex] = true; 2647 2648 int depth = 0; 2649 int idx = 0; 2650 kmp_hw_t types[KMP_HW_LAST]; 2651 int pkgLevel = -1; 2652 int coreLevel = -1; 2653 int threadLevel = -1; 2654 for (index = threadIdIndex; index <= maxIndex; index++) { 2655 if (inMap[index]) { 2656 depth++; 2657 } 2658 } 2659 if (inMap[pkgIdIndex]) { 2660 pkgLevel = idx; 2661 types[idx++] = KMP_HW_SOCKET; 2662 } 2663 if (inMap[coreIdIndex]) { 2664 coreLevel = idx; 2665 types[idx++] = KMP_HW_CORE; 2666 } 2667 if (inMap[threadIdIndex]) { 2668 threadLevel = idx; 2669 types[idx++] = KMP_HW_THREAD; 2670 } 2671 KMP_ASSERT(depth > 0); 2672 2673 // Construct the data structure that is to be returned. 
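// (For a sense of scale, a hypothetical box with 2 packages, 2 cores per
// package and 2 threads per core - 8 cpuinfo records in all - ends the loop
// above with totals[threadIdIndex] = 8, totals[coreIdIndex] = 4,
// totals[pkgIdIndex] = 2 and maxCt[threadIdIndex] = maxCt[coreIdIndex] = 2,
// giving __kmp_nThreadsPerCore = 2, nCoresPerPkg = 2, nPackages = 2,
// __kmp_ncores = 4, and a depth-3 socket/core/thread topology below.)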
2674 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2675 2676 for (i = 0; i < num_avail; ++i) { 2677 unsigned os = threadInfo[i][osIdIndex]; 2678 int src_index; 2679 int dst_index = 0; 2680 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2681 hw_thread.clear(); 2682 hw_thread.os_id = os; 2683 2684 idx = 0; 2685 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2686 if (!inMap[src_index]) { 2687 continue; 2688 } 2689 if (src_index == pkgIdIndex) { 2690 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2691 } else if (src_index == coreIdIndex) { 2692 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2693 } else if (src_index == threadIdIndex) { 2694 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2695 } 2696 dst_index++; 2697 } 2698 } 2699 2700 __kmp_free(inMap); 2701 __kmp_free(lastId); 2702 __kmp_free(totals); 2703 __kmp_free(maxCt); 2704 __kmp_free(counts); 2705 CLEANUP_THREAD_INFO; 2706 __kmp_topology->sort_ids(); 2707 if (!__kmp_topology->check_ids()) { 2708 kmp_topology_t::deallocate(__kmp_topology); 2709 __kmp_topology = nullptr; 2710 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2711 return false; 2712 } 2713 return true; 2714 } 2715 2716 // Create and return a table of affinity masks, indexed by OS thread ID. 2717 // This routine handles OR'ing together all the affinity masks of threads 2718 // that are sufficiently close, if granularity > fine. 2719 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2720 unsigned *numUnique) { 2721 // First form a table of affinity masks in order of OS thread id. 2722 int maxOsId; 2723 int i; 2724 int numAddrs = __kmp_topology->get_num_hw_threads(); 2725 int depth = __kmp_topology->get_depth(); 2726 KMP_ASSERT(numAddrs); 2727 KMP_ASSERT(depth); 2728 2729 maxOsId = 0; 2730 for (i = numAddrs - 1;; --i) { 2731 int osId = __kmp_topology->at(i).os_id; 2732 if (osId > maxOsId) { 2733 maxOsId = osId; 2734 } 2735 if (i == 0) 2736 break; 2737 } 2738 kmp_affin_mask_t *osId2Mask; 2739 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2740 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2741 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2742 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2743 } 2744 if (__kmp_affinity_gran_levels >= (int)depth) { 2745 if (__kmp_affinity_verbose || 2746 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2747 KMP_WARNING(AffThreadsMayMigrate); 2748 } 2749 } 2750 2751 // Run through the table, forming the masks for all threads on each core. 2752 // Threads on the same core will have identical kmp_hw_thread_t objects, not 2753 // considering the last level, which must be the thread id. All threads on a 2754 // core will appear consecutively. 2755 int unique = 0; 2756 int j = 0; // index of 1st thread on core 2757 int leader = 0; 2758 kmp_affin_mask_t *sum; 2759 KMP_CPU_ALLOC_ON_STACK(sum); 2760 KMP_CPU_ZERO(sum); 2761 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 2762 for (i = 1; i < numAddrs; i++) { 2763 // If this thread is sufficiently close to the leader (within the 2764 // granularity setting), then set the bit for this os thread in the 2765 // affinity mask for this group, and go on to the next thread. 2766 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 2767 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2768 continue; 2769 } 2770 2771 // For every thread in this group, copy the mask to the thread's entry in 2772 // the osId2Mask table. Mark the first address as a leader. 
2773 for (; j < i; j++) { 2774 int osId = __kmp_topology->at(j).os_id; 2775 KMP_DEBUG_ASSERT(osId <= maxOsId); 2776 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2777 KMP_CPU_COPY(mask, sum); 2778 __kmp_topology->at(j).leader = (j == leader); 2779 } 2780 unique++; 2781 2782 // Start a new mask. 2783 leader = i; 2784 KMP_CPU_ZERO(sum); 2785 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2786 } 2787 2788 // For every thread in last group, copy the mask to the thread's 2789 // entry in the osId2Mask table. 2790 for (; j < i; j++) { 2791 int osId = __kmp_topology->at(j).os_id; 2792 KMP_DEBUG_ASSERT(osId <= maxOsId); 2793 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2794 KMP_CPU_COPY(mask, sum); 2795 __kmp_topology->at(j).leader = (j == leader); 2796 } 2797 unique++; 2798 KMP_CPU_FREE_FROM_STACK(sum); 2799 2800 *maxIndex = maxOsId; 2801 *numUnique = unique; 2802 return osId2Mask; 2803 } 2804 2805 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2806 // as file-static than to try and pass them through the calling sequence of 2807 // the recursive-descent OMP_PLACES parser. 2808 static kmp_affin_mask_t *newMasks; 2809 static int numNewMasks; 2810 static int nextNewMask; 2811 2812 #define ADD_MASK(_mask) \ 2813 { \ 2814 if (nextNewMask >= numNewMasks) { \ 2815 int i; \ 2816 numNewMasks *= 2; \ 2817 kmp_affin_mask_t *temp; \ 2818 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2819 for (i = 0; i < numNewMasks / 2; i++) { \ 2820 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2821 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2822 KMP_CPU_COPY(dest, src); \ 2823 } \ 2824 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2825 newMasks = temp; \ 2826 } \ 2827 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2828 nextNewMask++; \ 2829 } 2830 2831 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2832 { \ 2833 if (((_osId) > _maxOsId) || \ 2834 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2835 if (__kmp_affinity_verbose || \ 2836 (__kmp_affinity_warnings && \ 2837 (__kmp_affinity_type != affinity_none))) { \ 2838 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2839 } \ 2840 } else { \ 2841 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2842 } \ 2843 } 2844 2845 // Re-parse the proclist (for the explicit affinity type), and form the list 2846 // of affinity newMasks indexed by gtid. 2847 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2848 unsigned int *out_numMasks, 2849 const char *proclist, 2850 kmp_affin_mask_t *osId2Mask, 2851 int maxOsId) { 2852 int i; 2853 const char *scan = proclist; 2854 const char *next = proclist; 2855 2856 // We use malloc() for the temporary mask vector, so that we can use 2857 // realloc() to extend it. 2858 numNewMasks = 2; 2859 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2860 nextNewMask = 0; 2861 kmp_affin_mask_t *sumMask; 2862 KMP_CPU_ALLOC(sumMask); 2863 int setSize = 0; 2864 2865 for (;;) { 2866 int start, end, stride; 2867 2868 SKIP_WS(scan); 2869 next = scan; 2870 if (*next == '\0') { 2871 break; 2872 } 2873 2874 if (*next == '{') { 2875 int num; 2876 setSize = 0; 2877 next++; // skip '{' 2878 SKIP_WS(next); 2879 scan = next; 2880 2881 // Read the first integer in the set. 
2882 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2883 SKIP_DIGITS(next); 2884 num = __kmp_str_to_int(scan, *next); 2885 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2886 2887 // Copy the mask for that osId to the sum (union) mask. 2888 if ((num > maxOsId) || 2889 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2890 if (__kmp_affinity_verbose || 2891 (__kmp_affinity_warnings && 2892 (__kmp_affinity_type != affinity_none))) { 2893 KMP_WARNING(AffIgnoreInvalidProcID, num); 2894 } 2895 KMP_CPU_ZERO(sumMask); 2896 } else { 2897 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2898 setSize = 1; 2899 } 2900 2901 for (;;) { 2902 // Check for end of set. 2903 SKIP_WS(next); 2904 if (*next == '}') { 2905 next++; // skip '}' 2906 break; 2907 } 2908 2909 // Skip optional comma. 2910 if (*next == ',') { 2911 next++; 2912 } 2913 SKIP_WS(next); 2914 2915 // Read the next integer in the set. 2916 scan = next; 2917 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2918 2919 SKIP_DIGITS(next); 2920 num = __kmp_str_to_int(scan, *next); 2921 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2922 2923 // Add the mask for that osId to the sum mask. 2924 if ((num > maxOsId) || 2925 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2926 if (__kmp_affinity_verbose || 2927 (__kmp_affinity_warnings && 2928 (__kmp_affinity_type != affinity_none))) { 2929 KMP_WARNING(AffIgnoreInvalidProcID, num); 2930 } 2931 } else { 2932 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2933 setSize++; 2934 } 2935 } 2936 if (setSize > 0) { 2937 ADD_MASK(sumMask); 2938 } 2939 2940 SKIP_WS(next); 2941 if (*next == ',') { 2942 next++; 2943 } 2944 scan = next; 2945 continue; 2946 } 2947 2948 // Read the first integer. 2949 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2950 SKIP_DIGITS(next); 2951 start = __kmp_str_to_int(scan, *next); 2952 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2953 SKIP_WS(next); 2954 2955 // If this isn't a range, then add a mask to the list and go on. 2956 if (*next != '-') { 2957 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2958 2959 // Skip optional comma. 2960 if (*next == ',') { 2961 next++; 2962 } 2963 scan = next; 2964 continue; 2965 } 2966 2967 // This is a range. Skip over the '-' and read in the 2nd int. 2968 next++; // skip '-' 2969 SKIP_WS(next); 2970 scan = next; 2971 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2972 SKIP_DIGITS(next); 2973 end = __kmp_str_to_int(scan, *next); 2974 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2975 2976 // Check for a stride parameter 2977 stride = 1; 2978 SKIP_WS(next); 2979 if (*next == ':') { 2980 // A stride is specified. Skip over the ':" and read the 3rd int. 2981 int sign = +1; 2982 next++; // skip ':' 2983 SKIP_WS(next); 2984 scan = next; 2985 if (*next == '-') { 2986 sign = -1; 2987 next++; 2988 SKIP_WS(next); 2989 scan = next; 2990 } 2991 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2992 SKIP_DIGITS(next); 2993 stride = __kmp_str_to_int(scan, *next); 2994 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2995 stride *= sign; 2996 } 2997 2998 // Do some range checks. 
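// (For example, a hypothetical proclist entry "3-11:2" arrives here with
// start = 3, end = 11 and stride = 2, and the loop below adds masks for OS
// procs 3, 5, 7, 9 and 11.)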
2999 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3000 if (stride > 0) { 3001 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3002 } else { 3003 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3004 } 3005 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3006 3007 // Add the mask for each OS proc # to the list. 3008 if (stride > 0) { 3009 do { 3010 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3011 start += stride; 3012 } while (start <= end); 3013 } else { 3014 do { 3015 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3016 start += stride; 3017 } while (start >= end); 3018 } 3019 3020 // Skip optional comma. 3021 SKIP_WS(next); 3022 if (*next == ',') { 3023 next++; 3024 } 3025 scan = next; 3026 } 3027 3028 *out_numMasks = nextNewMask; 3029 if (nextNewMask == 0) { 3030 *out_masks = NULL; 3031 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3032 return; 3033 } 3034 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3035 for (i = 0; i < nextNewMask; i++) { 3036 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3037 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3038 KMP_CPU_COPY(dest, src); 3039 } 3040 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3041 KMP_CPU_FREE(sumMask); 3042 } 3043 3044 /*----------------------------------------------------------------------------- 3045 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3046 places. Again, Here is the grammar: 3047 3048 place_list := place 3049 place_list := place , place_list 3050 place := num 3051 place := place : num 3052 place := place : num : signed 3053 place := { subplacelist } 3054 place := ! place // (lowest priority) 3055 subplace_list := subplace 3056 subplace_list := subplace , subplace_list 3057 subplace := num 3058 subplace := num : num 3059 subplace := num : num : signed 3060 signed := num 3061 signed := + signed 3062 signed := - signed 3063 -----------------------------------------------------------------------------*/ 3064 static void __kmp_process_subplace_list(const char **scan, 3065 kmp_affin_mask_t *osId2Mask, 3066 int maxOsId, kmp_affin_mask_t *tempMask, 3067 int *setSize) { 3068 const char *next; 3069 3070 for (;;) { 3071 int start, count, stride, i; 3072 3073 // Read in the starting proc id 3074 SKIP_WS(*scan); 3075 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3076 next = *scan; 3077 SKIP_DIGITS(next); 3078 start = __kmp_str_to_int(*scan, *next); 3079 KMP_ASSERT(start >= 0); 3080 *scan = next; 3081 3082 // valid follow sets are ',' ':' and '}' 3083 SKIP_WS(*scan); 3084 if (**scan == '}' || **scan == ',') { 3085 if ((start > maxOsId) || 3086 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3087 if (__kmp_affinity_verbose || 3088 (__kmp_affinity_warnings && 3089 (__kmp_affinity_type != affinity_none))) { 3090 KMP_WARNING(AffIgnoreInvalidProcID, start); 3091 } 3092 } else { 3093 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3094 (*setSize)++; 3095 } 3096 if (**scan == '}') { 3097 break; 3098 } 3099 (*scan)++; // skip ',' 3100 continue; 3101 } 3102 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3103 (*scan)++; // skip ':' 3104 3105 // Read count parameter 3106 SKIP_WS(*scan); 3107 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3108 next = *scan; 3109 SKIP_DIGITS(next); 3110 count = __kmp_str_to_int(*scan, *next); 3111 KMP_ASSERT(count >= 0); 3112 *scan = next; 3113 3114 // valid follow sets are ',' ':' and '}' 3115 SKIP_WS(*scan); 3116 if (**scan == 
'}' || **scan == ',') { 3117 for (i = 0; i < count; i++) { 3118 if ((start > maxOsId) || 3119 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3120 if (__kmp_affinity_verbose || 3121 (__kmp_affinity_warnings && 3122 (__kmp_affinity_type != affinity_none))) { 3123 KMP_WARNING(AffIgnoreInvalidProcID, start); 3124 } 3125 break; // don't proliferate warnings for large count 3126 } else { 3127 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3128 start++; 3129 (*setSize)++; 3130 } 3131 } 3132 if (**scan == '}') { 3133 break; 3134 } 3135 (*scan)++; // skip ',' 3136 continue; 3137 } 3138 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3139 (*scan)++; // skip ':' 3140 3141 // Read stride parameter 3142 int sign = +1; 3143 for (;;) { 3144 SKIP_WS(*scan); 3145 if (**scan == '+') { 3146 (*scan)++; // skip '+' 3147 continue; 3148 } 3149 if (**scan == '-') { 3150 sign *= -1; 3151 (*scan)++; // skip '-' 3152 continue; 3153 } 3154 break; 3155 } 3156 SKIP_WS(*scan); 3157 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3158 next = *scan; 3159 SKIP_DIGITS(next); 3160 stride = __kmp_str_to_int(*scan, *next); 3161 KMP_ASSERT(stride >= 0); 3162 *scan = next; 3163 stride *= sign; 3164 3165 // valid follow sets are ',' and '}' 3166 SKIP_WS(*scan); 3167 if (**scan == '}' || **scan == ',') { 3168 for (i = 0; i < count; i++) { 3169 if ((start > maxOsId) || 3170 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3171 if (__kmp_affinity_verbose || 3172 (__kmp_affinity_warnings && 3173 (__kmp_affinity_type != affinity_none))) { 3174 KMP_WARNING(AffIgnoreInvalidProcID, start); 3175 } 3176 break; // don't proliferate warnings for large count 3177 } else { 3178 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3179 start += stride; 3180 (*setSize)++; 3181 } 3182 } 3183 if (**scan == '}') { 3184 break; 3185 } 3186 (*scan)++; // skip ',' 3187 continue; 3188 } 3189 3190 KMP_ASSERT2(0, "bad explicit places list"); 3191 } 3192 } 3193 3194 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3195 int maxOsId, kmp_affin_mask_t *tempMask, 3196 int *setSize) { 3197 const char *next; 3198 3199 // valid follow sets are '{' '!' and num 3200 SKIP_WS(*scan); 3201 if (**scan == '{') { 3202 (*scan)++; // skip '{' 3203 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3204 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3205 (*scan)++; // skip '}' 3206 } else if (**scan == '!') { 3207 (*scan)++; // skip '!' 
3208 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3209 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3210 } else if ((**scan >= '0') && (**scan <= '9')) { 3211 next = *scan; 3212 SKIP_DIGITS(next); 3213 int num = __kmp_str_to_int(*scan, *next); 3214 KMP_ASSERT(num >= 0); 3215 if ((num > maxOsId) || 3216 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3217 if (__kmp_affinity_verbose || 3218 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3219 KMP_WARNING(AffIgnoreInvalidProcID, num); 3220 } 3221 } else { 3222 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3223 (*setSize)++; 3224 } 3225 *scan = next; // skip num 3226 } else { 3227 KMP_ASSERT2(0, "bad explicit places list"); 3228 } 3229 } 3230 3231 // static void 3232 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3233 unsigned int *out_numMasks, 3234 const char *placelist, 3235 kmp_affin_mask_t *osId2Mask, 3236 int maxOsId) { 3237 int i, j, count, stride, sign; 3238 const char *scan = placelist; 3239 const char *next = placelist; 3240 3241 numNewMasks = 2; 3242 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3243 nextNewMask = 0; 3244 3245 // tempMask is modified based on the previous or initial 3246 // place to form the current place 3247 // previousMask contains the previous place 3248 kmp_affin_mask_t *tempMask; 3249 kmp_affin_mask_t *previousMask; 3250 KMP_CPU_ALLOC(tempMask); 3251 KMP_CPU_ZERO(tempMask); 3252 KMP_CPU_ALLOC(previousMask); 3253 KMP_CPU_ZERO(previousMask); 3254 int setSize = 0; 3255 3256 for (;;) { 3257 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3258 3259 // valid follow sets are ',' ':' and EOL 3260 SKIP_WS(scan); 3261 if (*scan == '\0' || *scan == ',') { 3262 if (setSize > 0) { 3263 ADD_MASK(tempMask); 3264 } 3265 KMP_CPU_ZERO(tempMask); 3266 setSize = 0; 3267 if (*scan == '\0') { 3268 break; 3269 } 3270 scan++; // skip ',' 3271 continue; 3272 } 3273 3274 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3275 scan++; // skip ':' 3276 3277 // Read count parameter 3278 SKIP_WS(scan); 3279 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3280 next = scan; 3281 SKIP_DIGITS(next); 3282 count = __kmp_str_to_int(scan, *next); 3283 KMP_ASSERT(count >= 0); 3284 scan = next; 3285 3286 // valid follow sets are ',' ':' and EOL 3287 SKIP_WS(scan); 3288 if (*scan == '\0' || *scan == ',') { 3289 stride = +1; 3290 } else { 3291 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3292 scan++; // skip ':' 3293 3294 // Read stride parameter 3295 sign = +1; 3296 for (;;) { 3297 SKIP_WS(scan); 3298 if (*scan == '+') { 3299 scan++; // skip '+' 3300 continue; 3301 } 3302 if (*scan == '-') { 3303 sign *= -1; 3304 scan++; // skip '-' 3305 continue; 3306 } 3307 break; 3308 } 3309 SKIP_WS(scan); 3310 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3311 next = scan; 3312 SKIP_DIGITS(next); 3313 stride = __kmp_str_to_int(scan, *next); 3314 KMP_DEBUG_ASSERT(stride >= 0); 3315 scan = next; 3316 stride *= sign; 3317 } 3318 3319 // Add places determined by initial_place : count : stride 3320 for (i = 0; i < count; i++) { 3321 if (setSize == 0) { 3322 break; 3323 } 3324 // Add the current place, then build the next place (tempMask) from that 3325 KMP_CPU_COPY(previousMask, tempMask); 3326 ADD_MASK(previousMask); 3327 KMP_CPU_ZERO(tempMask); 3328 setSize = 0; 3329 KMP_CPU_SET_ITERATE(j, previousMask) { 3330 if (!KMP_CPU_ISSET(j, previousMask)) { 3331 continue; 3332 } 3333 if ((j + stride > 
maxOsId) || (j + stride < 0) || 3334 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3335 (!KMP_CPU_ISSET(j + stride, 3336 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3337 if ((__kmp_affinity_verbose || 3338 (__kmp_affinity_warnings && 3339 (__kmp_affinity_type != affinity_none))) && 3340 i < count - 1) { 3341 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3342 } 3343 continue; 3344 } 3345 KMP_CPU_SET(j + stride, tempMask); 3346 setSize++; 3347 } 3348 } 3349 KMP_CPU_ZERO(tempMask); 3350 setSize = 0; 3351 3352 // valid follow sets are ',' and EOL 3353 SKIP_WS(scan); 3354 if (*scan == '\0') { 3355 break; 3356 } 3357 if (*scan == ',') { 3358 scan++; // skip ',' 3359 continue; 3360 } 3361 3362 KMP_ASSERT2(0, "bad explicit places list"); 3363 } 3364 3365 *out_numMasks = nextNewMask; 3366 if (nextNewMask == 0) { 3367 *out_masks = NULL; 3368 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3369 return; 3370 } 3371 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3372 KMP_CPU_FREE(tempMask); 3373 KMP_CPU_FREE(previousMask); 3374 for (i = 0; i < nextNewMask; i++) { 3375 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3376 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3377 KMP_CPU_COPY(dest, src); 3378 } 3379 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3380 } 3381 3382 #undef ADD_MASK 3383 #undef ADD_MASK_OSID 3384 3385 // This function figures out the deepest level at which there is at least one 3386 // cluster/core with more than one processing unit bound to it. 3387 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3388 int core_level = 0; 3389 3390 for (int i = 0; i < nprocs; i++) { 3391 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3392 for (int j = bottom_level; j > 0; j--) { 3393 if (hw_thread.ids[j] > 0) { 3394 if (core_level < (j - 1)) { 3395 core_level = j - 1; 3396 } 3397 } 3398 } 3399 } 3400 return core_level; 3401 } 3402 3403 // This function counts number of clusters/cores at given level. 3404 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3405 int core_level) { 3406 return __kmp_topology->get_count(core_level); 3407 } 3408 // This function finds to which cluster/core given processing unit is bound. 3409 static int __kmp_affinity_find_core(int proc, int bottom_level, 3410 int core_level) { 3411 int core = 0; 3412 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3413 for (int i = 0; i <= proc; ++i) { 3414 if (i + 1 <= proc) { 3415 for (int j = 0; j <= core_level; ++j) { 3416 if (__kmp_topology->at(i + 1).sub_ids[j] != 3417 __kmp_topology->at(i).sub_ids[j]) { 3418 core++; 3419 break; 3420 } 3421 } 3422 } 3423 } 3424 return core; 3425 } 3426 3427 // This function finds maximal number of processing units bound to a 3428 // cluster/core at given level. 
static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
                                            int core_level) {
  if (core_level >= bottom_level)
    return 1;
  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
  return __kmp_topology->calculate_ratio(thread_level, core_level);
}

static int *procarr = NULL;
static int __kmp_aff_depth = 0;

// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places() {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(__kmp_affinity_type == affinity_none);
  __kmp_affinity_num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
  kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
}

static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors
  // that we know about on the machine.
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
    if (__kmp_affinity_respect_mask) {
      // Count the number of available processors.
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }

      if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      }
    } else {
      if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
#if KMP_OS_WINDOWS
      // Set the process affinity mask since threads' affinity
      // masks must be subset of process mask in Windows* OS
      __kmp_affin_fullMask->set_process_affinity(true);
#endif
    }
  }

  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  bool success = false;
  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or
    // the verbose flag is set, and the nowarnings flag was not set.
#if KMP_USE_HWLOC
    if (!success &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (!__kmp_hwloc_error) {
        success = __kmp_affinity_create_hwloc_map(&msg_id);
        if (!success && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (!success) {
      success = __kmp_affinity_create_x2apicid_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
    if (!success) {
      success = __kmp_affinity_create_apicid_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX
    if (!success) {
      int line = 0;
      success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY
    if (!success && (__kmp_num_proc_groups > 1)) {
      success = __kmp_affinity_create_proc_group_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_GROUP_AFFINITY */

    if (!success) {
      success = __kmp_affinity_create_flat_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
      KMP_ASSERT(success);
    }
  }

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.
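  // For example (illustrative): with KMP_TOPOLOGY_METHOD=cpuinfo only the
  // affinity_top_method_cpuinfo branch below runs, and a failure to parse the
  // cpuinfo file is fatal, whereas the same failure in the default path above
  // merely falls through to the next discovery method and finally the flat
  // map.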
#if KMP_USE_HWLOC
  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
    success = __kmp_affinity_create_hwloc_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif // KMP_USE_HWLOC

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
           __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
    success = __kmp_affinity_create_x2apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    success = __kmp_affinity_create_apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    int line = 0;
    success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      const char *filename = __kmp_cpuinfo_get_filename();
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
  }

#if KMP_GROUP_AFFINITY
  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    success = __kmp_affinity_create_proc_group_map(&msg_id);
    KMP_ASSERT(success);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_GROUP_AFFINITY */

  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
    success = __kmp_affinity_create_flat_map(&msg_id);
    // should not fail
    KMP_ASSERT(success);
  }

  // Early exit if topology could not be created
  if (!__kmp_topology) {
    if (KMP_AFFINITY_CAPABLE() &&
        (__kmp_affinity_verbose ||
         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
      KMP_WARNING(ErrorInitializeAffinity);
    }
    if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
        __kmp_ncores > 0) {
      __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
      __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
                                   __kmp_nThreadsPerCore, __kmp_ncores);
      if (__kmp_affinity_verbose) {
        __kmp_topology->print("KMP_AFFINITY");
      }
    }
    __kmp_affinity_type = affinity_none;
    __kmp_create_affinity_none_places();
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    KMP_AFFINITY_DISABLE();
    return;
  }

  // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
  // initialize other data structures which depend on the topology
  __kmp_topology->canonicalize();
  if (__kmp_affinity_verbose)
    __kmp_topology->print("KMP_AFFINITY");
  bool filtered = __kmp_topology->filter_hw_subset();
  if (filtered && __kmp_affinity_verbose)
    __kmp_topology->print("KMP_HW_SUBSET");
  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
  // If KMP_AFFINITY=none, then only create the single "none" place, which is
  // either the process's initial affinity mask or a mask of all hardware
  // threads, depending on respect/norespect.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_create_affinity_none_places();
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    return;
  }
  int depth = __kmp_topology->get_depth();

  // Create the table of masks, indexed by thread Id.
  unsigned maxIndex;
  unsigned numUnique;
  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
  if (__kmp_affinity_gran_levels == 0) {
    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
  }

  switch (__kmp_affinity_type) {

  case affinity_explicit:
    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
      __kmp_affinity_process_proclist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    } else {
      __kmp_affinity_process_placelist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    }
    if (__kmp_affinity_num_masks == 0) {
      if (__kmp_affinity_verbose ||
          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
        KMP_WARNING(AffNoValidProcID);
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    }
    break;

  // The other affinity types rely on sorting the hardware threads according
  // to some permutation of the machine topology tree. Set
  // __kmp_affinity_compact and __kmp_affinity_offset appropriately, then jump
  // to a common code fragment to do the sort and create the array of affinity
  // masks.
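  // A sketch of the permutation for a hypothetical three-level
  // socket/core/thread topology (depth == 3), not a real machine:
  //  - compact,0: __kmp_affinity_compact stays 0, so compare_compact() keys on
  //    sub_ids[0], sub_ids[1], sub_ids[2] (outermost first) and consecutive
  //    places stay on sibling hardware threads of the same core.
  //  - scatter,0: __kmp_affinity_compact becomes depth - 1 == 2, the keys are
  //    sub_ids[2], sub_ids[1], then sub_ids[0], so consecutive places cycle
  //    across sockets before reusing a core.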
  case affinity_logical:
    __kmp_affinity_compact = 0;
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_physical:
    if (__kmp_nThreadsPerCore > 1) {
      __kmp_affinity_compact = 1;
      if (__kmp_affinity_compact >= depth) {
        __kmp_affinity_compact = 0;
      }
    } else {
      __kmp_affinity_compact = 0;
    }
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_scatter:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = 0;
    } else {
      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
    }
    goto sortTopology;

  case affinity_compact:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }
    goto sortTopology;

  case affinity_balanced:
    if (depth <= 1) {
      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    } else if (!__kmp_topology->is_uniform()) {
      // Save the depth for further usage
      __kmp_aff_depth = depth;

      int core_level =
          __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
      int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
                                                 core_level);
      int maxprocpercore = __kmp_affinity_max_proc_per_core(
          __kmp_avail_proc, depth - 1, core_level);

      int nproc = ncores * maxprocpercore;
      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
        }
        __kmp_affinity_type = affinity_none;
        return;
      }

      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        procarr[i] = -1;
      }

      int lastcore = -1;
      int inlastcore = 0;
      for (int i = 0; i < __kmp_avail_proc; i++) {
        int proc = __kmp_topology->at(i).os_id;
        int core = __kmp_affinity_find_core(i, depth - 1, core_level);

        if (core == lastcore) {
          inlastcore++;
        } else {
          inlastcore = 0;
        }
        lastcore = core;

        procarr[core * maxprocpercore + inlastcore] = proc;
      }
    }
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }

  sortTopology:
    // Allocate the gtid->affinity mask table.
    if (__kmp_affinity_dups) {
      __kmp_affinity_num_masks = __kmp_avail_proc;
    } else {
      __kmp_affinity_num_masks = numUnique;
    }

    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
        (__kmp_affinity_num_places > 0) &&
        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
      __kmp_affinity_num_masks = __kmp_affinity_num_places;
    }

    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

    // Sort the topology table according to the current setting of
    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
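    // When __kmp_affinity_dups is off, only hw threads flagged as leaders
    // (presumably set up per granularity unit by __kmp_create_masks() above)
    // contribute a mask in the loop below, which is why
    // __kmp_affinity_num_masks was set to numUnique in that case.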
    __kmp_topology->sort_compact();
    {
      int i;
      unsigned j;
      int num_hw_threads = __kmp_topology->get_num_hw_threads();
      for (i = 0, j = 0; i < num_hw_threads; i++) {
        if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
          continue;
        }
        int osId = __kmp_topology->at(i).os_id;

        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    // Sort the topology back using ids
    __kmp_topology->sort_ids();
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
}

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none
  // in this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
  __kmp_affinity_num_places = 0;
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  if (__kmp_hw_subset) {
    kmp_hw_subset_t::deallocate(__kmp_hw_subset);
    __kmp_hw_subset = nullptr;
  }
  if (__kmp_topology) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
  }
  KMPAffinity::destroy_api();
}

void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
  kmp_affin_mask_t *mask;
  int i;

  if (KMP_AFFINITY_NON_PROC_BIND) {
    if ((__kmp_affinity_type == affinity_none) ||
        (__kmp_affinity_type == affinity_balanced) ||
        KMP_HIDDEN_HELPER_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = 0;
      mask = __kmp_affin_fullMask;
    } else {
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  } else {
    if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      // int i = some hash function or just a counter that doesn't
      // always start at 0. Use adjusted gtid for now.
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }

  th->th.th_current_place = i;
  if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  } else if (KMP_AFFINITY_NON_PROC_BIND) {
    // When using a Non-OMP_PROC_BIND affinity method,
    // set all threads' place-partition-var to the entire place list
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
                   gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
                   gtid, i));
  }

  KMP_CPU_COPY(th->th.th_affin_mask, mask);

  if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
      /* to avoid duplicate printing (will be correctly printed on barrier) */
      && (__kmp_affinity_type == affinity_none ||
          (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }

#if KMP_DEBUG
  // Hidden helper thread affinity only printed for debug builds
  if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
               (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
  }
#endif

#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  // See CQ171393.
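  // Note: the second argument to __kmp_set_system_affinity() is the
  // abort-on-error flag, so the affinity_none case below tolerates a failure
  // silently (as the comment above explains) while the normal case reports it.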
  if (__kmp_affinity_type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

void __kmp_affinity_set_place(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf(
            "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
            gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}

int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
  kmp_info_t *th;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  th->th.th_affin_mask);
        __kmp_printf(
            "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
            buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_printf(
            "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
            buf);
      });
  return retval;

#else
  (void)retval;

  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}

int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}

int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

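  // All checks passed: proc is in range and present in the full mask, so
  // clear it from the caller-supplied mask, mirroring
  // __kmp_aux_set_affinity_mask_proc() above.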
  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  // Do not perform balanced affinity for the hidden helper threads
  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
    return;

  switch (__kmp_affinity_gran) {
  case KMP_HW_THREAD:
    break;
  case KMP_HW_CORE:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case KMP_HW_SOCKET:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_topology->is_uniform()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to them - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }
    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID =
          __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level =
        __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For a performance gain, consider the special case
    // nthreads == __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = __kmp_topology->at(tid).os_id;
        KMP_CPU_SET(osID, mask);
      } else {
        int core =
            __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = __kmp_topology->at(i).os_id;
          if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
              core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep the number of threads for each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is the GetProcessAffinityMask() API
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
// -1 if we cannot bind the thread
// >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
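/* Illustrative usage sketch for kmp_set_thread_affinity_mask_initial() above
   (not part of the runtime; assumes a Linux caller that saves and restores
   its mask with sched_getaffinity/sched_setaffinity, and handle_error() and
   run_non_openmp_parallel_region() are hypothetical placeholders):

     cpu_set_t saved;
     CPU_ZERO(&saved);
     if (sched_getaffinity(0, sizeof(saved), &saved) != 0) // step 1: save mask
       handle_error();
     int rc = kmp_set_thread_affinity_mask_initial();      // step 2: widen
     if (rc != 0)                                          // step 3: check
       handle_error();
     run_non_openmp_parallel_region();                     // step 4
     sched_setaffinity(0, sizeof(saved), &saved);          // step 5: restore
*/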