/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other. Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency. Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine. There is always at least a root lgroup in
 * the system. It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency. A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root). In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups, where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine. Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
 * (e.g. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread. At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
 * with the lowest load average. Binding to a processor or processor set will
 * change the home lgroup for a thread. The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.
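 *
 * As a rough illustration of the "lowest load average" idea above (this is a
 * simplified sketch only, not the code the kernel actually uses to home
 * threads), picking the least loaded leaf lgroup of a partition "cp" could
 * look like:
 *
 *	lpl_t *pick = NULL;
 *	int i;
 *
 *	for (i = 0; i <= lgrp_alloc_max; i++) {
 *		lpl_t *lpl = &cp->cp_lgrploads[i];
 *
 *		if (lpl->lpl_ncpu == 0 ||
 *		    lpl->lpl_nrset != 1 || lpl->lpl_rset[0] != lpl)
 *			continue;
 *		if (pick == NULL || lpl->lpl_loadavg < pick->lpl_loadavg)
 *			pick = lpl;
 *	}
 *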
 * Physical memory allocation is lgroup aware too, so memory will be allocated
 * from the current thread's home lgroup if possible. If the desired resources
 * are not available, the kernel traverses the lgroup hierarchy, going to the
 * parent lgroup to find resources at the next level of locality until it
 * reaches the root lgroup.
 */

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/chip.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in the lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework. It is protected from
 * parallel modifications by lgrp_kstat_mutex. This may cause some contention
 * when several kstat commands run in parallel, but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap, cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is
 * brought on-line, when cp_default is initialized by
 * cpupart_initialize_default(). Configuring CPU0 may create a two-level
 * topology with root and one leaf node containing CPU0. This topology is
 * initially constructed in a special statically allocated 2-element lpl list
 * lpl_bootstrap_list and later cloned to cp_default when cp_default is
 * initialized. The lpl_bootstrap_list is used for all lpl operations until
 * cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer that needs a default lpl should use lpl_bootstrap, which is a
 * pointer to the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp, will use lpl_bootstrap as a default lpl.
 * This is necessary because on some architectures (x86) it is possible for
 * the slave CPU startup thread to enter the dispatcher or allocate memory
 * before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory. Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory. Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for the root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy. For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
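 *
 * A platform that wants a different default can simply assign to this
 * variable from its own initialization code (illustrative example only):
 *
 *	lgrp_mem_default_policy = LGRP_MEM_POLICY_RANDOM;
 *
 * lgrp_main_init() below validates the setting and falls back to
 * LGRP_MEM_POLICY_NEXT if it is out of range.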
204 */ 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 206 207 208 /* 209 * lgroup CPU event handlers 210 */ 211 static void lgrp_cpu_init(struct cpu *); 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 214 215 /* 216 * lgroup memory event handlers 217 */ 218 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 219 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 220 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 221 222 /* 223 * lgroup CPU partition event handlers 224 */ 225 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 226 static void lgrp_part_del_cpu(struct cpu *); 227 228 static void lgrp_root_init(void); 229 230 /* 231 * lpl topology 232 */ 233 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 234 static void lpl_clear(lpl_t *); 235 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 236 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 237 static void lpl_rset_add(lpl_t *, lpl_t *); 238 static void lpl_rset_del(lpl_t *, lpl_t *); 239 static int lpl_rset_contains(lpl_t *, lpl_t *); 240 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 241 static void lpl_child_update(lpl_t *, struct cpupart *); 242 static int lpl_pick(lpl_t *, lpl_t *); 243 static void lpl_verify_wrapper(struct cpupart *); 244 245 /* 246 * defines for lpl topology verifier return codes 247 */ 248 249 #define LPL_TOPO_CORRECT 0 250 #define LPL_TOPO_PART_HAS_NO_LPL -1 251 #define LPL_TOPO_CPUS_NOT_EMPTY -2 252 #define LPL_TOPO_LGRP_MISMATCH -3 253 #define LPL_TOPO_MISSING_PARENT -4 254 #define LPL_TOPO_PARENT_MISMATCH -5 255 #define LPL_TOPO_BAD_CPUCNT -6 256 #define LPL_TOPO_RSET_MISMATCH -7 257 #define LPL_TOPO_LPL_ORPHANED -8 258 #define LPL_TOPO_LPL_BAD_NCPU -9 259 #define LPL_TOPO_RSET_MSSNG_LF -10 260 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 261 #define LPL_TOPO_BOGUS_HINT -12 262 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 263 #define LPL_TOPO_LGRP_NOT_LEAF -14 264 #define LPL_TOPO_BAD_RSETCNT -15 265 266 /* 267 * Return whether lgroup optimizations should be enabled on this system 268 */ 269 int 270 lgrp_optimizations(void) 271 { 272 /* 273 * System must have more than 2 lgroups to enable lgroup optimizations 274 * 275 * XXX This assumes that a 2 lgroup system has an empty root lgroup 276 * with one child lgroup containing all the resources. A 2 lgroup 277 * system with a root lgroup directly containing CPUs or memory might 278 * need lgroup optimizations with its child lgroup, but there 279 * isn't such a machine for now.... 
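 *
 * Callers typically use the return value as a cheap gate around NUMA-aware
 * work, along the lines of (sketch; do_lgrp_aware_placement() is just a
 * hypothetical helper):
 *
 *	if (lgrp_optimizations())
 *		do_lgrp_aware_placement();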
280 */ 281 if (nlgrps > 2) 282 return (1); 283 284 return (0); 285 } 286 287 /* 288 * Build full lgroup topology 289 */ 290 static void 291 lgrp_root_init(void) 292 { 293 lgrp_handle_t hand; 294 int i; 295 lgrp_id_t id; 296 297 /* 298 * Create the "root" lgroup 299 */ 300 ASSERT(nlgrps == 0); 301 id = nlgrps++; 302 303 lgrp_root = &lroot; 304 305 lgrp_root->lgrp_cpu = NULL; 306 lgrp_root->lgrp_mnodes = 0; 307 lgrp_root->lgrp_nmnodes = 0; 308 hand = lgrp_plat_root_hand(); 309 lgrp_root->lgrp_plathand = hand; 310 311 lgrp_root->lgrp_id = id; 312 lgrp_root->lgrp_cpucnt = 0; 313 lgrp_root->lgrp_childcnt = 0; 314 klgrpset_clear(lgrp_root->lgrp_children); 315 klgrpset_clear(lgrp_root->lgrp_leaves); 316 lgrp_root->lgrp_parent = NULL; 317 lgrp_root->lgrp_chips = NULL; 318 lgrp_root->lgrp_chipcnt = 0; 319 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 320 321 for (i = 0; i < LGRP_RSRC_COUNT; i++) 322 klgrpset_clear(lgrp_root->lgrp_set[i]); 323 324 lgrp_root->lgrp_kstat = NULL; 325 326 lgrp_table[id] = lgrp_root; 327 328 /* 329 * Setup initial lpl list for CPU0 and initial t0 home. 330 * The only lpl space we have so far is lpl_bootstrap. It is used for 331 * all topology operations until cp_default is initialized at which 332 * point t0.t_lpl will be updated. 333 */ 334 lpl_bootstrap = lpl_bootstrap_list; 335 t0.t_lpl = lpl_bootstrap; 336 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 337 lpl_bootstrap_list[1].lpl_lgrpid = 1; 338 cp_default.cp_lgrploads = lpl_bootstrap; 339 } 340 341 /* 342 * Initialize the lgroup framework and allow the platform to do the same 343 */ 344 void 345 lgrp_init(void) 346 { 347 /* 348 * Initialize the platform 349 */ 350 lgrp_plat_init(); 351 352 /* 353 * Set max number of lgroups supported on this platform which must be 354 * less than the max number of lgroups supported by the common lgroup 355 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 356 */ 357 nlgrpsmax = lgrp_plat_max_lgrps(); 358 ASSERT(nlgrpsmax <= NLGRPS_MAX); 359 } 360 361 /* 362 * Create the root and cpu0's lgroup, and set t0's home. 363 */ 364 void 365 lgrp_setup(void) 366 { 367 /* 368 * Setup the root lgroup 369 */ 370 lgrp_root_init(); 371 372 /* 373 * Add cpu0 to an lgroup 374 */ 375 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 376 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 377 } 378 379 /* 380 * Lgroup initialization is split in two parts. The first part 381 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 382 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 383 * when all CPUs are brought online and all distance information is available. 384 * 385 * When lgrp_main_init() is complete it sets lgrp_initialized. The 386 * lgrp_main_mp_init() sets lgrp_topo_initialized. 387 */ 388 389 /* 390 * true when lgrp initialization has been completed. 391 */ 392 int lgrp_initialized = 0; 393 394 /* 395 * True when lgrp topology is constructed. 396 */ 397 int lgrp_topo_initialized = 0; 398 399 /* 400 * Init routine called after startup(), /etc/system has been processed, 401 * and cpu0 has been added to an lgroup. 402 */ 403 void 404 lgrp_main_init(void) 405 { 406 cpu_t *cp = CPU; 407 lgrp_id_t lgrpid; 408 int i; 409 /* 410 * Enforce a valid lgrp_mem_default_policy 411 */ 412 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 413 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES)) 414 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 415 416 /* 417 * See if mpo should be disabled. 
418 * This may happen in the case of null proc LPA on Starcat. 419 * The platform won't be able to detect null proc LPA until after 420 * cpu0 and memory have already been added to lgroups. 421 * When and if it is detected, the Starcat platform will return 422 * a different platform handle for cpu0 which is what we check for 423 * here. If mpo should be disabled move cpu0 to it's rightful place 424 * (the root), and destroy the remaining lgroups. This effectively 425 * provides an UMA lgroup topology. 426 */ 427 lgrpid = cp->cpu_lpl->lpl_lgrpid; 428 if (lgrp_table[lgrpid]->lgrp_plathand != 429 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 430 lgrp_part_del_cpu(cp); 431 lgrp_cpu_fini(cp, lgrpid); 432 433 lgrp_cpu_init(cp); 434 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 435 436 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 437 438 /* 439 * Destroy all lgroups except for root 440 */ 441 for (i = 0; i <= lgrp_alloc_max; i++) { 442 if (LGRP_EXISTS(lgrp_table[i]) && 443 lgrp_table[i] != lgrp_root) 444 lgrp_destroy(lgrp_table[i]); 445 } 446 447 /* 448 * Fix up root to point at itself for leaves and resources 449 * and not have any children 450 */ 451 lgrp_root->lgrp_childcnt = 0; 452 klgrpset_clear(lgrp_root->lgrp_children); 453 klgrpset_clear(lgrp_root->lgrp_leaves); 454 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 455 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 456 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 457 } 458 459 /* 460 * Initialize kstats framework. 461 */ 462 lgrp_kstat_init(); 463 /* 464 * cpu0 is finally where it should be, so create it's lgroup's kstats 465 */ 466 mutex_enter(&cpu_lock); 467 lgrp_kstat_create(cp); 468 mutex_exit(&cpu_lock); 469 470 lgrp_plat_main_init(); 471 lgrp_initialized = 1; 472 } 473 474 /* 475 * Finish lgrp initialization after all CPUS are brought on-line. 476 * This routine is called after start_other_cpus(). 477 */ 478 void 479 lgrp_main_mp_init(void) 480 { 481 klgrpset_t changed; 482 483 /* 484 * Update lgroup topology (if necessary) 485 */ 486 klgrpset_clear(changed); 487 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 488 lgrp_topo_initialized = 1; 489 } 490 491 /* 492 * Change latency of lgroup with specified lgroup platform handle (if one is 493 * given) or change all lgroups with old latency to new latency 494 */ 495 void 496 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, 497 u_longlong_t newtime) 498 { 499 lgrp_t *lgrp; 500 int i; 501 502 for (i = 0; i <= lgrp_alloc_max; i++) { 503 lgrp = lgrp_table[i]; 504 505 if (!LGRP_EXISTS(lgrp)) 506 continue; 507 508 if ((hand == LGRP_NULL_HANDLE && 509 lgrp->lgrp_latency == oldtime) || 510 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) 511 lgrp->lgrp_latency = (int)newtime; 512 } 513 } 514 515 /* 516 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 517 */ 518 void 519 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 520 { 521 klgrpset_t changed; 522 cpu_t *cp; 523 lgrp_id_t id; 524 int rc; 525 526 switch (event) { 527 /* 528 * The following (re)configuration events are common code 529 * initiated. lgrp_plat_config() is called here to inform the 530 * platform of the reconfiguration event. 531 */ 532 case LGRP_CONFIG_CPU_ADD: 533 cp = (cpu_t *)resource; 534 535 /* 536 * Initialize the new CPU's lgrp related next/prev 537 * links, and give it a bootstrap lpl so that it can 538 * survive should it need to enter the dispatcher. 
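 *
 * (This event is raised through lgrp_config(); for example, lgrp_setup()
 * above does lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0) to add
 * cpu0 before it is onlined.)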
539 */ 540 cp->cpu_next_lpl = cp; 541 cp->cpu_prev_lpl = cp; 542 cp->cpu_next_lgrp = cp; 543 cp->cpu_prev_lgrp = cp; 544 cp->cpu_lpl = lpl_bootstrap; 545 546 lgrp_plat_config(event, resource); 547 atomic_add_32(&lgrp_gen, 1); 548 549 break; 550 case LGRP_CONFIG_CPU_DEL: 551 lgrp_plat_config(event, resource); 552 atomic_add_32(&lgrp_gen, 1); 553 554 break; 555 case LGRP_CONFIG_CPU_ONLINE: 556 cp = (cpu_t *)resource; 557 lgrp_cpu_init(cp); 558 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 559 rc = lpl_topo_verify(cp->cpu_part); 560 if (rc != LPL_TOPO_CORRECT) { 561 panic("lpl_topo_verify failed: %d", rc); 562 } 563 lgrp_plat_config(event, resource); 564 atomic_add_32(&lgrp_gen, 1); 565 566 break; 567 case LGRP_CONFIG_CPU_OFFLINE: 568 cp = (cpu_t *)resource; 569 id = cp->cpu_lpl->lpl_lgrpid; 570 lgrp_part_del_cpu(cp); 571 lgrp_cpu_fini(cp, id); 572 rc = lpl_topo_verify(cp->cpu_part); 573 if (rc != LPL_TOPO_CORRECT) { 574 panic("lpl_topo_verify failed: %d", rc); 575 } 576 lgrp_plat_config(event, resource); 577 atomic_add_32(&lgrp_gen, 1); 578 579 break; 580 case LGRP_CONFIG_CPUPART_ADD: 581 cp = (cpu_t *)resource; 582 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 583 rc = lpl_topo_verify(cp->cpu_part); 584 if (rc != LPL_TOPO_CORRECT) { 585 panic("lpl_topo_verify failed: %d", rc); 586 } 587 lgrp_plat_config(event, resource); 588 589 break; 590 case LGRP_CONFIG_CPUPART_DEL: 591 cp = (cpu_t *)resource; 592 lgrp_part_del_cpu((cpu_t *)resource); 593 rc = lpl_topo_verify(cp->cpu_part); 594 if (rc != LPL_TOPO_CORRECT) { 595 panic("lpl_topo_verify failed: %d", rc); 596 } 597 lgrp_plat_config(event, resource); 598 599 break; 600 /* 601 * The following events are initiated by the memnode 602 * subsystem. 603 */ 604 case LGRP_CONFIG_MEM_ADD: 605 lgrp_mem_init((int)resource, where, B_FALSE); 606 atomic_add_32(&lgrp_gen, 1); 607 608 break; 609 case LGRP_CONFIG_MEM_DEL: 610 lgrp_mem_fini((int)resource, where, B_FALSE); 611 atomic_add_32(&lgrp_gen, 1); 612 613 break; 614 case LGRP_CONFIG_MEM_RENAME: { 615 lgrp_config_mem_rename_t *ren_arg = 616 (lgrp_config_mem_rename_t *)where; 617 618 lgrp_mem_rename((int)resource, 619 ren_arg->lmem_rename_from, 620 ren_arg->lmem_rename_to); 621 atomic_add_32(&lgrp_gen, 1); 622 623 break; 624 } 625 case LGRP_CONFIG_GEN_UPDATE: 626 atomic_add_32(&lgrp_gen, 1); 627 628 break; 629 case LGRP_CONFIG_FLATTEN: 630 if (where == 0) 631 lgrp_topo_levels = (int)resource; 632 else 633 (void) lgrp_topo_flatten(resource, 634 lgrp_table, lgrp_alloc_max, &changed); 635 636 break; 637 /* 638 * Update any lgroups with old latency to new latency 639 */ 640 case LGRP_CONFIG_LAT_CHANGE_ALL: 641 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 642 (u_longlong_t)where); 643 644 break; 645 /* 646 * Update lgroup with specified lgroup platform handle to have 647 * new latency 648 */ 649 case LGRP_CONFIG_LAT_CHANGE: 650 lgrp_latency_change((lgrp_handle_t)resource, 0, 651 (u_longlong_t)where); 652 653 break; 654 case LGRP_CONFIG_NOP: 655 656 break; 657 default: 658 break; 659 } 660 661 } 662 663 /* 664 * Called to add lgrp info into cpu structure from cpu_add_unit; 665 * do not assume cpu is in cpu[] yet! 666 * 667 * CPUs are brought online with all other CPUs paused so we can't 668 * allocate memory or we could deadlock the system, so we rely on 669 * the platform to statically allocate as much space as we need 670 * for the lgrp structs and stats. 
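 *
 * A platform typically satisfies this by handing out lgrp_t structures from
 * a statically sized pool in its lgrp_plat_alloc() routine (which
 * lgrp_create() below calls), conceptually something like this sketch (not
 * any particular platform's actual code):
 *
 *	static lgrp_t lgrp_space[NLGRPS_MAX];
 *
 *	lgrp_t *
 *	lgrp_plat_alloc(lgrp_id_t lgrpid)
 *	{
 *		return (lgrpid < NLGRPS_MAX ? &lgrp_space[lgrpid] : NULL);
 *	}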
671 */ 672 static void 673 lgrp_cpu_init(struct cpu *cp) 674 { 675 klgrpset_t changed; 676 int count; 677 lgrp_handle_t hand; 678 int first_cpu; 679 lgrp_t *my_lgrp; 680 lgrp_id_t lgrpid; 681 struct cpu *cptr; 682 struct chip *chp; 683 684 /* 685 * This is the first time through if the resource set 686 * for the root lgroup is empty. After cpu0 has been 687 * initially added to an lgroup, the root's CPU resource 688 * set can never be empty, since the system's last CPU 689 * cannot be offlined. 690 */ 691 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 692 /* 693 * First time through. 694 */ 695 first_cpu = 1; 696 } else { 697 /* 698 * If cpu0 needs to move lgroups, we may come 699 * through here again, at which time cpu_lock won't 700 * be held, and lgrp_initialized will be false. 701 */ 702 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 703 ASSERT(cp->cpu_part != NULL); 704 first_cpu = 0; 705 } 706 707 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 708 my_lgrp = lgrp_hand_to_lgrp(hand); 709 710 if (my_lgrp == NULL) { 711 /* 712 * Create new lgrp and add it to lgroup topology 713 */ 714 my_lgrp = lgrp_create(); 715 my_lgrp->lgrp_plathand = hand; 716 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 717 lgrpid = my_lgrp->lgrp_id; 718 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 719 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 720 721 count = 0; 722 klgrpset_clear(changed); 723 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 724 &changed); 725 /* 726 * May have added new intermediate lgroups, so need to add 727 * resources other than CPUs which are added below 728 */ 729 (void) lgrp_mnode_update(changed, NULL); 730 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 731 > 0) { 732 /* 733 * Leaf lgroup was created, but latency wasn't available 734 * then. So, set latency for it and fill in rest of lgroup 735 * topology now that we know how far it is from other leaf 736 * lgroups. 737 */ 738 lgrpid = my_lgrp->lgrp_id; 739 klgrpset_clear(changed); 740 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 741 lgrpid)) 742 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 743 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 744 &changed); 745 746 /* 747 * May have added new intermediate lgroups, so need to add 748 * resources other than CPUs which are added below 749 */ 750 (void) lgrp_mnode_update(changed, NULL); 751 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 752 my_lgrp->lgrp_id)) { 753 int i; 754 755 /* 756 * Update existing lgroup and lgroups containing it with CPU 757 * resource 758 */ 759 lgrpid = my_lgrp->lgrp_id; 760 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 761 for (i = 0; i <= lgrp_alloc_max; i++) { 762 lgrp_t *lgrp; 763 764 lgrp = lgrp_table[i]; 765 if (!LGRP_EXISTS(lgrp) || 766 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 767 continue; 768 769 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 770 } 771 } 772 773 lgrpid = my_lgrp->lgrp_id; 774 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 775 776 /* 777 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 778 * end up in lpl for lgroup 0 whether it is supposed to be in there or 779 * not since none of lgroup IDs in the lpl's have been set yet. 
780 */ 781 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 782 cp->cpu_lpl->lpl_lgrpid = lgrpid; 783 784 /* 785 * link the CPU into the lgrp's CPU list 786 */ 787 if (my_lgrp->lgrp_cpucnt == 0) { 788 my_lgrp->lgrp_cpu = cp; 789 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 790 } else { 791 cptr = my_lgrp->lgrp_cpu; 792 cp->cpu_next_lgrp = cptr; 793 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 794 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 795 cptr->cpu_prev_lgrp = cp; 796 } 797 my_lgrp->lgrp_cpucnt++; 798 799 /* 800 * Add this cpu's chip to the per lgroup list 801 * if necessary 802 */ 803 if (cp->cpu_chip->chip_lgrp == NULL) { 804 struct chip *lcpr; 805 806 chp = cp->cpu_chip; 807 808 if (my_lgrp->lgrp_chipcnt == 0) { 809 my_lgrp->lgrp_chips = chp; 810 chp->chip_next_lgrp = 811 chp->chip_prev_lgrp = chp; 812 } else { 813 lcpr = my_lgrp->lgrp_chips; 814 chp->chip_next_lgrp = lcpr; 815 chp->chip_prev_lgrp = 816 lcpr->chip_prev_lgrp; 817 lcpr->chip_prev_lgrp->chip_next_lgrp = 818 chp; 819 lcpr->chip_prev_lgrp = chp; 820 } 821 chp->chip_lgrp = my_lgrp; 822 chp->chip_balance = chp->chip_next_lgrp; 823 my_lgrp->lgrp_chipcnt++; 824 } 825 } 826 827 lgrp_t * 828 lgrp_create(void) 829 { 830 lgrp_t *my_lgrp; 831 lgrp_id_t lgrpid; 832 int i; 833 834 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 835 836 /* 837 * Find an open slot in the lgroup table and recycle unused lgroup 838 * left there if any 839 */ 840 my_lgrp = NULL; 841 if (lgrp_alloc_hint == -1) 842 /* 843 * Allocate from end when hint not set yet because no lgroups 844 * have been deleted yet 845 */ 846 lgrpid = nlgrps++; 847 else { 848 /* 849 * Start looking for next open slot from hint and leave hint 850 * at slot allocated 851 */ 852 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 853 my_lgrp = lgrp_table[i]; 854 if (!LGRP_EXISTS(my_lgrp)) { 855 lgrpid = i; 856 nlgrps++; 857 break; 858 } 859 } 860 lgrp_alloc_hint = lgrpid; 861 } 862 863 /* 864 * Keep track of max lgroup ID allocated so far to cut down on searches 865 */ 866 if (lgrpid > lgrp_alloc_max) 867 lgrp_alloc_max = lgrpid; 868 869 /* 870 * Need to allocate new lgroup if next open slot didn't have one 871 * for recycling 872 */ 873 if (my_lgrp == NULL) 874 my_lgrp = lgrp_plat_alloc(lgrpid); 875 876 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 877 panic("Too many lgrps for platform (%d)", nlgrps); 878 879 my_lgrp->lgrp_id = lgrpid; 880 my_lgrp->lgrp_latency = 0; 881 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 882 my_lgrp->lgrp_parent = NULL; 883 my_lgrp->lgrp_childcnt = 0; 884 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 885 my_lgrp->lgrp_nmnodes = 0; 886 klgrpset_clear(my_lgrp->lgrp_children); 887 klgrpset_clear(my_lgrp->lgrp_leaves); 888 for (i = 0; i < LGRP_RSRC_COUNT; i++) 889 klgrpset_clear(my_lgrp->lgrp_set[i]); 890 891 my_lgrp->lgrp_cpu = NULL; 892 my_lgrp->lgrp_cpucnt = 0; 893 my_lgrp->lgrp_chips = NULL; 894 my_lgrp->lgrp_chipcnt = 0; 895 896 if (my_lgrp->lgrp_kstat != NULL) 897 lgrp_kstat_reset(lgrpid); 898 899 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 900 901 return (my_lgrp); 902 } 903 904 void 905 lgrp_destroy(lgrp_t *lgrp) 906 { 907 int i; 908 909 /* 910 * Unless this lgroup is being destroyed on behalf of 911 * the boot CPU, cpu_lock must be held 912 */ 913 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 914 915 if (nlgrps == 1) 916 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 917 918 if (!LGRP_EXISTS(lgrp)) 919 return; 920 921 /* 922 * Set hint to lgroup being deleted and try to keep lower numbered 923 * hints to facilitate finding empty slots 924 */ 
925 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 926 lgrp_alloc_hint = lgrp->lgrp_id; 927 928 /* 929 * Mark this lgroup to be recycled by setting its lgroup ID to 930 * LGRP_NONE and clear relevant fields 931 */ 932 lgrp->lgrp_id = LGRP_NONE; 933 lgrp->lgrp_latency = 0; 934 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 935 lgrp->lgrp_parent = NULL; 936 lgrp->lgrp_childcnt = 0; 937 938 klgrpset_clear(lgrp->lgrp_children); 939 klgrpset_clear(lgrp->lgrp_leaves); 940 for (i = 0; i < LGRP_RSRC_COUNT; i++) 941 klgrpset_clear(lgrp->lgrp_set[i]); 942 943 lgrp->lgrp_mnodes = (mnodeset_t)0; 944 lgrp->lgrp_nmnodes = 0; 945 946 lgrp->lgrp_cpu = NULL; 947 lgrp->lgrp_cpucnt = 0; 948 lgrp->lgrp_chipcnt = 0; 949 lgrp->lgrp_chips = NULL; 950 951 nlgrps--; 952 } 953 954 /* 955 * Initialize kstat data. Called from lgrp intialization code. 956 */ 957 static void 958 lgrp_kstat_init(void) 959 { 960 lgrp_stat_t stat; 961 962 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 963 964 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 965 kstat_named_init(&lgrp_kstat_data[stat], 966 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 967 } 968 969 /* 970 * initialize an lgrp's kstats if needed 971 * called with cpu_lock held but not with cpus paused. 972 * we don't tear these down now because we don't know about 973 * memory leaving the lgrp yet... 974 */ 975 976 void 977 lgrp_kstat_create(cpu_t *cp) 978 { 979 kstat_t *lgrp_kstat; 980 lgrp_id_t lgrpid; 981 lgrp_t *my_lgrp; 982 983 ASSERT(MUTEX_HELD(&cpu_lock)); 984 985 lgrpid = cp->cpu_lpl->lpl_lgrpid; 986 my_lgrp = lgrp_table[lgrpid]; 987 988 if (my_lgrp->lgrp_kstat != NULL) 989 return; /* already initialized */ 990 991 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 992 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 993 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 994 995 if (lgrp_kstat != NULL) { 996 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 997 lgrp_kstat->ks_private = my_lgrp; 998 lgrp_kstat->ks_data = &lgrp_kstat_data; 999 lgrp_kstat->ks_update = lgrp_kstat_extract; 1000 my_lgrp->lgrp_kstat = lgrp_kstat; 1001 kstat_install(lgrp_kstat); 1002 } 1003 } 1004 1005 /* 1006 * this will do something when we manage to remove now unused lgrps 1007 */ 1008 1009 /* ARGSUSED */ 1010 void 1011 lgrp_kstat_destroy(cpu_t *cp) 1012 { 1013 ASSERT(MUTEX_HELD(&cpu_lock)); 1014 } 1015 1016 /* 1017 * Called when a CPU is off-lined. 1018 */ 1019 static void 1020 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 1021 { 1022 lgrp_t *my_lgrp; 1023 struct cpu *prev; 1024 struct cpu *next; 1025 chip_t *chp; 1026 1027 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 1028 1029 prev = cp->cpu_prev_lgrp; 1030 next = cp->cpu_next_lgrp; 1031 1032 prev->cpu_next_lgrp = next; 1033 next->cpu_prev_lgrp = prev; 1034 1035 /* 1036 * just because I'm paranoid doesn't mean... 1037 */ 1038 1039 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1040 1041 my_lgrp = lgrp_table[lgrpid]; 1042 my_lgrp->lgrp_cpucnt--; 1043 1044 /* 1045 * If the last CPU on it's chip is being offlined 1046 * then remove this chip from the per lgroup list. 1047 * 1048 * This is also done for the boot CPU when it needs 1049 * to move between lgroups as a consequence of 1050 * null proc lpa. 
1051 */ 1052 chp = cp->cpu_chip; 1053 if (chp->chip_ncpu == 0 || !lgrp_initialized) { 1054 1055 chip_t *chpp; 1056 1057 if (--my_lgrp->lgrp_chipcnt == 0) 1058 my_lgrp->lgrp_chips = NULL; 1059 else if (my_lgrp->lgrp_chips == chp) 1060 my_lgrp->lgrp_chips = chp->chip_next_lgrp; 1061 1062 /* 1063 * Walk this lgroup's chip list looking for chips that 1064 * may try to balance against the one that's leaving 1065 */ 1066 for (chpp = chp->chip_next_lgrp; chpp != chp; 1067 chpp = chpp->chip_next_lgrp) { 1068 if (chpp->chip_balance == chp) 1069 chpp->chip_balance = chp->chip_next_lgrp; 1070 } 1071 1072 chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp; 1073 chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp; 1074 1075 chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL; 1076 chp->chip_lgrp = NULL; 1077 chp->chip_balance = NULL; 1078 } 1079 1080 /* 1081 * Removing last CPU in lgroup, so update lgroup topology 1082 */ 1083 if (my_lgrp->lgrp_cpucnt == 0) { 1084 klgrpset_t changed; 1085 int count; 1086 int i; 1087 1088 my_lgrp->lgrp_cpu = NULL; 1089 1090 /* 1091 * Remove this lgroup from its lgroup CPU resources and remove 1092 * lgroup from lgroup topology if it doesn't have any more 1093 * resources in it now 1094 */ 1095 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1096 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1097 count = 0; 1098 klgrpset_clear(changed); 1099 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1100 lgrp_alloc_max + 1, &changed); 1101 return; 1102 } 1103 1104 /* 1105 * This lgroup isn't empty, so just remove it from CPU 1106 * resources of any lgroups that contain it as such 1107 */ 1108 for (i = 0; i <= lgrp_alloc_max; i++) { 1109 lgrp_t *lgrp; 1110 1111 lgrp = lgrp_table[i]; 1112 if (!LGRP_EXISTS(lgrp) || 1113 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1114 lgrpid)) 1115 continue; 1116 1117 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1118 } 1119 return; 1120 } 1121 1122 if (my_lgrp->lgrp_cpu == cp) 1123 my_lgrp->lgrp_cpu = next; 1124 1125 } 1126 1127 /* 1128 * Update memory nodes in target lgroups and return ones that get changed 1129 */ 1130 int 1131 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1132 { 1133 int count; 1134 int i; 1135 int j; 1136 lgrp_t *lgrp; 1137 lgrp_t *lgrp_rsrc; 1138 1139 count = 0; 1140 if (changed) 1141 klgrpset_clear(*changed); 1142 1143 if (klgrpset_isempty(target)) 1144 return (0); 1145 1146 /* 1147 * Find each lgroup in target lgroups 1148 */ 1149 for (i = 0; i <= lgrp_alloc_max; i++) { 1150 /* 1151 * Skip any lgroups that don't exist or aren't in target group 1152 */ 1153 lgrp = lgrp_table[i]; 1154 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1155 continue; 1156 } 1157 1158 /* 1159 * Initialize memnodes for intermediate lgroups to 0 1160 * and update them from scratch since they may have completely 1161 * changed 1162 */ 1163 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1164 lgrp->lgrp_mnodes = (mnodeset_t)0; 1165 lgrp->lgrp_nmnodes = 0; 1166 } 1167 1168 /* 1169 * Update memory nodes of of target lgroup with memory nodes 1170 * from each lgroup in its lgroup memory resource set 1171 */ 1172 for (j = 0; j <= lgrp_alloc_max; j++) { 1173 int k; 1174 1175 /* 1176 * Skip any lgroups that don't exist or aren't in 1177 * memory resources of target lgroup 1178 */ 1179 lgrp_rsrc = lgrp_table[j]; 1180 if (!LGRP_EXISTS(lgrp_rsrc) || 1181 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1182 j)) 1183 continue; 1184 1185 /* 1186 * Update target lgroup's memnodes to include memnodes 1187 * of this 
			 * lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}

/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage
 * memory is moved from one board to another. The "from" and "to" arguments
 * specify the source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() function is called by the platform copy-rename code
 * to update the lgroup topology, which is changing as memory moves from one
 * lgroup to another. It removes the mnode from the source lgroup and
 * re-inserts it in the target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling them that the insertion and deletion are part of a
 * DR copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held, which prevents lgrp_mem_init() from
 * re-inserting the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
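 *
 * For example (worked case): on a system whose only memory node is mnode 0,
 * both lgrp_root->lgrp_mnodes and mnodes_mask are 0x1 when the copy-rename
 * insertion happens. The plain "already configured" test below would then
 * return early and the mnode would never be re-attached under its new
 * handle, so the is_copy_rename test recognizes this case and lets the
 * topology update proceed (see the NOTE below on what is skipped).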
1267 */ 1268 void 1269 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1270 { 1271 klgrpset_t changed; 1272 int count; 1273 int i; 1274 lgrp_t *my_lgrp; 1275 lgrp_id_t lgrpid; 1276 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1277 boolean_t drop_lock = B_FALSE; 1278 boolean_t need_synch = B_FALSE; 1279 1280 /* 1281 * Grab CPU lock (if we haven't already) 1282 */ 1283 if (!MUTEX_HELD(&cpu_lock)) { 1284 mutex_enter(&cpu_lock); 1285 drop_lock = B_TRUE; 1286 } 1287 1288 /* 1289 * This routine may be called from a context where we already 1290 * hold cpu_lock, and have already paused cpus. 1291 */ 1292 if (!cpus_paused()) 1293 need_synch = B_TRUE; 1294 1295 /* 1296 * Check if this mnode is already configured and return immediately if 1297 * it is. 1298 * 1299 * NOTE: in special case of copy-rename of the only remaining mnode, 1300 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1301 * recognize this case and continue as usual, but skip the update to 1302 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1303 * in topology, temporarily introduced by lgrp_mem_fini(). 1304 */ 1305 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1306 lgrp_root->lgrp_mnodes & mnodes_mask) { 1307 if (drop_lock) 1308 mutex_exit(&cpu_lock); 1309 return; 1310 } 1311 1312 /* 1313 * Update lgroup topology with new memory resources, keeping track of 1314 * which lgroups change 1315 */ 1316 count = 0; 1317 klgrpset_clear(changed); 1318 my_lgrp = lgrp_hand_to_lgrp(hand); 1319 if (my_lgrp == NULL) { 1320 /* new lgrp */ 1321 my_lgrp = lgrp_create(); 1322 lgrpid = my_lgrp->lgrp_id; 1323 my_lgrp->lgrp_plathand = hand; 1324 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1325 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1326 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1327 1328 if (need_synch) 1329 pause_cpus(NULL); 1330 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1331 &changed); 1332 if (need_synch) 1333 start_cpus(); 1334 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1335 > 0) { 1336 /* 1337 * Leaf lgroup was created, but latency wasn't available 1338 * then. So, set latency for it and fill in rest of lgroup 1339 * topology now that we know how far it is from other leaf 1340 * lgroups. 
1341 */ 1342 klgrpset_clear(changed); 1343 lgrpid = my_lgrp->lgrp_id; 1344 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1345 lgrpid)) 1346 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1347 if (need_synch) 1348 pause_cpus(NULL); 1349 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1350 &changed); 1351 if (need_synch) 1352 start_cpus(); 1353 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1354 my_lgrp->lgrp_id)) { 1355 /* 1356 * Add new lgroup memory resource to existing lgroup 1357 */ 1358 lgrpid = my_lgrp->lgrp_id; 1359 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1360 klgrpset_add(changed, lgrpid); 1361 count++; 1362 for (i = 0; i <= lgrp_alloc_max; i++) { 1363 lgrp_t *lgrp; 1364 1365 lgrp = lgrp_table[i]; 1366 if (!LGRP_EXISTS(lgrp) || 1367 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1368 continue; 1369 1370 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1371 klgrpset_add(changed, lgrp->lgrp_id); 1372 count++; 1373 } 1374 } 1375 1376 /* 1377 * Add memory node to lgroup and remove lgroup from ones that need 1378 * to be updated 1379 */ 1380 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1381 my_lgrp->lgrp_mnodes |= mnodes_mask; 1382 my_lgrp->lgrp_nmnodes++; 1383 } 1384 klgrpset_del(changed, lgrpid); 1385 1386 /* 1387 * Update memory node information for all lgroups that changed and 1388 * contain new memory node as a resource 1389 */ 1390 if (count) 1391 (void) lgrp_mnode_update(changed, NULL); 1392 1393 if (drop_lock) 1394 mutex_exit(&cpu_lock); 1395 } 1396 1397 /* 1398 * Called to indicate that the lgroup associated with the platform 1399 * handle "hand" no longer contains given memory node 1400 * 1401 * LOCKING for this routine is a bit tricky. Usually it is called without 1402 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1403 * callers. During DR of the board containing the caged memory it may be called 1404 * with cpu_lock already held and CPUs paused. 1405 * 1406 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1407 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1408 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1409 * the same mnode back into the topology. See lgrp_mem_rename() and 1410 * lgrp_mem_init() for additional details. 1411 */ 1412 void 1413 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1414 { 1415 klgrpset_t changed; 1416 int count; 1417 int i; 1418 lgrp_t *my_lgrp; 1419 lgrp_id_t lgrpid; 1420 mnodeset_t mnodes_mask; 1421 boolean_t drop_lock = B_FALSE; 1422 boolean_t need_synch = B_FALSE; 1423 1424 /* 1425 * Grab CPU lock (if we haven't already) 1426 */ 1427 if (!MUTEX_HELD(&cpu_lock)) { 1428 mutex_enter(&cpu_lock); 1429 drop_lock = B_TRUE; 1430 } 1431 1432 /* 1433 * This routine may be called from a context where we already 1434 * hold cpu_lock and have already paused cpus. 
1435 */ 1436 if (!cpus_paused()) 1437 need_synch = B_TRUE; 1438 1439 my_lgrp = lgrp_hand_to_lgrp(hand); 1440 1441 /* 1442 * The lgrp *must* be pre-existing 1443 */ 1444 ASSERT(my_lgrp != NULL); 1445 1446 /* 1447 * Delete memory node from lgroups which contain it 1448 */ 1449 mnodes_mask = ((mnodeset_t)1 << mnode); 1450 for (i = 0; i <= lgrp_alloc_max; i++) { 1451 lgrp_t *lgrp = lgrp_table[i]; 1452 /* 1453 * Skip any non-existent lgroups and any lgroups that don't 1454 * contain leaf lgroup of memory as a memory resource 1455 */ 1456 if (!LGRP_EXISTS(lgrp) || 1457 !(lgrp->lgrp_mnodes & mnodes_mask)) 1458 continue; 1459 1460 /* 1461 * Avoid removing the last mnode from the root in the DR 1462 * copy-rename case. See lgrp_mem_rename() for details. 1463 */ 1464 if (is_copy_rename && 1465 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1466 continue; 1467 1468 /* 1469 * Remove memory node from lgroup. 1470 */ 1471 lgrp->lgrp_mnodes &= ~mnodes_mask; 1472 lgrp->lgrp_nmnodes--; 1473 ASSERT(lgrp->lgrp_nmnodes >= 0); 1474 } 1475 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1476 1477 /* 1478 * Don't need to update lgroup topology if this lgroup still has memory. 1479 * 1480 * In the special case of DR copy-rename with the only mnode being 1481 * removed, the lgrp_mnodes for the root is always non-zero, but we 1482 * still need to update the lgroup topology. 1483 */ 1484 if ((my_lgrp->lgrp_nmnodes > 0) && 1485 !(is_copy_rename && 1486 (my_lgrp == lgrp_root) && 1487 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1488 if (drop_lock) 1489 mutex_exit(&cpu_lock); 1490 return; 1491 } 1492 1493 /* 1494 * This lgroup does not contain any memory now 1495 */ 1496 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1497 1498 /* 1499 * Remove this lgroup from lgroup topology if it does not contain any 1500 * resources now 1501 */ 1502 lgrpid = my_lgrp->lgrp_id; 1503 count = 0; 1504 klgrpset_clear(changed); 1505 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1506 /* 1507 * Delete lgroup when no more resources 1508 */ 1509 if (need_synch) 1510 pause_cpus(NULL); 1511 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1512 lgrp_alloc_max + 1, &changed); 1513 ASSERT(count > 0); 1514 if (need_synch) 1515 start_cpus(); 1516 } else { 1517 /* 1518 * Remove lgroup from memory resources of any lgroups that 1519 * contain it as such 1520 */ 1521 for (i = 0; i <= lgrp_alloc_max; i++) { 1522 lgrp_t *lgrp; 1523 1524 lgrp = lgrp_table[i]; 1525 if (!LGRP_EXISTS(lgrp) || 1526 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1527 lgrpid)) 1528 continue; 1529 1530 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1531 } 1532 } 1533 if (drop_lock) 1534 mutex_exit(&cpu_lock); 1535 } 1536 1537 /* 1538 * Return lgroup with given platform handle 1539 */ 1540 lgrp_t * 1541 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1542 { 1543 int i; 1544 lgrp_t *lgrp; 1545 1546 if (hand == LGRP_NULL_HANDLE) 1547 return (NULL); 1548 1549 for (i = 0; i <= lgrp_alloc_max; i++) { 1550 lgrp = lgrp_table[i]; 1551 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1552 return (lgrp); 1553 } 1554 return (NULL); 1555 } 1556 1557 /* 1558 * Return the home lgroup of the current thread. 1559 * We must do this with kernel preemption disabled, since we don't want our 1560 * thread to be re-homed while we're poking around with its lpl, and the lpl 1561 * should never be NULL. 1562 * 1563 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1564 * is enabled because of DR. 
Callers can use disable kernel preemption 1565 * around this call to guarantee that the lgroup will be valid beyond this 1566 * routine, since kernel preemption can be recursive. 1567 */ 1568 lgrp_t * 1569 lgrp_home_lgrp(void) 1570 { 1571 lgrp_t *lgrp; 1572 lpl_t *lpl; 1573 1574 kpreempt_disable(); 1575 1576 lpl = curthread->t_lpl; 1577 ASSERT(lpl != NULL); 1578 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1579 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1580 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1581 1582 kpreempt_enable(); 1583 1584 return (lgrp); 1585 } 1586 1587 /* 1588 * Return ID of home lgroup for given thread 1589 * (See comments for lgrp_home_lgrp() for special care and handling 1590 * instructions) 1591 */ 1592 lgrp_id_t 1593 lgrp_home_id(kthread_t *t) 1594 { 1595 lgrp_id_t lgrp; 1596 lpl_t *lpl; 1597 1598 ASSERT(t != NULL); 1599 /* 1600 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1601 * cannot since the HAT layer can call into this routine to 1602 * determine the locality for its data structures in the context 1603 * of a page fault. 1604 */ 1605 1606 kpreempt_disable(); 1607 1608 lpl = t->t_lpl; 1609 ASSERT(lpl != NULL); 1610 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1611 lgrp = lpl->lpl_lgrpid; 1612 1613 kpreempt_enable(); 1614 1615 return (lgrp); 1616 } 1617 1618 /* 1619 * Return lgroup containing the physical memory for the given page frame number 1620 */ 1621 lgrp_t * 1622 lgrp_pfn_to_lgrp(pfn_t pfn) 1623 { 1624 lgrp_handle_t hand; 1625 int i; 1626 lgrp_t *lgrp; 1627 1628 hand = lgrp_plat_pfn_to_hand(pfn); 1629 if (hand != LGRP_NULL_HANDLE) 1630 for (i = 0; i <= lgrp_alloc_max; i++) { 1631 lgrp = lgrp_table[i]; 1632 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1633 return (lgrp); 1634 } 1635 return (NULL); 1636 } 1637 1638 /* 1639 * Return lgroup containing the physical memory for the given page frame number 1640 */ 1641 lgrp_t * 1642 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1643 { 1644 lgrp_handle_t hand; 1645 int i; 1646 lgrp_t *lgrp; 1647 pfn_t pfn; 1648 1649 pfn = btop(physaddr); 1650 hand = lgrp_plat_pfn_to_hand(pfn); 1651 if (hand != LGRP_NULL_HANDLE) 1652 for (i = 0; i <= lgrp_alloc_max; i++) { 1653 lgrp = lgrp_table[i]; 1654 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1655 return (lgrp); 1656 } 1657 return (NULL); 1658 } 1659 1660 /* 1661 * Return the leaf lgroup containing the given CPU 1662 * 1663 * The caller needs to take precautions necessary to prevent 1664 * "cpu" from going away across a call to this function. 1665 * hint: kpreempt_disable()/kpreempt_enable() 1666 */ 1667 static lgrp_t * 1668 lgrp_cpu_to_lgrp(cpu_t *cpu) 1669 { 1670 return (cpu->cpu_lpl->lpl_lgrp); 1671 } 1672 1673 /* 1674 * Return the sum of the partition loads in an lgrp divided by 1675 * the number of CPUs in the lgrp. This is our best approximation 1676 * of an 'lgroup load average' for a useful per-lgroup kstat. 
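 *
 * That is, with ncpu being the lgrp's lgrp_cpucnt:
 *
 *	load average = (sum of lpl_loadavg over the lgrp's CPUs) / ncpu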
1677 */ 1678 static uint64_t 1679 lgrp_sum_loadavgs(lgrp_t *lgrp) 1680 { 1681 cpu_t *cpu; 1682 int ncpu; 1683 uint64_t loads = 0; 1684 1685 mutex_enter(&cpu_lock); 1686 1687 cpu = lgrp->lgrp_cpu; 1688 ncpu = lgrp->lgrp_cpucnt; 1689 1690 if (cpu == NULL || ncpu == 0) { 1691 mutex_exit(&cpu_lock); 1692 return (0ull); 1693 } 1694 1695 do { 1696 loads += cpu->cpu_lpl->lpl_loadavg; 1697 cpu = cpu->cpu_next_lgrp; 1698 } while (cpu != lgrp->lgrp_cpu); 1699 1700 mutex_exit(&cpu_lock); 1701 1702 return (loads / ncpu); 1703 } 1704 1705 void 1706 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1707 { 1708 struct lgrp_stats *pstats; 1709 1710 /* 1711 * Verify that the caller isn't trying to add to 1712 * a statistic for an lgroup that has gone away 1713 */ 1714 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1715 return; 1716 1717 pstats = &lgrp_stats[lgrpid]; 1718 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1719 } 1720 1721 int64_t 1722 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1723 { 1724 uint64_t val; 1725 struct lgrp_stats *pstats; 1726 1727 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1728 return ((int64_t)0); 1729 1730 pstats = &lgrp_stats[lgrpid]; 1731 LGRP_STAT_READ(pstats, stat, val); 1732 return (val); 1733 } 1734 1735 /* 1736 * Reset all kstats for lgrp specified by its lgrpid. 1737 */ 1738 static void 1739 lgrp_kstat_reset(lgrp_id_t lgrpid) 1740 { 1741 lgrp_stat_t stat; 1742 1743 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1744 return; 1745 1746 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1747 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1748 } 1749 } 1750 1751 /* 1752 * Collect all per-lgrp statistics for the lgrp associated with this 1753 * kstat, and store them in the ks_data array. 1754 * 1755 * The superuser can reset all the running counter statistics for an 1756 * lgrp by writing to any of the lgrp's stats. 1757 */ 1758 static int 1759 lgrp_kstat_extract(kstat_t *ksp, int rw) 1760 { 1761 lgrp_stat_t stat; 1762 struct kstat_named *ksd; 1763 lgrp_t *lgrp; 1764 lgrp_id_t lgrpid; 1765 1766 lgrp = (lgrp_t *)ksp->ks_private; 1767 1768 ksd = (struct kstat_named *)ksp->ks_data; 1769 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1770 1771 lgrpid = lgrp->lgrp_id; 1772 1773 if (lgrpid == LGRP_NONE) { 1774 /* 1775 * Return all zeroes as stats for freed lgrp. 
1776 */ 1777 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1778 ksd[stat].value.i64 = 0; 1779 } 1780 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1781 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1782 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1783 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1784 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1785 } else if (rw != KSTAT_WRITE) { 1786 /* 1787 * Handle counter stats 1788 */ 1789 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1790 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1791 } 1792 1793 /* 1794 * Handle kernel data snapshot stats 1795 */ 1796 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1797 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1798 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1799 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1800 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1801 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1802 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1803 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1804 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1805 lgrp_loadavg_max_effect; 1806 } else { 1807 lgrp_kstat_reset(lgrpid); 1808 } 1809 1810 return (0); 1811 } 1812 1813 int 1814 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1815 { 1816 cpu_t *cp; 1817 1818 mutex_enter(&cpu_lock); 1819 1820 if ((cp = cpu_get(id)) == NULL) { 1821 mutex_exit(&cpu_lock); 1822 return (EINVAL); 1823 } 1824 1825 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1826 mutex_exit(&cpu_lock); 1827 return (EINVAL); 1828 } 1829 1830 ASSERT(cp->cpu_lpl != NULL); 1831 1832 *lp = cp->cpu_lpl->lpl_lgrpid; 1833 1834 mutex_exit(&cpu_lock); 1835 1836 return (0); 1837 } 1838 1839 int 1840 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1841 { 1842 cpu_t *cp; 1843 1844 mutex_enter(&cpu_lock); 1845 1846 if ((cp = cpu_get(id)) == NULL) { 1847 mutex_exit(&cpu_lock); 1848 return (EINVAL); 1849 } 1850 1851 ASSERT(cp->cpu_lpl != NULL); 1852 1853 *lp = cp->cpu_lpl->lpl_loadavg; 1854 1855 mutex_exit(&cpu_lock); 1856 1857 return (0); 1858 } 1859 1860 /* 1861 * Add a resource named by lpl_leaf to rset of lpl_target 1862 * 1863 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1864 * resource. It is adjusted here, as this is presently the only place that we 1865 * can be certain a resource addition has succeeded. 1866 * 1867 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1868 * list in order until it reaches a NULL. (This list is required to be NULL 1869 * terminated, too). This is done so that we can mark start pos + 1, so that 1870 * each lpl is traversed sequentially, but in a different order. We hope this 1871 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 1872 */ 1873 1874 void 1875 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1876 { 1877 int i; 1878 int entry_slot = 0; 1879 1880 /* return if leaf is already present */ 1881 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1882 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1883 return; 1884 } 1885 1886 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1887 lpl_leaf->lpl_lgrpid) { 1888 break; 1889 } 1890 } 1891 1892 /* insert leaf, update counts */ 1893 entry_slot = i; 1894 i = lpl_target->lpl_nrset++; 1895 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1896 panic("More leaf lgrps in system than are supported!\n"); 1897 } 1898 1899 /* 1900 * Start at the end of the rset array and work backwards towards the 1901 * slot into which the new lpl will be inserted. 
This effectively 1902 * preserves the current ordering by scooting everybody over one entry, 1903 * and placing the new entry into the space created. 1904 */ 1905 1906 while (i-- > entry_slot) { 1907 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1908 } 1909 1910 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1911 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1912 } 1913 1914 /* 1915 * Update each of lpl_parent's children with a proper hint and 1916 * a reference to their parent. 1917 * The lgrp topology is used as the reference since it is fully 1918 * consistent and correct at this point. 1919 * 1920 * Each child's hint will reference an element in lpl_parent's 1921 * rset that designates where the child should start searching 1922 * for CPU resources. The hint selected is the highest order leaf present 1923 * in the child's lineage. 1924 * 1925 * This should be called after any potential change in lpl_parent's 1926 * rset. 1927 */ 1928 static void 1929 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1930 { 1931 klgrpset_t children, leaves; 1932 lpl_t *lpl; 1933 int hint; 1934 int i, j; 1935 1936 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1937 if (klgrpset_isempty(children)) 1938 return; /* nothing to do */ 1939 1940 for (i = 0; i <= lgrp_alloc_max; i++) { 1941 if (klgrpset_ismember(children, i)) { 1942 1943 /* 1944 * Given the set of leaves in this child's lineage, 1945 * find the highest order leaf present in the parent's 1946 * rset. Select this as the hint for the child. 1947 */ 1948 leaves = lgrp_table[i]->lgrp_leaves; 1949 hint = 0; 1950 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1951 lpl = lpl_parent->lpl_rset[j]; 1952 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1953 hint = j; 1954 } 1955 cp->cp_lgrploads[i].lpl_hint = hint; 1956 1957 /* 1958 * (Re)set the parent. It may be incorrect if 1959 * lpl_parent is new in the topology. 1960 */ 1961 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1962 } 1963 } 1964 } 1965 1966 /* 1967 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1968 * 1969 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1970 * resource. The values are adjusted here, as this is the only place that we can 1971 * be certain a resource was successfully deleted. 1972 */ 1973 void 1974 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1975 { 1976 int i; 1977 1978 /* find leaf in intermediate node */ 1979 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1980 if (lpl_target->lpl_rset[i] == lpl_leaf) 1981 break; 1982 } 1983 1984 /* return if leaf not found */ 1985 if (lpl_target->lpl_rset[i] != lpl_leaf) 1986 return; 1987 1988 /* prune leaf, compress array */ 1989 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1990 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1991 lpl_target->lpl_ncpu--; 1992 do { 1993 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1994 } while (i++ < lpl_target->lpl_nrset); 1995 } 1996 1997 /* 1998 * Check to see if the resource set of the target lpl contains the 1999 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 2000 */ 2001 2002 int 2003 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 2004 { 2005 int i; 2006 2007 for (i = 0; i < lpl_target->lpl_nrset; i++) { 2008 if (lpl_target->lpl_rset[i] == lpl_leaf) 2009 return (1); 2010 } 2011 2012 return (0); 2013 } 2014 2015 /* 2016 * Called when we change cpu lpl membership. This increments or decrements the 2017 * per-cpu counter in every lpl in which our leaf appears. 
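 *
 * A caller that moves a CPU from one leaf lpl to another would use it
 * roughly like this (illustrative sketch; new_lpl is hypothetical):
 *
 *	lpl_cpu_adjcnt(LPL_DECREMENT, cp);
 *	cp->cpu_lpl = new_lpl;
 *	lpl_cpu_adjcnt(LPL_INCREMENT, cp);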
2018 */ 2019 void 2020 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 2021 { 2022 cpupart_t *cpupart; 2023 lgrp_t *lgrp_leaf; 2024 lgrp_t *lgrp_cur; 2025 lpl_t *lpl_leaf; 2026 lpl_t *lpl_cur; 2027 int i; 2028 2029 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 2030 2031 cpupart = cp->cpu_part; 2032 lpl_leaf = cp->cpu_lpl; 2033 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 2034 2035 for (i = 0; i <= lgrp_alloc_max; i++) { 2036 lgrp_cur = lgrp_table[i]; 2037 2038 /* 2039 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 2040 * for the cpu in question, or if the current lgrp and leaf 2041 * don't share the same resources. 2042 */ 2043 2044 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 2045 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 2046 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 2047 continue; 2048 2049 2050 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2051 2052 if (lpl_cur->lpl_nrset > 0) { 2053 if (act == LPL_INCREMENT) { 2054 lpl_cur->lpl_ncpu++; 2055 } else if (act == LPL_DECREMENT) { 2056 lpl_cur->lpl_ncpu--; 2057 } 2058 } 2059 } 2060 } 2061 2062 /* 2063 * Initialize lpl with given resources and specified lgrp 2064 */ 2065 2066 void 2067 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2068 { 2069 lpl->lpl_lgrpid = lgrp->lgrp_id; 2070 lpl->lpl_loadavg = 0; 2071 if (lpl == lpl_leaf) 2072 lpl->lpl_ncpu = 1; 2073 else 2074 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2075 lpl->lpl_nrset = 1; 2076 lpl->lpl_rset[0] = lpl_leaf; 2077 lpl->lpl_lgrp = lgrp; 2078 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2079 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2080 } 2081 2082 /* 2083 * Clear an unused lpl 2084 */ 2085 2086 void 2087 lpl_clear(lpl_t *lpl) 2088 { 2089 lgrp_id_t lid; 2090 2091 /* save lid for debugging purposes */ 2092 lid = lpl->lpl_lgrpid; 2093 bzero(lpl, sizeof (lpl_t)); 2094 lpl->lpl_lgrpid = lid; 2095 } 2096 2097 /* 2098 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2099 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2100 * make full use of all of the lgroup topology, but this checks to make sure 2101 * that for the parts that it does use, it has correctly understood the 2102 * relationships that exist. This function returns 2103 * 0 if the topology is correct, and a non-zero error code, for non-debug 2104 * kernels if incorrect. Asserts are spread throughout the code to aid in 2105 * debugging on a DEBUG kernel. 
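 *
 * A hypothetical caller might sanity check a partition like this (sketch
 * only, not code from this file):
 *
 *        if (lpl_topo_verify(cpupart) != LPL_TOPO_CORRECT)
 *                cmn_err(CE_WARN, "lpl topology out of sync with lgrp topology");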
2106 */ 2107 int 2108 lpl_topo_verify(cpupart_t *cpupart) 2109 { 2110 lgrp_t *lgrp; 2111 lpl_t *lpl; 2112 klgrpset_t rset; 2113 klgrpset_t cset; 2114 cpu_t *cpu; 2115 cpu_t *cp_start; 2116 int i; 2117 int j; 2118 int sum; 2119 2120 /* topology can't be incorrect if it doesn't exist */ 2121 if (!lgrp_topo_initialized || !lgrp_initialized) 2122 return (LPL_TOPO_CORRECT); 2123 2124 ASSERT(cpupart != NULL); 2125 2126 for (i = 0; i <= lgrp_alloc_max; i++) { 2127 lgrp = lgrp_table[i]; 2128 lpl = NULL; 2129 /* make sure lpls are allocated */ 2130 ASSERT(cpupart->cp_lgrploads); 2131 if (!cpupart->cp_lgrploads) 2132 return (LPL_TOPO_PART_HAS_NO_LPL); 2133 2134 lpl = &cpupart->cp_lgrploads[i]; 2135 /* make sure our index is good */ 2136 ASSERT(i < cpupart->cp_nlgrploads); 2137 2138 /* if lgroup doesn't exist, make sure lpl is empty */ 2139 if (!LGRP_EXISTS(lgrp)) { 2140 ASSERT(lpl->lpl_ncpu == 0); 2141 if (lpl->lpl_ncpu > 0) { 2142 return (LPL_TOPO_CPUS_NOT_EMPTY); 2143 } else { 2144 continue; 2145 } 2146 } 2147 2148 /* verify that lgroup and lpl are identically numbered */ 2149 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2150 2151 /* if lgroup isn't in our partition, make sure lpl is empty */ 2152 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2153 cpupart->cp_lgrpset)) { 2154 ASSERT(lpl->lpl_ncpu == 0); 2155 if (lpl->lpl_ncpu > 0) { 2156 return (LPL_TOPO_CPUS_NOT_EMPTY); 2157 } 2158 /* 2159 * lpl is empty, and lgroup isn't in partition. verify 2160 * that lpl doesn't show up in anyone else's rsets (in 2161 * this partition, anyway) 2162 */ 2163 2164 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2165 lpl_t *i_lpl; /* lpl we're iterating over */ 2166 2167 i_lpl = &cpupart->cp_lgrploads[j]; 2168 2169 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2170 if (lpl_rset_contains(i_lpl, lpl)) { 2171 return (LPL_TOPO_LPL_ORPHANED); 2172 } 2173 } 2174 /* lgroup is empty, and everything is ok. continue */ 2175 continue; 2176 } 2177 2178 2179 /* lgroup is in this partition, now check it against lpl */ 2180 2181 /* do both have matching lgrps? */ 2182 ASSERT(lgrp == lpl->lpl_lgrp); 2183 if (lgrp != lpl->lpl_lgrp) { 2184 return (LPL_TOPO_LGRP_MISMATCH); 2185 } 2186 2187 /* do the parent lgroups exist and do they match? */ 2188 if (lgrp->lgrp_parent) { 2189 ASSERT(lpl->lpl_parent); 2190 ASSERT(lgrp->lgrp_parent->lgrp_id == 2191 lpl->lpl_parent->lpl_lgrpid); 2192 2193 if (!lpl->lpl_parent) { 2194 return (LPL_TOPO_MISSING_PARENT); 2195 } else if (lgrp->lgrp_parent->lgrp_id != 2196 lpl->lpl_parent->lpl_lgrpid) { 2197 return (LPL_TOPO_PARENT_MISMATCH); 2198 } 2199 } 2200 2201 /* only leaf lgroups keep a cpucnt, only check leaves */ 2202 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2203 2204 /* verify that lgrp is also a leaf */ 2205 ASSERT((lgrp->lgrp_childcnt == 0) && 2206 (klgrpset_ismember(lgrp->lgrp_leaves, 2207 lpl->lpl_lgrpid))); 2208 2209 if ((lgrp->lgrp_childcnt > 0) || 2210 (!klgrpset_ismember(lgrp->lgrp_leaves, 2211 lpl->lpl_lgrpid))) { 2212 return (LPL_TOPO_LGRP_NOT_LEAF); 2213 } 2214 2215 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2216 (lpl->lpl_ncpu > 0)); 2217 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2218 (lpl->lpl_ncpu <= 0)) { 2219 return (LPL_TOPO_BAD_CPUCNT); 2220 } 2221 2222 /* 2223 * Check that lpl_ncpu also matches the number of 2224 * cpus in the lpl's linked list. This only exists in 2225 * leaves, but they should always match. 
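 * (lpl_cpus is a circular list linked through cpu_next_lpl, so the walk
 * below counts cpus until it wraps back around to the one it started at.)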
2226 */ 2227 j = 0; 2228 cpu = cp_start = lpl->lpl_cpus; 2229 while (cpu != NULL) { 2230 j++; 2231 2232 /* check to make sure cpu's lpl is leaf lpl */ 2233 ASSERT(cpu->cpu_lpl == lpl); 2234 if (cpu->cpu_lpl != lpl) { 2235 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2236 } 2237 2238 /* check next cpu */ 2239 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2240 continue; 2241 } else { 2242 cpu = NULL; 2243 } 2244 } 2245 2246 ASSERT(j == lpl->lpl_ncpu); 2247 if (j != lpl->lpl_ncpu) { 2248 return (LPL_TOPO_LPL_BAD_NCPU); 2249 } 2250 2251 /* 2252 * Also, check that leaf lpl is contained in all 2253 * intermediate lpls that name the leaf as a descendant 2254 */ 2255 2256 for (j = 0; j <= lgrp_alloc_max; j++) { 2257 klgrpset_t intersect; 2258 lgrp_t *lgrp_cand; 2259 lpl_t *lpl_cand; 2260 2261 lgrp_cand = lgrp_table[j]; 2262 intersect = klgrpset_intersects( 2263 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2264 cpupart->cp_lgrpset); 2265 2266 if (!LGRP_EXISTS(lgrp_cand) || 2267 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2268 cpupart->cp_lgrpset) || 2269 (intersect == 0)) 2270 continue; 2271 2272 lpl_cand = 2273 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2274 2275 if (klgrpset_ismember(intersect, 2276 lgrp->lgrp_id)) { 2277 ASSERT(lpl_rset_contains(lpl_cand, 2278 lpl)); 2279 2280 if (!lpl_rset_contains(lpl_cand, lpl)) { 2281 return (LPL_TOPO_RSET_MSSNG_LF); 2282 } 2283 } 2284 } 2285 2286 } else { /* non-leaf specific checks */ 2287 2288 /* 2289 * Non-leaf lpls should have lpl_cpus == NULL 2290 * verify that this is so 2291 */ 2292 ASSERT(lpl->lpl_cpus == NULL); 2293 if (lpl->lpl_cpus != NULL) { 2294 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2295 } 2296 2297 /* 2298 * verify that the sum of the cpus in the leaf resources 2299 * is equal to the total ncpu in the intermediate 2300 */ 2301 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2302 sum += lpl->lpl_rset[j]->lpl_ncpu; 2303 } 2304 2305 ASSERT(sum == lpl->lpl_ncpu); 2306 if (sum != lpl->lpl_ncpu) { 2307 return (LPL_TOPO_LPL_BAD_NCPU); 2308 } 2309 } 2310 2311 /* 2312 * check on lpl_hint. Don't check root, since it has no parent. 2313 */ 2314 if (lpl->lpl_parent != NULL) { 2315 int hint; 2316 lpl_t *hint_lpl; 2317 2318 /* make sure hint is within limits of nrset */ 2319 hint = lpl->lpl_hint; 2320 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2321 if (lpl->lpl_parent->lpl_nrset < hint) { 2322 return (LPL_TOPO_BOGUS_HINT); 2323 } 2324 2325 /* make sure hint points to valid lpl */ 2326 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2327 ASSERT(hint_lpl->lpl_ncpu > 0); 2328 if (hint_lpl->lpl_ncpu <= 0) { 2329 return (LPL_TOPO_BOGUS_HINT); 2330 } 2331 } 2332 2333 /* 2334 * Check the rset of the lpl in question. Make sure that each 2335 * rset contains a subset of the resources in 2336 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2337 * sure that each rset doesn't include resources that are 2338 * outside of that set. (Which would be resources somehow not 2339 * accounted for). 
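 * (The approach below: collect the lpl's rset members into a bitmask, then
 * klgrpset_diff() away everything covered by the lgroup's CPU resource set
 * and by the partition's lgroup set; any bits left over are resources that
 * aren't accounted for.)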
2340 */ 2341 2342 klgrpset_clear(rset); 2343 for (j = 0; j < lpl->lpl_nrset; j++) { 2344 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2345 } 2346 klgrpset_copy(cset, rset); 2347 /* make sure lpl rset matches lgrp rset */ 2348 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2349 /* make sure rset is contained with in partition, too */ 2350 klgrpset_diff(cset, cpupart->cp_lgrpset); 2351 2352 ASSERT(klgrpset_isempty(rset) && 2353 klgrpset_isempty(cset)); 2354 if (!klgrpset_isempty(rset) || 2355 !klgrpset_isempty(cset)) { 2356 return (LPL_TOPO_RSET_MISMATCH); 2357 } 2358 2359 /* 2360 * check to make sure lpl_nrset matches the number of rsets 2361 * contained in the lpl 2362 */ 2363 2364 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2365 j++); 2366 2367 ASSERT(j == lpl->lpl_nrset); 2368 if (j != lpl->lpl_nrset) { 2369 return (LPL_TOPO_BAD_RSETCNT); 2370 } 2371 2372 } 2373 return (LPL_TOPO_CORRECT); 2374 } 2375 2376 /* 2377 * Flatten lpl topology to given number of levels. This is presently only 2378 * implemented for a flatten to 2 levels, which will prune out the intermediates 2379 * and home the leaf lpls to the root lpl. 2380 */ 2381 int 2382 lpl_topo_flatten(int levels) 2383 { 2384 int i; 2385 uint_t sum; 2386 lgrp_t *lgrp_cur; 2387 lpl_t *lpl_cur; 2388 lpl_t *lpl_root; 2389 cpupart_t *cp; 2390 2391 if (levels != 2) 2392 return (0); 2393 2394 /* called w/ cpus paused - grab no locks! */ 2395 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2396 !lgrp_initialized); 2397 2398 cp = cp_list_head; 2399 do { 2400 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2401 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2402 2403 for (i = 0; i <= lgrp_alloc_max; i++) { 2404 lgrp_cur = lgrp_table[i]; 2405 lpl_cur = &cp->cp_lgrploads[i]; 2406 2407 if ((lgrp_cur == lgrp_root) || 2408 (!LGRP_EXISTS(lgrp_cur) && 2409 (lpl_cur->lpl_ncpu == 0))) 2410 continue; 2411 2412 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2413 /* 2414 * this should be a deleted intermediate, so 2415 * clear it 2416 */ 2417 lpl_clear(lpl_cur); 2418 } else if ((lpl_cur->lpl_nrset == 1) && 2419 (lpl_cur->lpl_rset[0] == lpl_cur) && 2420 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2421 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2422 /* 2423 * this is a leaf whose parent was deleted, or 2424 * whose parent had their lgrp deleted. (And 2425 * whose parent will soon be deleted). Point 2426 * this guy back to the root lpl. 2427 */ 2428 lpl_cur->lpl_parent = lpl_root; 2429 lpl_rset_add(lpl_root, lpl_cur); 2430 } 2431 2432 } 2433 2434 /* 2435 * Now that we're done, make sure the count on the root lpl is 2436 * correct, and update the hints of the children for the sake of 2437 * thoroughness 2438 */ 2439 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2440 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2441 } 2442 lpl_root->lpl_ncpu = sum; 2443 lpl_child_update(lpl_root, cp); 2444 2445 cp = cp->cp_next; 2446 } while (cp != cp_list_head); 2447 2448 return (levels); 2449 } 2450 2451 /* 2452 * Insert a lpl into the resource hierarchy and create any additional lpls that 2453 * are necessary to represent the varying states of locality for the cpu 2454 * resoruces newly added to the partition. 2455 * 2456 * This routine is clever enough that it can correctly add resources from the 2457 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2458 * those for which the lpl is a leaf as opposed to simply a named equally local 2459 * resource). 
The one special case that needs additional processing is when a 2460 * new intermediate lpl is introduced. Since the main loop only traverses 2461 * looking to add the leaf resource where it does not yet exist, additional work 2462 * is necessary to add other leaf resources that may need to exist in the newly 2463 * created intermediate. This is performed by the second inner loop, and is 2464 * only done when the check for more than one overlapping resource succeeds. 2465 */ 2466 2467 void 2468 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2469 { 2470 int i; 2471 int j; 2472 int hint; 2473 int rset_num_intersect; 2474 lgrp_t *lgrp_cur; 2475 lpl_t *lpl_cur; 2476 lpl_t *lpl_parent; 2477 lgrp_id_t parent_id; 2478 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2479 2480 for (i = 0; i <= lgrp_alloc_max; i++) { 2481 lgrp_cur = lgrp_table[i]; 2482 2483 /* 2484 * Don't insert if the lgrp isn't there, if the leaf isn't 2485 * contained within the current lgrp, or if the current lgrp has 2486 * no leaves in this partition 2487 */ 2488 2489 if (!LGRP_EXISTS(lgrp_cur) || 2490 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2491 lpl_leaf->lpl_lgrpid) || 2492 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2493 cpupart->cp_lgrpset)) 2494 continue; 2495 2496 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2497 if (lgrp_cur->lgrp_parent != NULL) { 2498 /* if lgrp has a parent, assign it properly */ 2499 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2500 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2501 } else { 2502 /* if not, make sure parent ptr gets set to null */ 2503 lpl_parent = NULL; 2504 } 2505 2506 if (lpl_cur == lpl_leaf) { 2507 /* 2508 * Almost all leaf state was initialized elsewhere. The 2509 * only thing left to do is to set the parent. 2510 */ 2511 lpl_cur->lpl_parent = lpl_parent; 2512 continue; 2513 } 2514 2515 /* 2516 * Initialize intermediate lpl 2517 * Save this lpl's hint though. Since we're changing this 2518 * lpl's resources, we need to update the hint in this lpl's 2519 * children, but the hint in this lpl is unaffected and 2520 * should be preserved. 2521 */ 2522 hint = lpl_cur->lpl_hint; 2523 2524 lpl_clear(lpl_cur); 2525 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2526 2527 lpl_cur->lpl_hint = hint; 2528 lpl_cur->lpl_parent = lpl_parent; 2529 2530 /* does new lpl need to be populated with other resources? */ 2531 rset_intersect = 2532 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2533 cpupart->cp_lgrpset); 2534 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2535 2536 if (rset_num_intersect > 1) { 2537 /* 2538 * If so, figure out what lpls have resources that 2539 * intersect this one, and add them. 2540 */ 2541 for (j = 0; j <= lgrp_alloc_max; j++) { 2542 lgrp_t *lgrp_cand; /* candidate lgrp */ 2543 lpl_t *lpl_cand; /* candidate lpl */ 2544 2545 lgrp_cand = lgrp_table[j]; 2546 if (!LGRP_EXISTS(lgrp_cand) || 2547 !klgrpset_ismember(rset_intersect, 2548 lgrp_cand->lgrp_id)) 2549 continue; 2550 lpl_cand = 2551 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2552 lpl_rset_add(lpl_cur, lpl_cand); 2553 } 2554 } 2555 /* 2556 * This lpl's rset has changed. Update the hint in it's 2557 * children. 2558 */ 2559 lpl_child_update(lpl_cur, cpupart); 2560 } 2561 } 2562 2563 /* 2564 * remove a lpl from the hierarchy of resources, clearing its state when 2565 * finished. If the lpls at the intermediate levels of the hierarchy have no 2566 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2567 * delete them as well. 
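 *
 * For example (sketch; lgrp_part_del_cpu() below is the real caller), once
 * the last cpu leaves a leaf the caller does roughly:
 *
 *        klgrpset_del(cpupart->cp_lgrpset, lpl_leaf->lpl_lgrpid);
 *        lpl_leaf_remove(lpl_leaf, cpupart);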
2568 */ 2569 2570 void 2571 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2572 { 2573 int i; 2574 lgrp_t *lgrp_cur; 2575 lpl_t *lpl_cur; 2576 klgrpset_t leaf_intersect; /* intersection of leaves */ 2577 2578 for (i = 0; i <= lgrp_alloc_max; i++) { 2579 lgrp_cur = lgrp_table[i]; 2580 2581 /* 2582 * Don't attempt to remove from lgrps that aren't there, that 2583 * don't contain our leaf, or from the leaf itself. (We do that 2584 * later) 2585 */ 2586 2587 if (!LGRP_EXISTS(lgrp_cur)) 2588 continue; 2589 2590 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2591 2592 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2593 lpl_leaf->lpl_lgrpid) || 2594 (lpl_cur == lpl_leaf)) { 2595 continue; 2596 } 2597 2598 /* 2599 * This is a slightly sleazy simplification in that we have 2600 * already marked the cp_lgrpset as no longer containing the 2601 * leaf we've deleted. Any lpls that pass the above checks 2602 * based upon lgrp membership but not necessarily cpu-part 2603 * membership also get cleared by the checks below. Currently 2604 * this is harmless, as the lpls should be empty anyway. 2605 * 2606 * In particular, we want to preserve lpls that have additional 2607 * leaf resources, even though we don't yet have a processor 2608 * architecture that represents resources this way. 2609 */ 2610 2611 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2612 cpupart->cp_lgrpset); 2613 2614 lpl_rset_del(lpl_cur, lpl_leaf); 2615 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2616 lpl_clear(lpl_cur); 2617 } else { 2618 /* 2619 * Update this lpl's children 2620 */ 2621 lpl_child_update(lpl_cur, cpupart); 2622 } 2623 } 2624 lpl_clear(lpl_leaf); 2625 } 2626 2627 /* 2628 * add a cpu to a partition in terms of lgrp load avg bookkeeping 2629 * 2630 * The lpl (cpu partition load average information) is now arranged in a 2631 * hierarchical fashion whereby resources that are closest, i.e. most local, to 2632 * the cpu in question are considered to be leaves in a tree of resources. 2633 * There are two general cases for cpu addition: 2634 * 2635 * 1. A lpl structure that contains resources already in the hierarchy tree. 2636 * In this case, all of the associated lpl relationships have been defined, and 2637 * all that is necessary is that we link the new cpu into the per-lpl list of 2638 * cpus, and increment the ncpu count of all places where this cpu resource will 2639 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2640 * pushing is accomplished by this routine. 2641 * 2642 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2643 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2644 * construct the hierarchy of state necessary to name its more distant 2645 * resources, if they should exist. The leaf structure is initialized by this 2646 * routine, as is the cpu-partition state for the lgrp membership. This routine 2647 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2648 * and builds all of the "ancestral" state necessary to identify resources at 2649 * differing levels of locality.
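 *
 * In outline (sketch of the logic below), case 2 corresponds to
 *
 *        if (lpl_leaf->lpl_ncpu++ == 0) {
 *                lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
 *                klgrpset_add(cpupart->cp_lgrpset, lgrpid);
 *                lpl_leaf_insert(lpl_leaf, cpupart);
 *        }
 *
 * while case 1 is just the else branch, lpl_cpu_adjcnt(LPL_INCREMENT, cp),
 * plus linking the cpu onto lpl_leaf->lpl_cpus.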
2650 */ 2651 void 2652 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2653 { 2654 cpupart_t *cpupart; 2655 lgrp_t *lgrp_leaf; 2656 lpl_t *lpl_leaf; 2657 2658 /* called sometimes w/ cpus paused - grab no locks */ 2659 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2660 2661 cpupart = cp->cpu_part; 2662 lgrp_leaf = lgrp_table[lgrpid]; 2663 2664 /* don't add non-existent lgrp */ 2665 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2666 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2667 cp->cpu_lpl = lpl_leaf; 2668 2669 /* only leaf lpls contain cpus */ 2670 2671 if (lpl_leaf->lpl_ncpu++ == 0) { 2672 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2673 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2674 lpl_leaf_insert(lpl_leaf, cpupart); 2675 } else { 2676 /* 2677 * the lpl should already exist in the parent, so just update 2678 * the count of available CPUs 2679 */ 2680 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2681 } 2682 2683 /* link cpu into list of cpus in lpl */ 2684 2685 if (lpl_leaf->lpl_cpus) { 2686 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2687 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2688 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2689 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2690 } else { 2691 /* 2692 * We increment ncpu immediately after we create a new leaf 2693 * lpl, so assert that ncpu == 1 for the case where we don't 2694 * have any cpu pointers yet. 2695 */ 2696 ASSERT(lpl_leaf->lpl_ncpu == 1); 2697 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2698 } 2699 2700 } 2701 2702 2703 /* 2704 * remove a cpu from a partition in terms of lgrp load avg bookkeeping 2705 * 2706 * The lpl (cpu partition load average information) is now arranged in a 2707 * hierarchical fashion whereby resources that are closest, i.e. most local, to 2708 * the cpu in question are considered to be leaves in a tree of resources. 2709 * There are two removal cases in question: 2710 * 2711 * 1. Removal of the resource in the leaf leaves other resources remaining in 2712 * that leaf. (Another cpu still exists at this level of locality). In this 2713 * case, the count of available cpus is decremented in all associated lpls by 2714 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 2715 * from the per-lpl list of cpus. 2716 * 2717 * 2. Removal of the resource results in the lpl containing no resources. (It's 2718 * empty.) In this case, all of what has occurred for the first step must take 2719 * place; however, additionally we must remove the lpl structure itself, prune 2720 * out any stranded lpls that do not directly name a leaf resource, and mark the 2721 * cpu partition in question as no longer containing resources from the lgrp of 2722 * the lpl that has been deleted. Cpu-partition changes are handled by this 2723 * method, but the lpl_leaf_remove function deals with the details of pruning 2724 * out the empty lpl and any of its orphaned direct ancestors.
2725 */ 2726 void 2727 lgrp_part_del_cpu(cpu_t *cp) 2728 { 2729 lpl_t *lpl; 2730 lpl_t *leaf_lpl; 2731 lgrp_t *lgrp_leaf; 2732 2733 /* called sometimes w/ cpus paused - grab no locks */ 2734 2735 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2736 2737 lpl = leaf_lpl = cp->cpu_lpl; 2738 lgrp_leaf = leaf_lpl->lpl_lgrp; 2739 2740 /* don't delete a leaf that isn't there */ 2741 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2742 2743 /* no double-deletes */ 2744 ASSERT(lpl->lpl_ncpu); 2745 if (--lpl->lpl_ncpu == 0) { 2746 /* 2747 * This was the last cpu in this lgroup for this partition, 2748 * clear its bit in the partition's lgroup bitmask 2749 */ 2750 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2751 2752 /* eliminate remaning lpl link pointers in cpu, lpl */ 2753 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2754 2755 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2756 } else { 2757 2758 /* unlink cpu from lists of cpus in lpl */ 2759 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2760 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2761 if (lpl->lpl_cpus == cp) { 2762 lpl->lpl_cpus = cp->cpu_next_lpl; 2763 } 2764 2765 /* 2766 * Update the cpu count in the lpls associated with parent 2767 * lgroups. 2768 */ 2769 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2770 2771 } 2772 /* clear cpu's lpl ptr when we're all done */ 2773 cp->cpu_lpl = NULL; 2774 } 2775 2776 /* 2777 * Recompute load average for the specified partition/lgrp fragment. 2778 * 2779 * We rely on the fact that this routine is called from the clock thread 2780 * at a point before the clock thread can block (i.e. before its first 2781 * lock request). Since the clock thread can not be preempted (since it 2782 * runs at highest priority), we know that cpu partitions can not change 2783 * (since doing so would require either the repartition requester or the 2784 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2785 * without grabbing cpu_lock. 2786 */ 2787 void 2788 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2789 { 2790 uint_t ncpu; 2791 int64_t old, new, f; 2792 2793 /* 2794 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2795 */ 2796 static short expval[] = { 2797 0, 3196, 1618, 1083, 2798 814, 652, 543, 466, 2799 408, 363, 326, 297, 2800 272, 251, 233, 218, 2801 204, 192, 181, 172, 2802 163, 155, 148, 142, 2803 136, 130, 125, 121, 2804 116, 112, 109, 105 2805 }; 2806 2807 /* ASSERT (called from clock level) */ 2808 2809 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2810 ((ncpu = lpl->lpl_ncpu) == 0)) { 2811 return; 2812 } 2813 2814 for (;;) { 2815 2816 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2817 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2818 else 2819 f = expval[ncpu]; 2820 2821 /* 2822 * Modify the load average atomically to avoid losing 2823 * anticipatory load updates (see lgrp_move_thread()). 2824 */ 2825 if (ageflag) { 2826 /* 2827 * We're supposed to both update and age the load. 2828 * This happens 10 times/sec. per cpu. We do a 2829 * little hoop-jumping to avoid integer overflow. 
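 *
 * Algebraically, the fixed-point update below works out to roughly
 *
 *        new = old * (1 - f/2^16) + nrcpus * (f * 2^9)/2^16
 *
 * i.e. an exponential decay of the load toward (nrcpus << 9), with f/2^16
 * approximately equal to 1 - exp(-1/(20 * ncpu)) from expval[] above.
 * (This is an illustrative reading of the q/r arithmetic, which is split
 * up mainly to keep the intermediate products from overflowing.)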
2830 */ 2831 int64_t q, r; 2832 2833 do { 2834 old = new = lpl->lpl_loadavg; 2835 q = (old >> 16) << 7; 2836 r = (old & 0xffff) << 7; 2837 new += ((long long)(nrcpus - q) * f - 2838 ((r * f) >> 16)) >> 7; 2839 2840 /* 2841 * Check for overflow 2842 */ 2843 if (new > LGRP_LOADAVG_MAX) 2844 new = LGRP_LOADAVG_MAX; 2845 else if (new < 0) 2846 new = 0; 2847 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2848 new) != old); 2849 } else { 2850 /* 2851 * We're supposed to update the load, but not age it. 2852 * This option is used to update the load (which either 2853 * has already been aged in this 1/10 sec. interval or 2854 * soon will be) to account for a remotely executing 2855 * thread. 2856 */ 2857 do { 2858 old = new = lpl->lpl_loadavg; 2859 new += f; 2860 /* 2861 * Check for overflow 2862 * Underflow not possible here 2863 */ 2864 if (new < old) 2865 new = LGRP_LOADAVG_MAX; 2866 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2867 new) != old); 2868 } 2869 2870 /* 2871 * Do the same for this lpl's parent 2872 */ 2873 if ((lpl = lpl->lpl_parent) == NULL) 2874 break; 2875 ncpu = lpl->lpl_ncpu; 2876 } 2877 } 2878 2879 /* 2880 * Initialize lpl topology in the target based on topology currently present in 2881 * lpl_bootstrap. 2882 * 2883 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2884 * initialize cp_default list of lpls. Up to this point all topology operations 2885 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2886 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2887 * `target' points to the list of lpls in cp_default and `size' is the size of 2888 * this list. 2889 * 2890 * This function walks the lpl topology in lpl_bootstrap and does for things: 2891 * 2892 * 1) Copies all fields from lpl_bootstrap to the target. 2893 * 2894 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2895 * 2896 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2897 * instead of lpl_bootstrap. 2898 * 2899 * 4) Updates pointers in the resource list of the target to point to the lpls 2900 * in the target list instead of lpl_bootstrap. 2901 * 2902 * After lpl_topo_bootstrap() completes, target contains the same information 2903 * that would be present there if it were used during boot instead of 2904 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2905 * and it is bzeroed. 2906 */ 2907 void 2908 lpl_topo_bootstrap(lpl_t *target, int size) 2909 { 2910 lpl_t *lpl = lpl_bootstrap; 2911 lpl_t *target_lpl = target; 2912 int howmany; 2913 int id; 2914 int i; 2915 2916 /* 2917 * The only target that should be passed here is cp_default lpl list. 2918 */ 2919 ASSERT(target == cp_default.cp_lgrploads); 2920 ASSERT(size == cp_default.cp_nlgrploads); 2921 ASSERT(!lgrp_topo_initialized); 2922 ASSERT(ncpus == 1); 2923 2924 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2925 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2926 /* 2927 * Copy all fields from lpl. 2928 */ 2929 2930 *target_lpl = *lpl; 2931 2932 /* 2933 * Substitute CPU0 lpl pointer with one relative to target. 2934 */ 2935 if (lpl->lpl_cpus == CPU) { 2936 ASSERT(CPU->cpu_lpl == lpl); 2937 CPU->cpu_lpl = target_lpl; 2938 } 2939 2940 /* 2941 * Substitute parent information with parent relative to target. 
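 * (The relocation below is just new = target + (old - lpl_bootstrap), so
 * the parent keeps the same index in the target list that it had in the
 * bootstrap list.)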
2942 */ 2943 if (lpl->lpl_parent != NULL) 2944 target_lpl->lpl_parent = (lpl_t *) 2945 (((uintptr_t)lpl->lpl_parent - 2946 (uintptr_t)lpl_bootstrap) + 2947 (uintptr_t)target); 2948 2949 /* 2950 * Walk over resource set substituting pointers relative to 2951 * lpl_bootstrap to pointers relative to target. 2952 */ 2953 ASSERT(lpl->lpl_nrset <= 1); 2954 2955 for (id = 0; id < lpl->lpl_nrset; id++) { 2956 if (lpl->lpl_rset[id] != NULL) { 2957 target_lpl->lpl_rset[id] = 2958 (lpl_t *) 2959 (((uintptr_t)lpl->lpl_rset[id] - 2960 (uintptr_t)lpl_bootstrap) + 2961 (uintptr_t)target); 2962 } 2963 } 2964 } 2965 2966 /* 2967 * Topology information in lpl_bootstrap is no longer needed. 2968 */ 2969 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2970 } 2971 2972 /* 2973 * If the lowest load among the lgroups a process' threads are currently 2974 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2975 * expanding the process to a new lgroup. 2976 */ 2977 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2978 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2979 2980 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2981 ((lgrp_expand_proc_thresh) / (ncpu)) 2982 2983 /* 2984 * A process will be expanded to a new lgroup only if the difference between 2985 * the lowest load on the lgroups the process' thread's are currently spread 2986 * across and the lowest load on the other lgroups in the process' partition 2987 * is greater than lgrp_expand_proc_diff. 2988 */ 2989 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2990 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2991 2992 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2993 ((lgrp_expand_proc_diff) / (ncpu)) 2994 2995 /* 2996 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2997 * be present due to impreciseness of the load average decay algorithm. 2998 * 2999 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 3000 * tolerance is scaled by the number of cpus in the lgroup just like 3001 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 3002 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 3003 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 3004 */ 3005 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 3006 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 3007 ((lgrp_loadavg_tolerance) / ncpu) 3008 3009 /* 3010 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 3011 * average is above this threshold 3012 */ 3013 uint32_t lgrp_load_thresh = UINT32_MAX; 3014 3015 /* 3016 * lgrp_choose() will try to skip any lgroups with less memory 3017 * than this free when choosing a home lgroup 3018 */ 3019 pgcnt_t lgrp_mem_free_thresh = 0; 3020 3021 /* 3022 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 3023 * one based on one of the following policies: 3024 * - Random selection 3025 * - Pseudo round robin placement 3026 * - Longest time since a thread was last placed 3027 */ 3028 #define LGRP_CHOOSE_RANDOM 1 3029 #define LGRP_CHOOSE_RR 2 3030 #define LGRP_CHOOSE_TIME 3 3031 3032 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 3033 3034 /* 3035 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 3036 * be bound to a CPU or processor set. 3037 * 3038 * Arguments: 3039 * t The thread 3040 * cpupart The partition the thread belongs to. 
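 *
 * Returns:
 *        Pointer to the lpl chosen as the thread's home lgroup; never NULL.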
3041 * 3042 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3043 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3044 * partitions changing out from under us and assumes that given thread is 3045 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3046 * disabled, so don't grab any locks because we should never block under 3047 * those conditions. 3048 */ 3049 lpl_t * 3050 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 3051 { 3052 lgrp_load_t bestload, bestrload; 3053 int lgrpid_offset, lgrp_count; 3054 lgrp_id_t lgrpid, lgrpid_start; 3055 lpl_t *lpl, *bestlpl, *bestrlpl; 3056 klgrpset_t lgrpset; 3057 proc_t *p; 3058 3059 ASSERT(t != NULL); 3060 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3061 THREAD_LOCK_HELD(t)); 3062 ASSERT(cpupart != NULL); 3063 3064 p = t->t_procp; 3065 3066 /* A process should always be in an active partition */ 3067 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3068 3069 bestlpl = bestrlpl = NULL; 3070 bestload = bestrload = LGRP_LOADAVG_MAX; 3071 lgrpset = cpupart->cp_lgrpset; 3072 3073 switch (lgrp_choose_policy) { 3074 case LGRP_CHOOSE_RR: 3075 lgrpid = cpupart->cp_lgrp_hint; 3076 do { 3077 if (++lgrpid > lgrp_alloc_max) 3078 lgrpid = 0; 3079 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3080 3081 break; 3082 default: 3083 case LGRP_CHOOSE_TIME: 3084 case LGRP_CHOOSE_RANDOM: 3085 klgrpset_nlgrps(lgrpset, lgrp_count); 3086 lgrpid_offset = 3087 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3088 for (lgrpid = 0; ; lgrpid++) { 3089 if (klgrpset_ismember(lgrpset, lgrpid)) { 3090 if (--lgrpid_offset == 0) 3091 break; 3092 } 3093 } 3094 break; 3095 } 3096 3097 lgrpid_start = lgrpid; 3098 3099 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3100 lgrp_id_t, cpupart->cp_lgrp_hint); 3101 3102 /* 3103 * Use lgroup affinities (if any) to choose best lgroup 3104 * 3105 * NOTE: Assumes that thread is protected from going away and its 3106 * lgroup affinities won't change (ie. p_lock, or 3107 * thread_lock() being held and/or CPUs paused) 3108 */ 3109 if (t->t_lgrp_affinity) { 3110 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 3111 if (lpl != NULL) 3112 return (lpl); 3113 } 3114 3115 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3116 3117 do { 3118 pgcnt_t npgs; 3119 3120 /* 3121 * Skip any lgroups outside of thread's pset 3122 */ 3123 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3124 if (++lgrpid > lgrp_alloc_max) 3125 lgrpid = 0; /* wrap the search */ 3126 continue; 3127 } 3128 3129 /* 3130 * Skip any non-leaf lgroups 3131 */ 3132 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3133 continue; 3134 3135 /* 3136 * Skip any lgroups without enough free memory 3137 * (when threshold set to nonzero positive value) 3138 */ 3139 if (lgrp_mem_free_thresh > 0) { 3140 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3141 if (npgs < lgrp_mem_free_thresh) { 3142 if (++lgrpid > lgrp_alloc_max) 3143 lgrpid = 0; /* wrap the search */ 3144 continue; 3145 } 3146 } 3147 3148 lpl = &cpupart->cp_lgrploads[lgrpid]; 3149 if (klgrpset_isempty(p->p_lgrpset) || 3150 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3151 /* 3152 * Either this is a new process or the process already 3153 * has threads on this lgrp, so this is a preferred 3154 * lgroup for the thread. 
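 * (lpl_pick() below decides whether this candidate beats the current best:
 * either it is significantly less loaded, or, under LGRP_CHOOSE_TIME, it is
 * slightly less loaded and has gone longer since a thread was homed there.)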
3155 */ 3156 if (bestlpl == NULL || 3157 lpl_pick(lpl, bestlpl)) { 3158 bestload = lpl->lpl_loadavg; 3159 bestlpl = lpl; 3160 } 3161 } else { 3162 /* 3163 * The process doesn't have any threads on this lgrp, 3164 * but we're willing to consider this lgrp if the load 3165 * difference is big enough to justify splitting up 3166 * the process' threads. 3167 */ 3168 if (bestrlpl == NULL || 3169 lpl_pick(lpl, bestrlpl)) { 3170 bestrload = lpl->lpl_loadavg; 3171 bestrlpl = lpl; 3172 } 3173 } 3174 if (++lgrpid > lgrp_alloc_max) 3175 lgrpid = 0; /* wrap the search */ 3176 } while (lgrpid != lgrpid_start); 3177 3178 /* 3179 * Return root lgroup if threshold isn't set to maximum value and 3180 * lowest lgroup load average more than a certain threshold 3181 */ 3182 if (lgrp_load_thresh != UINT32_MAX && 3183 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3184 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3185 3186 /* 3187 * If all the lgroups over which the thread's process is spread are 3188 * heavily loaded, or otherwise undesirable, we'll consider placing 3189 * the thread on one of the other leaf lgroups in the thread's 3190 * partition. 3191 */ 3192 if ((bestlpl == NULL) || 3193 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3194 (bestrload < bestload) && /* paranoid about wraparound */ 3195 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3196 bestload))) { 3197 bestlpl = bestrlpl; 3198 } 3199 3200 if (bestlpl == NULL) { 3201 /* 3202 * No lgroup looked particularly good, but we still 3203 * have to pick something. Go with the randomly selected 3204 * legal lgroup we started with above. 3205 */ 3206 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3207 } 3208 3209 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3210 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3211 3212 ASSERT(bestlpl->lpl_ncpu > 0); 3213 return (bestlpl); 3214 } 3215 3216 /* 3217 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3218 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3219 */ 3220 static int 3221 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3222 { 3223 lgrp_load_t l1, l2; 3224 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3225 3226 l1 = lpl1->lpl_loadavg; 3227 l2 = lpl2->lpl_loadavg; 3228 3229 if ((l1 + tolerance < l2) && (l1 < l2)) { 3230 /* lpl1 is significantly less loaded than lpl2 */ 3231 return (1); 3232 } 3233 3234 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3235 l1 + tolerance >= l2 && l1 < l2 && 3236 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3237 /* 3238 * lpl1's load is within the tolerance of lpl2. We're 3239 * willing to consider it be to better however if 3240 * it has been longer since we last homed a thread there 3241 */ 3242 return (1); 3243 } 3244 3245 return (0); 3246 } 3247 3248 /* 3249 * An LWP is expected to be assigned to an lgroup for at least this long 3250 * for its anticipatory load to be justified. NOTE that this value should 3251 * not be set extremely huge (say, larger than 100 years), to avoid problems 3252 * with overflow in the calculation that uses it. 3253 */ 3254 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3255 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3256 3257 /* 3258 * Routine to change a thread's lgroup affiliation. This routine updates 3259 * the thread's kthread_t struct and its process' proc_t struct to note the 3260 * thread's new lgroup affiliation, and its lgroup affinities. 
3261 * 3262 * Note that this is the only routine that modifies a thread's t_lpl field, 3263 * and that adds in or removes anticipatory load. 3264 * 3265 * If the thread is exiting, newlpl is NULL. 3266 * 3267 * Locking: 3268 * The following lock must be held on entry: 3269 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3270 * doesn't get removed from t's partition 3271 * 3272 * This routine is not allowed to grab any locks, since it may be called 3273 * with cpus paused (such as from cpu_offline). 3274 */ 3275 void 3276 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3277 { 3278 proc_t *p; 3279 lpl_t *lpl, *oldlpl; 3280 lgrp_id_t oldid; 3281 kthread_t *tp; 3282 uint_t ncpu; 3283 lgrp_load_t old, new; 3284 3285 ASSERT(t); 3286 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3287 THREAD_LOCK_HELD(t)); 3288 3289 /* 3290 * If not changing lpls, just return 3291 */ 3292 if ((oldlpl = t->t_lpl) == newlpl) 3293 return; 3294 3295 /* 3296 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3297 * associated with process 0 rather than with its original process). 3298 */ 3299 if (t->t_proc_flag & TP_LWPEXIT) { 3300 if (newlpl != NULL) { 3301 t->t_lpl = newlpl; 3302 } 3303 return; 3304 } 3305 3306 p = ttoproc(t); 3307 3308 /* 3309 * If the thread had a previous lgroup, update its process' p_lgrpset 3310 * to account for it being moved from its old lgroup. 3311 */ 3312 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3313 (p->p_tlist != NULL)) { 3314 oldid = oldlpl->lpl_lgrpid; 3315 3316 if (newlpl != NULL) 3317 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3318 3319 if ((do_lgrpset_delete) && 3320 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3321 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3322 /* 3323 * Check if a thread other than the thread 3324 * that's moving is assigned to the same 3325 * lgroup as the thread that's moving. Note 3326 * that we have to compare lgroup IDs, rather 3327 * than simply comparing t_lpl's, since the 3328 * threads may belong to different partitions 3329 * but be assigned to the same lgroup. 3330 */ 3331 ASSERT(tp->t_lpl != NULL); 3332 3333 if ((tp != t) && 3334 (tp->t_lpl->lpl_lgrpid == oldid)) { 3335 /* 3336 * Another thread is assigned to the 3337 * same lgroup as the thread that's 3338 * moving, p_lgrpset doesn't change. 3339 */ 3340 break; 3341 } else if (tp == p->p_tlist) { 3342 /* 3343 * No other thread is assigned to the 3344 * same lgroup as the exiting thread, 3345 * clear the lgroup's bit in p_lgrpset. 3346 */ 3347 klgrpset_del(p->p_lgrpset, oldid); 3348 break; 3349 } 3350 } 3351 } 3352 3353 /* 3354 * If this thread was assigned to its old lgroup for such a 3355 * short amount of time that the anticipatory load that was 3356 * added on its behalf has aged very little, remove that 3357 * anticipatory load. 3358 */ 3359 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3360 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3361 lpl = oldlpl; 3362 for (;;) { 3363 do { 3364 old = new = lpl->lpl_loadavg; 3365 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3366 if (new > old) { 3367 /* 3368 * this can happen if the load 3369 * average was aged since we 3370 * added in the anticipatory 3371 * load 3372 */ 3373 new = 0; 3374 } 3375 } while (cas32( 3376 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3377 new) != old); 3378 3379 lpl = lpl->lpl_parent; 3380 if (lpl == NULL) 3381 break; 3382 3383 ncpu = lpl->lpl_ncpu; 3384 ASSERT(ncpu > 0); 3385 } 3386 } 3387 } 3388 /* 3389 * If the thread has a new lgroup (i.e. 
it's not exiting), update its 3390 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3391 * to its new lgroup to account for its move to its new lgroup. 3392 */ 3393 if (newlpl != NULL) { 3394 /* 3395 * This thread is moving to a new lgroup 3396 */ 3397 t->t_lpl = newlpl; 3398 3399 /* 3400 * Reflect move in load average of new lgroup 3401 * unless it is root lgroup 3402 */ 3403 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3404 return; 3405 3406 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3407 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3408 } 3409 3410 /* 3411 * It'll take some time for the load on the new lgroup 3412 * to reflect this thread's placement on it. We'd 3413 * like not, however, to have all threads between now 3414 * and then also piling on to this lgroup. To avoid 3415 * this pileup, we anticipate the load this thread 3416 * will generate on its new lgroup. The goal is to 3417 * make the lgroup's load appear as though the thread 3418 * had been there all along. We're very conservative 3419 * in calculating this anticipatory load, we assume 3420 * the worst case case (100% CPU-bound thread). This 3421 * may be modified in the future to be more accurate. 3422 */ 3423 lpl = newlpl; 3424 for (;;) { 3425 ncpu = lpl->lpl_ncpu; 3426 ASSERT(ncpu > 0); 3427 do { 3428 old = new = lpl->lpl_loadavg; 3429 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3430 /* 3431 * Check for overflow 3432 * Underflow not possible here 3433 */ 3434 if (new < old) 3435 new = UINT32_MAX; 3436 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3437 new) != old); 3438 3439 lpl = lpl->lpl_parent; 3440 if (lpl == NULL) 3441 break; 3442 } 3443 t->t_anttime = gethrtime(); 3444 } 3445 } 3446 3447 /* 3448 * Return lgroup memory allocation policy given advice from madvise(3C) 3449 */ 3450 lgrp_mem_policy_t 3451 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3452 { 3453 switch (advice) { 3454 case MADV_ACCESS_LWP: 3455 return (LGRP_MEM_POLICY_NEXT); 3456 case MADV_ACCESS_MANY: 3457 return (LGRP_MEM_POLICY_RANDOM); 3458 default: 3459 return (lgrp_mem_policy_default(size, type)); 3460 } 3461 } 3462 3463 /* 3464 * Figure out default policy 3465 */ 3466 lgrp_mem_policy_t 3467 lgrp_mem_policy_default(size_t size, int type) 3468 { 3469 cpupart_t *cp; 3470 lgrp_mem_policy_t policy; 3471 size_t pset_mem_size; 3472 3473 /* 3474 * Randomly allocate memory across lgroups for shared memory 3475 * beyond a certain threshold 3476 */ 3477 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3478 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3479 /* 3480 * Get total memory size of current thread's pset 3481 */ 3482 kpreempt_disable(); 3483 cp = curthread->t_cpupart; 3484 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3485 kpreempt_enable(); 3486 3487 /* 3488 * Choose policy to randomly allocate memory across 3489 * lgroups in pset if it will fit and is not default 3490 * partition. Otherwise, allocate memory randomly 3491 * across machine. 3492 */ 3493 if (lgrp_mem_pset_aware && size < pset_mem_size) 3494 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3495 else 3496 policy = LGRP_MEM_POLICY_RANDOM; 3497 } else 3498 /* 3499 * Apply default policy for private memory and 3500 * shared memory under the respective random 3501 * threshold. 
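 * (For instance, a small MAP_PRIVATE mapping that falls under
 * lgrp_privm_random_thresh ends up here and simply inherits
 * lgrp_mem_default_policy, which is normally next-touch.)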
3502 */ 3503 policy = lgrp_mem_default_policy; 3504 3505 return (policy); 3506 } 3507 3508 /* 3509 * Get memory allocation policy for this segment 3510 */ 3511 lgrp_mem_policy_info_t * 3512 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3513 { 3514 lgrp_mem_policy_info_t *policy_info; 3515 extern struct seg_ops segspt_ops; 3516 extern struct seg_ops segspt_shmops; 3517 3518 /* 3519 * This is for binary compatibility to protect against third party 3520 * segment drivers which haven't recompiled to allow for 3521 * SEGOP_GETPOLICY() 3522 */ 3523 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3524 seg->s_ops != &segspt_shmops) 3525 return (NULL); 3526 3527 policy_info = NULL; 3528 if (seg->s_ops->getpolicy != NULL) 3529 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3530 3531 return (policy_info); 3532 } 3533 3534 /* 3535 * Set policy for allocating private memory given desired policy, policy info, 3536 * size in bytes of memory that policy is being applied. 3537 * Return 0 if policy wasn't set already and 1 if policy was set already 3538 */ 3539 int 3540 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3541 lgrp_mem_policy_info_t *policy_info, size_t size) 3542 { 3543 3544 ASSERT(policy_info != NULL); 3545 3546 if (policy == LGRP_MEM_POLICY_DEFAULT) 3547 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3548 3549 /* 3550 * Policy set already? 3551 */ 3552 if (policy == policy_info->mem_policy) 3553 return (1); 3554 3555 /* 3556 * Set policy 3557 */ 3558 policy_info->mem_policy = policy; 3559 policy_info->mem_reserved = 0; 3560 3561 return (0); 3562 } 3563 3564 3565 /* 3566 * Get shared memory allocation policy with given tree and offset 3567 */ 3568 lgrp_mem_policy_info_t * 3569 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3570 u_offset_t vn_off) 3571 { 3572 u_offset_t off; 3573 lgrp_mem_policy_info_t *policy_info; 3574 lgrp_shm_policy_seg_t *policy_seg; 3575 lgrp_shm_locality_t *shm_locality; 3576 avl_tree_t *tree; 3577 avl_index_t where; 3578 3579 /* 3580 * Get policy segment tree from anon_map or vnode and use specified 3581 * anon index or vnode offset as offset 3582 * 3583 * Assume that no lock needs to be held on anon_map or vnode, since 3584 * they should be protected by their reference count which must be 3585 * nonzero for an existing segment 3586 */ 3587 if (amp) { 3588 ASSERT(amp->refcnt != 0); 3589 shm_locality = amp->locality; 3590 if (shm_locality == NULL) 3591 return (NULL); 3592 tree = shm_locality->loc_tree; 3593 off = ptob(anon_index); 3594 } else if (vp) { 3595 shm_locality = vp->v_locality; 3596 if (shm_locality == NULL) 3597 return (NULL); 3598 ASSERT(shm_locality->loc_count != 0); 3599 tree = shm_locality->loc_tree; 3600 off = vn_off; 3601 } 3602 3603 if (tree == NULL) 3604 return (NULL); 3605 3606 /* 3607 * Lookup policy segment for offset into shared object and return 3608 * policy info 3609 */ 3610 rw_enter(&shm_locality->loc_lock, RW_READER); 3611 policy_info = NULL; 3612 policy_seg = avl_find(tree, &off, &where); 3613 if (policy_seg) 3614 policy_info = &policy_seg->shm_policy; 3615 rw_exit(&shm_locality->loc_lock); 3616 3617 return (policy_info); 3618 } 3619 3620 /* 3621 * Default memory allocation policy for kernel segmap pages 3622 */ 3623 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3624 3625 /* 3626 * Return lgroup to use for allocating memory 3627 * given the segment and address 3628 * 3629 * There isn't any mutual exclusion that exists between calls 3630 * to this routine and DR, so this 
routine and whomever calls it 3631 * should be mindful of the possibility that the lgrp returned 3632 * may be deleted. If this happens, dereferences of the lgrp 3633 * pointer will still be safe, but the resources in the lgrp will 3634 * be gone, and LGRP_EXISTS() will no longer be true. 3635 */ 3636 lgrp_t * 3637 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3638 { 3639 int i; 3640 lgrp_t *lgrp; 3641 klgrpset_t lgrpset; 3642 int lgrps_spanned; 3643 unsigned long off; 3644 lgrp_mem_policy_t policy; 3645 lgrp_mem_policy_info_t *policy_info; 3646 ushort_t random; 3647 int stat = 0; 3648 extern struct seg *segkmap; 3649 3650 /* 3651 * Just return null if the lgrp framework hasn't finished 3652 * initializing or if this is a UMA machine. 3653 */ 3654 if (nlgrps == 1 || !lgrp_initialized) 3655 return (lgrp_root); 3656 3657 /* 3658 * Get memory allocation policy for this segment 3659 */ 3660 policy = lgrp_mem_default_policy; 3661 if (seg != NULL) { 3662 if (seg->s_as == &kas) { 3663 if (seg == segkmap) 3664 policy = lgrp_segmap_default_policy; 3665 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3666 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3667 policy = LGRP_MEM_POLICY_RANDOM; 3668 } else { 3669 policy_info = lgrp_mem_policy_get(seg, vaddr); 3670 if (policy_info != NULL) 3671 policy = policy_info->mem_policy; 3672 } 3673 } 3674 lgrpset = 0; 3675 3676 /* 3677 * Initialize lgroup to home by default 3678 */ 3679 lgrp = lgrp_home_lgrp(); 3680 3681 /* 3682 * When homing threads on root lgrp, override default memory 3683 * allocation policies with root lgroup memory allocation policy 3684 */ 3685 if (lgrp == lgrp_root) 3686 policy = lgrp_mem_policy_root; 3687 3688 /* 3689 * Implement policy 3690 */ 3691 switch (policy) { 3692 case LGRP_MEM_POLICY_NEXT_CPU: 3693 3694 /* 3695 * Return lgroup of current CPU which faulted on memory 3696 * If the CPU isn't currently in an lgrp, then opt to 3697 * allocate from the root. 3698 * 3699 * Kernel preemption needs to be disabled here to prevent 3700 * the current CPU from going away before lgrp is found. 3701 */ 3702 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3703 lgrp = lgrp_root; 3704 } else { 3705 kpreempt_disable(); 3706 lgrp = lgrp_cpu_to_lgrp(CPU); 3707 kpreempt_enable(); 3708 } 3709 break; 3710 3711 case LGRP_MEM_POLICY_NEXT: 3712 case LGRP_MEM_POLICY_DEFAULT: 3713 default: 3714 3715 /* 3716 * Just return current thread's home lgroup 3717 * for default policy (next touch) 3718 * If the thread is homed to the root, 3719 * then the default policy is random across lgroups. 3720 * Fallthrough to the random case. 3721 */ 3722 if (lgrp != lgrp_root) { 3723 if (policy == LGRP_MEM_POLICY_NEXT) 3724 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3725 else 3726 lgrp_stat_add(lgrp->lgrp_id, 3727 LGRP_NUM_DEFAULT, 1); 3728 break; 3729 } 3730 /* LINTED fallthrough on case statement */ 3731 case LGRP_MEM_POLICY_RANDOM: 3732 3733 /* 3734 * Return a random leaf lgroup with memory 3735 */ 3736 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3737 /* 3738 * Count how many lgroups are spanned 3739 */ 3740 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3741 3742 /* 3743 * There may be no memnodes in the root lgroup during DR copy 3744 * rename on a system with only two boards (memnodes) 3745 * configured. In this case just return the root lgrp. 
3746 */ 3747 if (lgrps_spanned == 0) { 3748 lgrp = lgrp_root; 3749 break; 3750 } 3751 3752 /* 3753 * Pick a random offset within lgroups spanned 3754 * and return lgroup at that offset 3755 */ 3756 random = (ushort_t)gethrtime() >> 4; 3757 off = random % lgrps_spanned; 3758 ASSERT(off <= lgrp_alloc_max); 3759 3760 for (i = 0; i <= lgrp_alloc_max; i++) { 3761 if (!klgrpset_ismember(lgrpset, i)) 3762 continue; 3763 if (off) 3764 off--; 3765 else { 3766 lgrp = lgrp_table[i]; 3767 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3768 1); 3769 break; 3770 } 3771 } 3772 break; 3773 3774 case LGRP_MEM_POLICY_RANDOM_PROC: 3775 3776 /* 3777 * Grab copy of bitmask of lgroups spanned by 3778 * this process 3779 */ 3780 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3781 stat = LGRP_NUM_RANDOM_PROC; 3782 3783 /* LINTED fallthrough on case statement */ 3784 case LGRP_MEM_POLICY_RANDOM_PSET: 3785 3786 if (!stat) 3787 stat = LGRP_NUM_RANDOM_PSET; 3788 3789 if (klgrpset_isempty(lgrpset)) { 3790 /* 3791 * Grab copy of bitmask of lgroups spanned by 3792 * this processor set 3793 */ 3794 kpreempt_disable(); 3795 klgrpset_copy(lgrpset, 3796 curthread->t_cpupart->cp_lgrpset); 3797 kpreempt_enable(); 3798 } 3799 3800 /* 3801 * Count how many lgroups are spanned 3802 */ 3803 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3804 ASSERT(lgrps_spanned <= nlgrps); 3805 3806 /* 3807 * Probably lgrps_spanned should be always non-zero, but to be 3808 * on the safe side we return lgrp_root if it is empty. 3809 */ 3810 if (lgrps_spanned == 0) { 3811 lgrp = lgrp_root; 3812 break; 3813 } 3814 3815 /* 3816 * Pick a random offset within lgroups spanned 3817 * and return lgroup at that offset 3818 */ 3819 random = (ushort_t)gethrtime() >> 4; 3820 off = random % lgrps_spanned; 3821 ASSERT(off <= lgrp_alloc_max); 3822 3823 for (i = 0; i <= lgrp_alloc_max; i++) { 3824 if (!klgrpset_ismember(lgrpset, i)) 3825 continue; 3826 if (off) 3827 off--; 3828 else { 3829 lgrp = lgrp_table[i]; 3830 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3831 1); 3832 break; 3833 } 3834 } 3835 break; 3836 3837 case LGRP_MEM_POLICY_ROUNDROBIN: 3838 3839 /* 3840 * Use offset within segment to determine 3841 * offset from home lgroup to choose for 3842 * next lgroup to allocate memory from 3843 */ 3844 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3845 (lgrp_alloc_max + 1); 3846 3847 kpreempt_disable(); 3848 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3849 i = lgrp->lgrp_id; 3850 kpreempt_enable(); 3851 3852 while (off > 0) { 3853 i = (i + 1) % (lgrp_alloc_max + 1); 3854 lgrp = lgrp_table[i]; 3855 if (klgrpset_ismember(lgrpset, i)) 3856 off--; 3857 } 3858 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3859 3860 break; 3861 } 3862 3863 ASSERT(lgrp != NULL); 3864 return (lgrp); 3865 } 3866 3867 /* 3868 * Return the number of pages in an lgroup 3869 * 3870 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3871 * could cause tests that rely on the numat driver to fail.... 
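 *
 * Typical use (sketch; lgrp_choose() above does essentially this when
 * enforcing lgrp_mem_free_thresh):
 *
 *        npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);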
3872 */ 3873 pgcnt_t 3874 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3875 { 3876 lgrp_t *lgrp; 3877 3878 lgrp = lgrp_table[lgrpid]; 3879 if (!LGRP_EXISTS(lgrp) || 3880 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3881 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3882 return (0); 3883 3884 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3885 } 3886 3887 /* 3888 * Initialize lgroup shared memory allocation policy support 3889 */ 3890 void 3891 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3892 { 3893 lgrp_shm_locality_t *shm_locality; 3894 3895 /* 3896 * Initialize locality field in anon_map 3897 * Don't need any locks because this is called when anon_map is 3898 * allocated, but not used anywhere yet. 3899 */ 3900 if (amp) { 3901 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3902 if (amp->locality == NULL) { 3903 /* 3904 * Allocate and initialize shared memory locality info 3905 * and set anon_map locality pointer to it 3906 * Drop lock across kmem_alloc(KM_SLEEP) 3907 */ 3908 ANON_LOCK_EXIT(&amp->a_rwlock); 3909 shm_locality = kmem_alloc(sizeof (*shm_locality), 3910 KM_SLEEP); 3911 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3912 NULL); 3913 shm_locality->loc_count = 1; /* not used for amp */ 3914 shm_locality->loc_tree = NULL; 3915 3916 /* 3917 * Reacquire lock and check to see whether anyone beat 3918 * us to initializing the locality info 3919 */ 3920 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3921 if (amp->locality != NULL) { 3922 rw_destroy(&shm_locality->loc_lock); 3923 kmem_free(shm_locality, 3924 sizeof (*shm_locality)); 3925 } else 3926 amp->locality = shm_locality; 3927 } 3928 ANON_LOCK_EXIT(&amp->a_rwlock); 3929 return; 3930 } 3931 3932 /* 3933 * Allocate shared vnode policy info if vnode is not locality aware yet 3934 */ 3935 mutex_enter(&vp->v_lock); 3936 if ((vp->v_flag & V_LOCALITY) == 0) { 3937 /* 3938 * Allocate and initialize shared memory locality info 3939 */ 3940 mutex_exit(&vp->v_lock); 3941 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3942 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3943 shm_locality->loc_count = 1; 3944 shm_locality->loc_tree = NULL; 3945 3946 /* 3947 * Point vnode locality field at shared vnode policy info 3948 * and set locality aware flag in vnode 3949 */ 3950 mutex_enter(&vp->v_lock); 3951 if ((vp->v_flag & V_LOCALITY) == 0) { 3952 vp->v_locality = shm_locality; 3953 vp->v_flag |= V_LOCALITY; 3954 } else { 3955 /* 3956 * Lost race so free locality info and increment count.
/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
	lgrp_shm_policy_seg_t	*cur;
	lgrp_shm_policy_seg_t	*next;

	if (tree == NULL)
		return;

	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
	while (cur != NULL) {
		next = AVL_NEXT(tree, cur);
		avl_remove(tree, cur);
		kmem_free(cur, sizeof (*cur));
		cur = next;
	}
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * For anon_map, deallocate shared memory policy tree and
	 * zero locality field
	 * Don't need any locks because anon_map is being freed
	 */
	if (amp) {
		if (amp->locality == NULL)
			return;
		shm_locality = amp->locality;
		shm_locality->loc_count = 0;	/* not really used for amp */
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		amp->locality = 0;
		return;
	}

	/*
	 * For vnode, decrement reference count of segments mapping this vnode
	 * shared and delete locality info if reference count drops to 0
	 */
	mutex_enter(&vp->v_lock);
	shm_locality = vp->v_locality;
	shm_locality->loc_count--;

	if (shm_locality->loc_count == 0) {
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		vp->v_locality = 0;
		vp->v_flag &= ~V_LOCALITY;
	}
	mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

	if (a->shm_off < b->shm_off)
		return (-1);
	if (a->shm_off >= b->shm_off + b->shm_size)
		return (1);
	return (0);
}

/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
	if (!seg1 || !seg2 ||
	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
		return (-1);

	seg1->shm_size += seg2->shm_size;
	avl_remove(tree, seg2);
	kmem_free(seg2, sizeof (*seg2));
	return (0);
}
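
/*
 * Note on lgrp_shm_policy_compar() above: it treats any offset that falls
 * inside [shm_off, shm_off + shm_size) as "equal" to that segment, so a
 * lookup such as
 *
 *	seg = avl_find(tree, &off, &where);
 *
 * (as done in lgrp_shm_policy_set() below) returns the policy segment
 * containing byte offset "off", or NULL with "where" set to the insertion
 * point when no segment covers that offset.  Passing a bare u_offset_t as
 * the search key works because only the key's shm_off is examined, which
 * relies on shm_off being the first member of lgrp_shm_policy_seg_t.
 */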
/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
	lgrp_shm_policy_seg_t	*newseg;
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
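
/*
 * For example, splitting a segment that covers [0, 16K) at offset 8K shrinks
 * the original segment to [0, 8K) and returns a newly inserted segment
 * covering [8K, 16K) that inherits the same policy.  Splitting exactly at
 * either boundary leaves the tree unchanged and simply returns the original
 * segment.
 */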
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
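
/*
 * Example (illustrative only): advising random placement across the first
 * 64 pages of a shared anon_map would look roughly like
 *
 *	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp, 0, NULL, 0,
 *	    ptob(64));
 *
 * and the same call with amp == NULL plus a vnode and vnode offset covers
 * the MAP_SHARED vnode case.  The return value only distinguishes whether
 * the policy was already in effect (1), newly applied (0), or could not be
 * set (-1); callers (typically the segment drivers handling memory advice)
 * decide what to do with that.
 */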
/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
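
/*
 * Example (illustrative only): a physical page allocator that wants to try
 * the memnodes of a given lgroup first and then fall back to the rest of the
 * hierarchy would drive the cookie roughly as follows, assuming the
 * LGRP_MNODE_COOKIE_INIT() initializer and LGRP_SRCH_HIER scope from
 * <sys/lgrp.h>:
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		... try to allocate a page from "mnode" ...
 *	}
 *
 * Each call records the returned memnode in the cookie, so a memnode is never
 * returned twice for the same cookie; -1 means the requested scope (local
 * lgroup only, or the whole hierarchy) has been exhausted.
 */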