/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like
 * hardware resources are close to each other. Currently, latency is the only
 * measure used to determine how to group hardware resources into lgroups,
 * but this does not limit the groupings to be based solely on latency. Other
 * factors may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine. There is always at least a root lgroup in
 * the system. It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency. A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root). In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups, where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine. Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread. At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
 * with the lowest load average. Binding to a processor or processor set will
 * change the home lgroup for a thread. The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.
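 *
 * As a purely illustrative sketch (not part of the original comment), a
 * two-socket NUMA machine might be represented as:
 *
 *	root lgroup   - all CPUs and memory at system-wide latency
 *	  +- leaf lgroup 1 - CPUs and memory of socket 0
 *	  +- leaf lgroup 2 - CPUs and memory of socket 1
 *
 * so a thread homed in leaf lgroup 1 is preferentially dispatched on one of
 * socket 0's CPUs.
 *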
 * Physical memory allocation is lgroup aware too, so memory will be
 * allocated from the current thread's home lgroup if possible. If the
 * desired resources are not available, the kernel traverses the lgroup
 * hierarchy going to the parent lgroup to find resources at the next level
 * of locality until it reaches the root lgroup.
 */

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/chip.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework. It is protected from
 * parallel modifications by lgrp_kstat_mutex. This may cause some contention
 * when several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is
 * brought on-line, when cp_default is initialized by
 * cpupart_initialize_default(). Configuring CPU0 may create a two-level
 * topology with root and one leaf node containing CPU0. This topology is
 * initially constructed in a special statically allocated 2-element lpl list
 * lpl_bootstrap_list and later cloned to cp_default when cp_default is
 * initialized. The lpl_bootstrap_list is used for all lpl operations until
 * cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs a default lpl should use lpl_bootstrap, which is a
 * pointer to the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl.
 * This is necessary because on some architectures (x86) it's possible for the
 * slave CPU startup thread to enter the dispatcher or allocate memory before
 * calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory. Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory. Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy. For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
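 *
 * As an illustrative (hypothetical) example of such an override, a platform
 * could assign a different lgrp_mem_policy_t value before lgrp_main_init()
 * runs, or an /etc/system entry could set the variable directly, e.g.
 *
 *	set lgrp_mem_default_policy = <numeric value of the desired policy>
 *
 * lgrp_main_init() resets any out-of-range setting back to
 * LGRP_MEM_POLICY_NEXT.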
204 */ 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 206 207 208 /* 209 * lgroup CPU event handlers 210 */ 211 static void lgrp_cpu_init(struct cpu *); 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 214 215 static void lgrp_latency_change(u_longlong_t, u_longlong_t); 216 217 /* 218 * lgroup memory event handlers 219 */ 220 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 221 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 222 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 223 224 /* 225 * lgroup CPU partition event handlers 226 */ 227 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 228 static void lgrp_part_del_cpu(struct cpu *); 229 230 static void lgrp_root_init(void); 231 232 /* 233 * lpl topology 234 */ 235 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 236 static void lpl_clear(lpl_t *); 237 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 238 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 239 static void lpl_rset_add(lpl_t *, lpl_t *); 240 static void lpl_rset_del(lpl_t *, lpl_t *); 241 static int lpl_rset_contains(lpl_t *, lpl_t *); 242 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 243 static void lpl_child_update(lpl_t *, struct cpupart *); 244 static int lpl_pick(lpl_t *, lpl_t *); 245 static void lpl_verify_wrapper(struct cpupart *); 246 247 /* 248 * defines for lpl topology verifier return codes 249 */ 250 251 #define LPL_TOPO_CORRECT 0 252 #define LPL_TOPO_PART_HAS_NO_LPL -1 253 #define LPL_TOPO_CPUS_NOT_EMPTY -2 254 #define LPL_TOPO_LGRP_MISMATCH -3 255 #define LPL_TOPO_MISSING_PARENT -4 256 #define LPL_TOPO_PARENT_MISMATCH -5 257 #define LPL_TOPO_BAD_CPUCNT -6 258 #define LPL_TOPO_RSET_MISMATCH -7 259 #define LPL_TOPO_LPL_ORPHANED -8 260 #define LPL_TOPO_LPL_BAD_NCPU -9 261 #define LPL_TOPO_RSET_MSSNG_LF -10 262 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 263 #define LPL_TOPO_BOGUS_HINT -12 264 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 265 #define LPL_TOPO_LGRP_NOT_LEAF -14 266 #define LPL_TOPO_BAD_RSETCNT -15 267 268 /* 269 * Return whether lgroup optimizations should be enabled on this system 270 */ 271 int 272 lgrp_optimizations(void) 273 { 274 /* 275 * System must have more than 2 lgroups to enable lgroup optimizations 276 * 277 * XXX This assumes that a 2 lgroup system has an empty root lgroup 278 * with one child lgroup containing all the resources. A 2 lgroup 279 * system with a root lgroup directly containing CPUs or memory might 280 * need lgroup optimizations with its child lgroup, but there 281 * isn't such a machine for now.... 
282 */ 283 if (nlgrps > 2) 284 return (1); 285 286 return (0); 287 } 288 289 /* 290 * Build full lgroup topology 291 */ 292 static void 293 lgrp_root_init(void) 294 { 295 lgrp_handle_t hand; 296 int i; 297 lgrp_id_t id; 298 299 /* 300 * Create the "root" lgroup 301 */ 302 ASSERT(nlgrps == 0); 303 id = nlgrps++; 304 305 lgrp_root = &lroot; 306 307 lgrp_root->lgrp_cpu = NULL; 308 lgrp_root->lgrp_mnodes = 0; 309 lgrp_root->lgrp_nmnodes = 0; 310 hand = lgrp_plat_root_hand(); 311 lgrp_root->lgrp_plathand = hand; 312 313 lgrp_root->lgrp_id = id; 314 lgrp_root->lgrp_cpucnt = 0; 315 lgrp_root->lgrp_childcnt = 0; 316 klgrpset_clear(lgrp_root->lgrp_children); 317 klgrpset_clear(lgrp_root->lgrp_leaves); 318 lgrp_root->lgrp_parent = NULL; 319 lgrp_root->lgrp_chips = NULL; 320 lgrp_root->lgrp_chipcnt = 0; 321 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 322 323 for (i = 0; i < LGRP_RSRC_COUNT; i++) 324 klgrpset_clear(lgrp_root->lgrp_set[i]); 325 326 lgrp_root->lgrp_kstat = NULL; 327 328 lgrp_table[id] = lgrp_root; 329 330 /* 331 * Setup initial lpl list for CPU0 and initial t0 home. 332 * The only lpl space we have so far is lpl_bootstrap. It is used for 333 * all topology operations until cp_default is initialized at which 334 * point t0.t_lpl will be updated. 335 */ 336 lpl_bootstrap = lpl_bootstrap_list; 337 t0.t_lpl = lpl_bootstrap; 338 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 339 lpl_bootstrap_list[1].lpl_lgrpid = 1; 340 cp_default.cp_lgrploads = lpl_bootstrap; 341 } 342 343 /* 344 * Initialize the lgroup framework and allow the platform to do the same 345 */ 346 void 347 lgrp_init(void) 348 { 349 /* 350 * Initialize the platform 351 */ 352 lgrp_plat_init(); 353 354 /* 355 * Set max number of lgroups supported on this platform which must be 356 * less than the max number of lgroups supported by the common lgroup 357 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 358 */ 359 nlgrpsmax = lgrp_plat_max_lgrps(); 360 ASSERT(nlgrpsmax <= NLGRPS_MAX); 361 } 362 363 /* 364 * Create the root and cpu0's lgroup, and set t0's home. 365 */ 366 void 367 lgrp_setup(void) 368 { 369 /* 370 * Setup the root lgroup 371 */ 372 lgrp_root_init(); 373 374 /* 375 * Add cpu0 to an lgroup 376 */ 377 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 378 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 379 } 380 381 /* 382 * Lgroup initialization is split in two parts. The first part 383 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 384 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 385 * when all CPUs are brought online and all distance information is available. 386 * 387 * When lgrp_main_init() is complete it sets lgrp_initialized. The 388 * lgrp_main_mp_init() sets lgrp_topo_initialized. 389 */ 390 391 /* 392 * true when lgrp initialization has been completed. 393 */ 394 int lgrp_initialized = 0; 395 396 /* 397 * True when lgrp topology is constructed. 398 */ 399 int lgrp_topo_initialized = 0; 400 401 /* 402 * Init routine called after startup(), /etc/system has been processed, 403 * and cpu0 has been added to an lgroup. 404 */ 405 void 406 lgrp_main_init(void) 407 { 408 cpu_t *cp = CPU; 409 lgrp_id_t lgrpid; 410 int i; 411 /* 412 * Enforce a valid lgrp_mem_default_policy 413 */ 414 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 415 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES)) 416 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 417 418 /* 419 * See if mpo should be disabled. 
 * This may happen in the case of null proc LPA on Starcat.
 * The platform won't be able to detect null proc LPA until after
 * cpu0 and memory have already been added to lgroups.
 * When and if it is detected, the Starcat platform will return
 * a different platform handle for cpu0, which is what we check for
 * here. If mpo should be disabled, move cpu0 to its rightful place
 * (the root), and destroy the remaining lgroups. This effectively
 * provides a UMA lgroup topology.
 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUs are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
	 */
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)resource;

		/*
		 * Initialize the new CPU's lgrp related next/prev
		 * links, and give it a bootstrap lpl so that it can
		 * survive should it need to enter the dispatcher.
517 */ 518 cp->cpu_next_lpl = cp; 519 cp->cpu_prev_lpl = cp; 520 cp->cpu_next_lgrp = cp; 521 cp->cpu_prev_lgrp = cp; 522 cp->cpu_lpl = lpl_bootstrap; 523 524 lgrp_plat_config(event, resource); 525 atomic_add_32(&lgrp_gen, 1); 526 527 break; 528 case LGRP_CONFIG_CPU_DEL: 529 lgrp_plat_config(event, resource); 530 atomic_add_32(&lgrp_gen, 1); 531 532 break; 533 case LGRP_CONFIG_CPU_ONLINE: 534 cp = (cpu_t *)resource; 535 lgrp_cpu_init(cp); 536 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 537 rc = lpl_topo_verify(cp->cpu_part); 538 if (rc != LPL_TOPO_CORRECT) { 539 panic("lpl_topo_verify failed: %d", rc); 540 } 541 lgrp_plat_config(event, resource); 542 atomic_add_32(&lgrp_gen, 1); 543 544 break; 545 case LGRP_CONFIG_CPU_OFFLINE: 546 cp = (cpu_t *)resource; 547 id = cp->cpu_lpl->lpl_lgrpid; 548 lgrp_part_del_cpu(cp); 549 lgrp_cpu_fini(cp, id); 550 rc = lpl_topo_verify(cp->cpu_part); 551 if (rc != LPL_TOPO_CORRECT) { 552 panic("lpl_topo_verify failed: %d", rc); 553 } 554 lgrp_plat_config(event, resource); 555 atomic_add_32(&lgrp_gen, 1); 556 557 break; 558 case LGRP_CONFIG_CPUPART_ADD: 559 cp = (cpu_t *)resource; 560 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 561 rc = lpl_topo_verify(cp->cpu_part); 562 if (rc != LPL_TOPO_CORRECT) { 563 panic("lpl_topo_verify failed: %d", rc); 564 } 565 lgrp_plat_config(event, resource); 566 567 break; 568 case LGRP_CONFIG_CPUPART_DEL: 569 cp = (cpu_t *)resource; 570 lgrp_part_del_cpu((cpu_t *)resource); 571 rc = lpl_topo_verify(cp->cpu_part); 572 if (rc != LPL_TOPO_CORRECT) { 573 panic("lpl_topo_verify failed: %d", rc); 574 } 575 lgrp_plat_config(event, resource); 576 577 break; 578 /* 579 * The following events are initiated by the memnode 580 * subsystem. 581 */ 582 case LGRP_CONFIG_MEM_ADD: 583 lgrp_mem_init((int)resource, where, B_FALSE); 584 atomic_add_32(&lgrp_gen, 1); 585 586 break; 587 case LGRP_CONFIG_MEM_DEL: 588 lgrp_mem_fini((int)resource, where, B_FALSE); 589 atomic_add_32(&lgrp_gen, 1); 590 591 break; 592 case LGRP_CONFIG_MEM_RENAME: { 593 lgrp_config_mem_rename_t *ren_arg = 594 (lgrp_config_mem_rename_t *)where; 595 596 lgrp_mem_rename((int)resource, 597 ren_arg->lmem_rename_from, 598 ren_arg->lmem_rename_to); 599 atomic_add_32(&lgrp_gen, 1); 600 601 break; 602 } 603 case LGRP_CONFIG_GEN_UPDATE: 604 atomic_add_32(&lgrp_gen, 1); 605 606 break; 607 case LGRP_CONFIG_FLATTEN: 608 if (where == 0) 609 lgrp_topo_levels = (int)resource; 610 else 611 (void) lgrp_topo_flatten(resource, 612 lgrp_table, lgrp_alloc_max, &changed); 613 614 break; 615 /* 616 * Initiated by platform latency probing code 617 */ 618 case LGRP_CONFIG_LATENCY_CHANGE: 619 lgrp_latency_change((u_longlong_t)resource, 620 (u_longlong_t)where); 621 622 break; 623 case LGRP_CONFIG_NOP: 624 625 break; 626 default: 627 break; 628 } 629 630 } 631 632 /* 633 * Called to add lgrp info into cpu structure from cpu_add_unit; 634 * do not assume cpu is in cpu[] yet! 635 * 636 * CPUs are brought online with all other CPUs paused so we can't 637 * allocate memory or we could deadlock the system, so we rely on 638 * the platform to statically allocate as much space as we need 639 * for the lgrp structs and stats. 640 */ 641 static void 642 lgrp_cpu_init(struct cpu *cp) 643 { 644 klgrpset_t changed; 645 int count; 646 lgrp_handle_t hand; 647 int first_cpu; 648 lgrp_t *my_lgrp; 649 lgrp_id_t lgrpid; 650 struct cpu *cptr; 651 struct chip *chp; 652 653 /* 654 * This is the first time through if the resource set 655 * for the root lgroup is empty. 
After cpu0 has been 656 * initially added to an lgroup, the root's CPU resource 657 * set can never be empty, since the system's last CPU 658 * cannot be offlined. 659 */ 660 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 661 /* 662 * First time through. 663 */ 664 first_cpu = 1; 665 } else { 666 /* 667 * If cpu0 needs to move lgroups, we may come 668 * through here again, at which time cpu_lock won't 669 * be held, and lgrp_initialized will be false. 670 */ 671 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 672 ASSERT(cp->cpu_part != NULL); 673 first_cpu = 0; 674 } 675 676 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 677 my_lgrp = lgrp_hand_to_lgrp(hand); 678 679 if (my_lgrp == NULL) { 680 /* 681 * Create new lgrp and add it to lgroup topology 682 */ 683 my_lgrp = lgrp_create(); 684 my_lgrp->lgrp_plathand = hand; 685 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 686 lgrpid = my_lgrp->lgrp_id; 687 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 688 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 689 690 count = 0; 691 klgrpset_clear(changed); 692 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 693 &changed); 694 /* 695 * May have added new intermediate lgroups, so need to add 696 * resources other than CPUs which are added below 697 */ 698 (void) lgrp_mnode_update(changed, NULL); 699 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 700 > 0) { 701 /* 702 * Leaf lgroup was created, but latency wasn't available 703 * then. So, set latency for it and fill in rest of lgroup 704 * topology now that we know how far it is from other leaf 705 * lgroups. 706 */ 707 lgrpid = my_lgrp->lgrp_id; 708 klgrpset_clear(changed); 709 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 710 lgrpid)) 711 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 712 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 713 &changed); 714 715 /* 716 * May have added new intermediate lgroups, so need to add 717 * resources other than CPUs which are added below 718 */ 719 (void) lgrp_mnode_update(changed, NULL); 720 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 721 my_lgrp->lgrp_id)) { 722 int i; 723 724 /* 725 * Update existing lgroup and lgroups containing it with CPU 726 * resource 727 */ 728 lgrpid = my_lgrp->lgrp_id; 729 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 730 for (i = 0; i <= lgrp_alloc_max; i++) { 731 lgrp_t *lgrp; 732 733 lgrp = lgrp_table[i]; 734 if (!LGRP_EXISTS(lgrp) || 735 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 736 continue; 737 738 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 739 } 740 } 741 742 lgrpid = my_lgrp->lgrp_id; 743 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 744 745 /* 746 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 747 * end up in lpl for lgroup 0 whether it is supposed to be in there or 748 * not since none of lgroup IDs in the lpl's have been set yet. 
749 */ 750 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 751 cp->cpu_lpl->lpl_lgrpid = lgrpid; 752 753 /* 754 * link the CPU into the lgrp's CPU list 755 */ 756 if (my_lgrp->lgrp_cpucnt == 0) { 757 my_lgrp->lgrp_cpu = cp; 758 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 759 } else { 760 cptr = my_lgrp->lgrp_cpu; 761 cp->cpu_next_lgrp = cptr; 762 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 763 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 764 cptr->cpu_prev_lgrp = cp; 765 } 766 my_lgrp->lgrp_cpucnt++; 767 768 /* 769 * Add this cpu's chip to the per lgroup list 770 * if necessary 771 */ 772 if (cp->cpu_chip->chip_lgrp == NULL) { 773 struct chip *lcpr; 774 775 chp = cp->cpu_chip; 776 777 if (my_lgrp->lgrp_chipcnt == 0) { 778 my_lgrp->lgrp_chips = chp; 779 chp->chip_next_lgrp = 780 chp->chip_prev_lgrp = chp; 781 } else { 782 lcpr = my_lgrp->lgrp_chips; 783 chp->chip_next_lgrp = lcpr; 784 chp->chip_prev_lgrp = 785 lcpr->chip_prev_lgrp; 786 lcpr->chip_prev_lgrp->chip_next_lgrp = 787 chp; 788 lcpr->chip_prev_lgrp = chp; 789 } 790 chp->chip_lgrp = my_lgrp; 791 chp->chip_balance = chp->chip_next_lgrp; 792 my_lgrp->lgrp_chipcnt++; 793 } 794 } 795 796 lgrp_t * 797 lgrp_create(void) 798 { 799 lgrp_t *my_lgrp; 800 lgrp_id_t lgrpid; 801 int i; 802 803 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 804 805 /* 806 * Find an open slot in the lgroup table and recycle unused lgroup 807 * left there if any 808 */ 809 my_lgrp = NULL; 810 if (lgrp_alloc_hint == -1) 811 /* 812 * Allocate from end when hint not set yet because no lgroups 813 * have been deleted yet 814 */ 815 lgrpid = nlgrps++; 816 else { 817 /* 818 * Start looking for next open slot from hint and leave hint 819 * at slot allocated 820 */ 821 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 822 my_lgrp = lgrp_table[i]; 823 if (!LGRP_EXISTS(my_lgrp)) { 824 lgrpid = i; 825 nlgrps++; 826 break; 827 } 828 } 829 lgrp_alloc_hint = lgrpid; 830 } 831 832 /* 833 * Keep track of max lgroup ID allocated so far to cut down on searches 834 */ 835 if (lgrpid > lgrp_alloc_max) 836 lgrp_alloc_max = lgrpid; 837 838 /* 839 * Need to allocate new lgroup if next open slot didn't have one 840 * for recycling 841 */ 842 if (my_lgrp == NULL) 843 my_lgrp = lgrp_plat_alloc(lgrpid); 844 845 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 846 panic("Too many lgrps for platform (%d)", nlgrps); 847 848 my_lgrp->lgrp_id = lgrpid; 849 my_lgrp->lgrp_latency = 0; 850 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 851 my_lgrp->lgrp_parent = NULL; 852 my_lgrp->lgrp_childcnt = 0; 853 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 854 my_lgrp->lgrp_nmnodes = 0; 855 klgrpset_clear(my_lgrp->lgrp_children); 856 klgrpset_clear(my_lgrp->lgrp_leaves); 857 for (i = 0; i < LGRP_RSRC_COUNT; i++) 858 klgrpset_clear(my_lgrp->lgrp_set[i]); 859 860 my_lgrp->lgrp_cpu = NULL; 861 my_lgrp->lgrp_cpucnt = 0; 862 my_lgrp->lgrp_chips = NULL; 863 my_lgrp->lgrp_chipcnt = 0; 864 865 if (my_lgrp->lgrp_kstat != NULL) 866 lgrp_kstat_reset(lgrpid); 867 868 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 869 870 return (my_lgrp); 871 } 872 873 void 874 lgrp_destroy(lgrp_t *lgrp) 875 { 876 int i; 877 878 /* 879 * Unless this lgroup is being destroyed on behalf of 880 * the boot CPU, cpu_lock must be held 881 */ 882 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 883 884 if (nlgrps == 1) 885 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 886 887 if (!LGRP_EXISTS(lgrp)) 888 return; 889 890 /* 891 * Set hint to lgroup being deleted and try to keep lower numbered 892 * hints to facilitate finding empty slots 893 */ 
894 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 895 lgrp_alloc_hint = lgrp->lgrp_id; 896 897 /* 898 * Mark this lgroup to be recycled by setting its lgroup ID to 899 * LGRP_NONE and clear relevant fields 900 */ 901 lgrp->lgrp_id = LGRP_NONE; 902 lgrp->lgrp_latency = 0; 903 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 904 lgrp->lgrp_parent = NULL; 905 lgrp->lgrp_childcnt = 0; 906 907 klgrpset_clear(lgrp->lgrp_children); 908 klgrpset_clear(lgrp->lgrp_leaves); 909 for (i = 0; i < LGRP_RSRC_COUNT; i++) 910 klgrpset_clear(lgrp->lgrp_set[i]); 911 912 lgrp->lgrp_mnodes = (mnodeset_t)0; 913 lgrp->lgrp_nmnodes = 0; 914 915 lgrp->lgrp_cpu = NULL; 916 lgrp->lgrp_cpucnt = 0; 917 lgrp->lgrp_chipcnt = 0; 918 lgrp->lgrp_chips = NULL; 919 920 nlgrps--; 921 } 922 923 /* 924 * Initialize kstat data. Called from lgrp intialization code. 925 */ 926 static void 927 lgrp_kstat_init(void) 928 { 929 lgrp_stat_t stat; 930 931 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 932 933 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 934 kstat_named_init(&lgrp_kstat_data[stat], 935 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 936 } 937 938 /* 939 * initialize an lgrp's kstats if needed 940 * called with cpu_lock held but not with cpus paused. 941 * we don't tear these down now because we don't know about 942 * memory leaving the lgrp yet... 943 */ 944 945 void 946 lgrp_kstat_create(cpu_t *cp) 947 { 948 kstat_t *lgrp_kstat; 949 lgrp_id_t lgrpid; 950 lgrp_t *my_lgrp; 951 952 ASSERT(MUTEX_HELD(&cpu_lock)); 953 954 lgrpid = cp->cpu_lpl->lpl_lgrpid; 955 my_lgrp = lgrp_table[lgrpid]; 956 957 if (my_lgrp->lgrp_kstat != NULL) 958 return; /* already initialized */ 959 960 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 961 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 962 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 963 964 if (lgrp_kstat != NULL) { 965 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 966 lgrp_kstat->ks_private = my_lgrp; 967 lgrp_kstat->ks_data = &lgrp_kstat_data; 968 lgrp_kstat->ks_update = lgrp_kstat_extract; 969 my_lgrp->lgrp_kstat = lgrp_kstat; 970 kstat_install(lgrp_kstat); 971 } 972 } 973 974 /* 975 * this will do something when we manage to remove now unused lgrps 976 */ 977 978 /* ARGSUSED */ 979 void 980 lgrp_kstat_destroy(cpu_t *cp) 981 { 982 ASSERT(MUTEX_HELD(&cpu_lock)); 983 } 984 985 /* 986 * Called when a CPU is off-lined. 987 */ 988 static void 989 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 990 { 991 lgrp_t *my_lgrp; 992 struct cpu *prev; 993 struct cpu *next; 994 chip_t *chp; 995 996 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 997 998 prev = cp->cpu_prev_lgrp; 999 next = cp->cpu_next_lgrp; 1000 1001 prev->cpu_next_lgrp = next; 1002 next->cpu_prev_lgrp = prev; 1003 1004 /* 1005 * just because I'm paranoid doesn't mean... 1006 */ 1007 1008 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1009 1010 my_lgrp = lgrp_table[lgrpid]; 1011 my_lgrp->lgrp_cpucnt--; 1012 1013 /* 1014 * If the last CPU on it's chip is being offlined 1015 * then remove this chip from the per lgroup list. 1016 * 1017 * This is also done for the boot CPU when it needs 1018 * to move between lgroups as a consequence of 1019 * null proc lpa. 
1020 */ 1021 chp = cp->cpu_chip; 1022 if (chp->chip_ncpu == 0 || !lgrp_initialized) { 1023 1024 chip_t *chpp; 1025 1026 if (--my_lgrp->lgrp_chipcnt == 0) 1027 my_lgrp->lgrp_chips = NULL; 1028 else if (my_lgrp->lgrp_chips == chp) 1029 my_lgrp->lgrp_chips = chp->chip_next_lgrp; 1030 1031 /* 1032 * Walk this lgroup's chip list looking for chips that 1033 * may try to balance against the one that's leaving 1034 */ 1035 for (chpp = chp->chip_next_lgrp; chpp != chp; 1036 chpp = chpp->chip_next_lgrp) { 1037 if (chpp->chip_balance == chp) 1038 chpp->chip_balance = chp->chip_next_lgrp; 1039 } 1040 1041 chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp; 1042 chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp; 1043 1044 chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL; 1045 chp->chip_lgrp = NULL; 1046 chp->chip_balance = NULL; 1047 } 1048 1049 /* 1050 * Removing last CPU in lgroup, so update lgroup topology 1051 */ 1052 if (my_lgrp->lgrp_cpucnt == 0) { 1053 klgrpset_t changed; 1054 int count; 1055 int i; 1056 1057 my_lgrp->lgrp_cpu = NULL; 1058 1059 /* 1060 * Remove this lgroup from its lgroup CPU resources and remove 1061 * lgroup from lgroup topology if it doesn't have any more 1062 * resources in it now 1063 */ 1064 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1065 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1066 count = 0; 1067 klgrpset_clear(changed); 1068 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1069 lgrp_alloc_max + 1, &changed); 1070 return; 1071 } 1072 1073 /* 1074 * This lgroup isn't empty, so just remove it from CPU 1075 * resources of any lgroups that contain it as such 1076 */ 1077 for (i = 0; i <= lgrp_alloc_max; i++) { 1078 lgrp_t *lgrp; 1079 1080 lgrp = lgrp_table[i]; 1081 if (!LGRP_EXISTS(lgrp) || 1082 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1083 lgrpid)) 1084 continue; 1085 1086 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1087 } 1088 return; 1089 } 1090 1091 if (my_lgrp->lgrp_cpu == cp) 1092 my_lgrp->lgrp_cpu = next; 1093 1094 } 1095 1096 /* 1097 * Update memory nodes in target lgroups and return ones that get changed 1098 */ 1099 int 1100 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1101 { 1102 int count; 1103 int i; 1104 int j; 1105 lgrp_t *lgrp; 1106 lgrp_t *lgrp_rsrc; 1107 1108 count = 0; 1109 if (changed) 1110 klgrpset_clear(*changed); 1111 1112 if (klgrpset_isempty(target)) 1113 return (0); 1114 1115 /* 1116 * Find each lgroup in target lgroups 1117 */ 1118 for (i = 0; i <= lgrp_alloc_max; i++) { 1119 /* 1120 * Skip any lgroups that don't exist or aren't in target group 1121 */ 1122 lgrp = lgrp_table[i]; 1123 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1124 continue; 1125 } 1126 1127 /* 1128 * Initialize memnodes for intermediate lgroups to 0 1129 * and update them from scratch since they may have completely 1130 * changed 1131 */ 1132 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1133 lgrp->lgrp_mnodes = (mnodeset_t)0; 1134 lgrp->lgrp_nmnodes = 0; 1135 } 1136 1137 /* 1138 * Update memory nodes of of target lgroup with memory nodes 1139 * from each lgroup in its lgroup memory resource set 1140 */ 1141 for (j = 0; j <= lgrp_alloc_max; j++) { 1142 int k; 1143 1144 /* 1145 * Skip any lgroups that don't exist or aren't in 1146 * memory resources of target lgroup 1147 */ 1148 lgrp_rsrc = lgrp_table[j]; 1149 if (!LGRP_EXISTS(lgrp_rsrc) || 1150 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1151 j)) 1152 continue; 1153 1154 /* 1155 * Update target lgroup's memnodes to include memnodes 1156 * of this 
			 * lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}

/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage
 * memory is moved from one board to another. The "from" and "to" arguments
 * specify the source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in
 * the target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() indicating that the insertion and deletion are part of a
 * DR copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in
 * the lgroup hierarchy. If another thread tries to allocate memory during
 * this window, the allocation will fail, although the system has physical
 * memory. This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held, which prevents lgrp_mem_init() from
 * re-inserting the mnode).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was
 * actually removed. The lgrp_mem_init() function recognizes that the mnode
 * being inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
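 *
 * Illustrative sketch of that special case (not part of the original
 * comment), for a system whose only mnode is being copy-renamed:
 *
 *	lgrp_mem_fini(mnode, from, B_TRUE)
 *		- updates the topology but leaves the mnode in
 *		  lgrp_root->lgrp_mnodes so memory allocation keeps working
 *	lgrp_mem_init(mnode, to, B_TRUE)
 *		- sees that this mnode is the only one in the root's set
 *		  and finishes repairing the topology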
1236 */ 1237 void 1238 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1239 { 1240 klgrpset_t changed; 1241 int count; 1242 int i; 1243 lgrp_t *my_lgrp; 1244 lgrp_id_t lgrpid; 1245 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1246 boolean_t drop_lock = B_FALSE; 1247 boolean_t need_synch = B_FALSE; 1248 1249 /* 1250 * Grab CPU lock (if we haven't already) 1251 */ 1252 if (!MUTEX_HELD(&cpu_lock)) { 1253 mutex_enter(&cpu_lock); 1254 drop_lock = B_TRUE; 1255 } 1256 1257 /* 1258 * This routine may be called from a context where we already 1259 * hold cpu_lock, and have already paused cpus. 1260 */ 1261 if (!cpus_paused()) 1262 need_synch = B_TRUE; 1263 1264 /* 1265 * Check if this mnode is already configured and return immediately if 1266 * it is. 1267 * 1268 * NOTE: in special case of copy-rename of the only remaining mnode, 1269 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1270 * recognize this case and continue as usual, but skip the update to 1271 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1272 * in topology, temporarily introduced by lgrp_mem_fini(). 1273 */ 1274 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1275 lgrp_root->lgrp_mnodes & mnodes_mask) { 1276 if (drop_lock) 1277 mutex_exit(&cpu_lock); 1278 return; 1279 } 1280 1281 /* 1282 * Update lgroup topology with new memory resources, keeping track of 1283 * which lgroups change 1284 */ 1285 count = 0; 1286 klgrpset_clear(changed); 1287 my_lgrp = lgrp_hand_to_lgrp(hand); 1288 if (my_lgrp == NULL) { 1289 /* new lgrp */ 1290 my_lgrp = lgrp_create(); 1291 lgrpid = my_lgrp->lgrp_id; 1292 my_lgrp->lgrp_plathand = hand; 1293 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1294 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1295 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1296 1297 if (need_synch) 1298 pause_cpus(NULL); 1299 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1300 &changed); 1301 if (need_synch) 1302 start_cpus(); 1303 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1304 > 0) { 1305 /* 1306 * Leaf lgroup was created, but latency wasn't available 1307 * then. So, set latency for it and fill in rest of lgroup 1308 * topology now that we know how far it is from other leaf 1309 * lgroups. 
1310 */ 1311 klgrpset_clear(changed); 1312 lgrpid = my_lgrp->lgrp_id; 1313 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1314 lgrpid)) 1315 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1316 if (need_synch) 1317 pause_cpus(NULL); 1318 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1319 &changed); 1320 if (need_synch) 1321 start_cpus(); 1322 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1323 my_lgrp->lgrp_id)) { 1324 /* 1325 * Add new lgroup memory resource to existing lgroup 1326 */ 1327 lgrpid = my_lgrp->lgrp_id; 1328 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1329 klgrpset_add(changed, lgrpid); 1330 count++; 1331 for (i = 0; i <= lgrp_alloc_max; i++) { 1332 lgrp_t *lgrp; 1333 1334 lgrp = lgrp_table[i]; 1335 if (!LGRP_EXISTS(lgrp) || 1336 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1337 continue; 1338 1339 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1340 klgrpset_add(changed, lgrp->lgrp_id); 1341 count++; 1342 } 1343 } 1344 1345 /* 1346 * Add memory node to lgroup and remove lgroup from ones that need 1347 * to be updated 1348 */ 1349 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1350 my_lgrp->lgrp_mnodes |= mnodes_mask; 1351 my_lgrp->lgrp_nmnodes++; 1352 } 1353 klgrpset_del(changed, lgrpid); 1354 1355 /* 1356 * Update memory node information for all lgroups that changed and 1357 * contain new memory node as a resource 1358 */ 1359 if (count) 1360 (void) lgrp_mnode_update(changed, NULL); 1361 1362 if (drop_lock) 1363 mutex_exit(&cpu_lock); 1364 } 1365 1366 /* 1367 * Called to indicate that the lgroup associated with the platform 1368 * handle "hand" no longer contains given memory node 1369 * 1370 * LOCKING for this routine is a bit tricky. Usually it is called without 1371 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1372 * callers. During DR of the board containing the caged memory it may be called 1373 * with cpu_lock already held and CPUs paused. 1374 * 1375 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1376 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1377 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1378 * the same mnode back into the topology. See lgrp_mem_rename() and 1379 * lgrp_mem_init() for additional details. 1380 */ 1381 void 1382 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1383 { 1384 klgrpset_t changed; 1385 int count; 1386 int i; 1387 lgrp_t *my_lgrp; 1388 lgrp_id_t lgrpid; 1389 mnodeset_t mnodes_mask; 1390 boolean_t drop_lock = B_FALSE; 1391 boolean_t need_synch = B_FALSE; 1392 1393 /* 1394 * Grab CPU lock (if we haven't already) 1395 */ 1396 if (!MUTEX_HELD(&cpu_lock)) { 1397 mutex_enter(&cpu_lock); 1398 drop_lock = B_TRUE; 1399 } 1400 1401 /* 1402 * This routine may be called from a context where we already 1403 * hold cpu_lock and have already paused cpus. 
1404 */ 1405 if (!cpus_paused()) 1406 need_synch = B_TRUE; 1407 1408 my_lgrp = lgrp_hand_to_lgrp(hand); 1409 1410 /* 1411 * The lgrp *must* be pre-existing 1412 */ 1413 ASSERT(my_lgrp != NULL); 1414 1415 /* 1416 * Delete memory node from lgroups which contain it 1417 */ 1418 mnodes_mask = ((mnodeset_t)1 << mnode); 1419 for (i = 0; i <= lgrp_alloc_max; i++) { 1420 lgrp_t *lgrp = lgrp_table[i]; 1421 /* 1422 * Skip any non-existent lgroups and any lgroups that don't 1423 * contain leaf lgroup of memory as a memory resource 1424 */ 1425 if (!LGRP_EXISTS(lgrp) || 1426 !(lgrp->lgrp_mnodes & mnodes_mask)) 1427 continue; 1428 1429 /* 1430 * Avoid removing the last mnode from the root in the DR 1431 * copy-rename case. See lgrp_mem_rename() for details. 1432 */ 1433 if (is_copy_rename && 1434 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1435 continue; 1436 1437 /* 1438 * Remove memory node from lgroup. 1439 */ 1440 lgrp->lgrp_mnodes &= ~mnodes_mask; 1441 lgrp->lgrp_nmnodes--; 1442 ASSERT(lgrp->lgrp_nmnodes >= 0); 1443 } 1444 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1445 1446 /* 1447 * Don't need to update lgroup topology if this lgroup still has memory. 1448 * 1449 * In the special case of DR copy-rename with the only mnode being 1450 * removed, the lgrp_mnodes for the root is always non-zero, but we 1451 * still need to update the lgroup topology. 1452 */ 1453 if ((my_lgrp->lgrp_nmnodes > 0) && 1454 !(is_copy_rename && 1455 (my_lgrp == lgrp_root) && 1456 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1457 if (drop_lock) 1458 mutex_exit(&cpu_lock); 1459 return; 1460 } 1461 1462 /* 1463 * This lgroup does not contain any memory now 1464 */ 1465 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1466 1467 /* 1468 * Remove this lgroup from lgroup topology if it does not contain any 1469 * resources now 1470 */ 1471 lgrpid = my_lgrp->lgrp_id; 1472 count = 0; 1473 klgrpset_clear(changed); 1474 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1475 /* 1476 * Delete lgroup when no more resources 1477 */ 1478 if (need_synch) 1479 pause_cpus(NULL); 1480 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1481 lgrp_alloc_max + 1, &changed); 1482 ASSERT(count > 0); 1483 if (need_synch) 1484 start_cpus(); 1485 } else { 1486 /* 1487 * Remove lgroup from memory resources of any lgroups that 1488 * contain it as such 1489 */ 1490 for (i = 0; i <= lgrp_alloc_max; i++) { 1491 lgrp_t *lgrp; 1492 1493 lgrp = lgrp_table[i]; 1494 if (!LGRP_EXISTS(lgrp) || 1495 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1496 lgrpid)) 1497 continue; 1498 1499 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1500 } 1501 } 1502 if (drop_lock) 1503 mutex_exit(&cpu_lock); 1504 } 1505 1506 /* 1507 * Return lgroup with given platform handle 1508 */ 1509 lgrp_t * 1510 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1511 { 1512 int i; 1513 lgrp_t *lgrp; 1514 1515 if (hand == LGRP_NULL_HANDLE) 1516 return (NULL); 1517 1518 for (i = 0; i <= lgrp_alloc_max; i++) { 1519 lgrp = lgrp_table[i]; 1520 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1521 return (lgrp); 1522 } 1523 return (NULL); 1524 } 1525 1526 /* 1527 * Return the home lgroup of the current thread. 1528 * We must do this with kernel preemption disabled, since we don't want our 1529 * thread to be re-homed while we're poking around with its lpl, and the lpl 1530 * should never be NULL. 1531 * 1532 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1533 * is enabled because of DR. 
Callers can use disable kernel preemption 1534 * around this call to guarantee that the lgroup will be valid beyond this 1535 * routine, since kernel preemption can be recursive. 1536 */ 1537 lgrp_t * 1538 lgrp_home_lgrp(void) 1539 { 1540 lgrp_t *lgrp; 1541 lpl_t *lpl; 1542 1543 kpreempt_disable(); 1544 1545 lpl = curthread->t_lpl; 1546 ASSERT(lpl != NULL); 1547 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1548 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1549 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1550 1551 kpreempt_enable(); 1552 1553 return (lgrp); 1554 } 1555 1556 /* 1557 * Return ID of home lgroup for given thread 1558 * (See comments for lgrp_home_lgrp() for special care and handling 1559 * instructions) 1560 */ 1561 lgrp_id_t 1562 lgrp_home_id(kthread_t *t) 1563 { 1564 lgrp_id_t lgrp; 1565 lpl_t *lpl; 1566 1567 ASSERT(t != NULL); 1568 /* 1569 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1570 * cannot since the HAT layer can call into this routine to 1571 * determine the locality for its data structures in the context 1572 * of a page fault. 1573 */ 1574 1575 kpreempt_disable(); 1576 1577 lpl = t->t_lpl; 1578 ASSERT(lpl != NULL); 1579 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1580 lgrp = lpl->lpl_lgrpid; 1581 1582 kpreempt_enable(); 1583 1584 return (lgrp); 1585 } 1586 1587 /* 1588 * Return lgroup containing the physical memory for the given page frame number 1589 */ 1590 lgrp_t * 1591 lgrp_pfn_to_lgrp(pfn_t pfn) 1592 { 1593 lgrp_handle_t hand; 1594 int i; 1595 lgrp_t *lgrp; 1596 1597 hand = lgrp_plat_pfn_to_hand(pfn); 1598 if (hand != LGRP_NULL_HANDLE) 1599 for (i = 0; i <= lgrp_alloc_max; i++) { 1600 lgrp = lgrp_table[i]; 1601 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1602 return (lgrp); 1603 } 1604 return (NULL); 1605 } 1606 1607 /* 1608 * Return lgroup containing the physical memory for the given page frame number 1609 */ 1610 lgrp_t * 1611 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1612 { 1613 lgrp_handle_t hand; 1614 int i; 1615 lgrp_t *lgrp; 1616 pfn_t pfn; 1617 1618 pfn = btop(physaddr); 1619 hand = lgrp_plat_pfn_to_hand(pfn); 1620 if (hand != LGRP_NULL_HANDLE) 1621 for (i = 0; i <= lgrp_alloc_max; i++) { 1622 lgrp = lgrp_table[i]; 1623 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1624 return (lgrp); 1625 } 1626 return (NULL); 1627 } 1628 1629 /* 1630 * Return the leaf lgroup containing the given CPU 1631 * 1632 * The caller needs to take precautions necessary to prevent 1633 * "cpu" from going away across a call to this function. 1634 * hint: kpreempt_disable()/kpreempt_enable() 1635 */ 1636 static lgrp_t * 1637 lgrp_cpu_to_lgrp(cpu_t *cpu) 1638 { 1639 return (cpu->cpu_lpl->lpl_lgrp); 1640 } 1641 1642 /* 1643 * Return the sum of the partition loads in an lgrp divided by 1644 * the number of CPUs in the lgrp. This is our best approximation 1645 * of an 'lgroup load average' for a useful per-lgroup kstat. 
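 *
 * Illustratively (a restatement of the above, not original text):
 *
 *	lgroup loadavg = (sum of lpl_loadavg over the lgroup's CPUs) /
 *	    lgrp_cpucnt
 *
 * computed under cpu_lock by walking the lgroup's circular CPU list; an
 * lgroup with no CPUs reports 0.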
1646 */ 1647 static uint64_t 1648 lgrp_sum_loadavgs(lgrp_t *lgrp) 1649 { 1650 cpu_t *cpu; 1651 int ncpu; 1652 uint64_t loads = 0; 1653 1654 mutex_enter(&cpu_lock); 1655 1656 cpu = lgrp->lgrp_cpu; 1657 ncpu = lgrp->lgrp_cpucnt; 1658 1659 if (cpu == NULL || ncpu == 0) { 1660 mutex_exit(&cpu_lock); 1661 return (0ull); 1662 } 1663 1664 do { 1665 loads += cpu->cpu_lpl->lpl_loadavg; 1666 cpu = cpu->cpu_next_lgrp; 1667 } while (cpu != lgrp->lgrp_cpu); 1668 1669 mutex_exit(&cpu_lock); 1670 1671 return (loads / ncpu); 1672 } 1673 1674 void 1675 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1676 { 1677 struct lgrp_stats *pstats; 1678 1679 /* 1680 * Verify that the caller isn't trying to add to 1681 * a statistic for an lgroup that has gone away 1682 */ 1683 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1684 return; 1685 1686 pstats = &lgrp_stats[lgrpid]; 1687 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1688 } 1689 1690 int64_t 1691 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1692 { 1693 uint64_t val; 1694 struct lgrp_stats *pstats; 1695 1696 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1697 return ((int64_t)0); 1698 1699 pstats = &lgrp_stats[lgrpid]; 1700 LGRP_STAT_READ(pstats, stat, val); 1701 return (val); 1702 } 1703 1704 /* 1705 * Reset all kstats for lgrp specified by its lgrpid. 1706 */ 1707 static void 1708 lgrp_kstat_reset(lgrp_id_t lgrpid) 1709 { 1710 lgrp_stat_t stat; 1711 1712 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1713 return; 1714 1715 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1716 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1717 } 1718 } 1719 1720 /* 1721 * Collect all per-lgrp statistics for the lgrp associated with this 1722 * kstat, and store them in the ks_data array. 1723 * 1724 * The superuser can reset all the running counter statistics for an 1725 * lgrp by writing to any of the lgrp's stats. 1726 */ 1727 static int 1728 lgrp_kstat_extract(kstat_t *ksp, int rw) 1729 { 1730 lgrp_stat_t stat; 1731 struct kstat_named *ksd; 1732 lgrp_t *lgrp; 1733 lgrp_id_t lgrpid; 1734 1735 lgrp = (lgrp_t *)ksp->ks_private; 1736 1737 ksd = (struct kstat_named *)ksp->ks_data; 1738 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1739 1740 lgrpid = lgrp->lgrp_id; 1741 1742 if (lgrpid == LGRP_NONE) { 1743 /* 1744 * Return all zeroes as stats for freed lgrp. 
1745 */ 1746 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1747 ksd[stat].value.i64 = 0; 1748 } 1749 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1750 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1751 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1752 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1753 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1754 } else if (rw != KSTAT_WRITE) { 1755 /* 1756 * Handle counter stats 1757 */ 1758 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1759 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1760 } 1761 1762 /* 1763 * Handle kernel data snapshot stats 1764 */ 1765 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1766 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1767 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1768 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1769 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1770 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1771 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1772 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1773 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1774 lgrp_loadavg_max_effect; 1775 } else { 1776 lgrp_kstat_reset(lgrpid); 1777 } 1778 1779 return (0); 1780 } 1781 1782 int 1783 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1784 { 1785 cpu_t *cp; 1786 1787 mutex_enter(&cpu_lock); 1788 1789 if ((cp = cpu_get(id)) == NULL) { 1790 mutex_exit(&cpu_lock); 1791 return (EINVAL); 1792 } 1793 1794 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1795 mutex_exit(&cpu_lock); 1796 return (EINVAL); 1797 } 1798 1799 ASSERT(cp->cpu_lpl != NULL); 1800 1801 *lp = cp->cpu_lpl->lpl_lgrpid; 1802 1803 mutex_exit(&cpu_lock); 1804 1805 return (0); 1806 } 1807 1808 int 1809 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1810 { 1811 cpu_t *cp; 1812 1813 mutex_enter(&cpu_lock); 1814 1815 if ((cp = cpu_get(id)) == NULL) { 1816 mutex_exit(&cpu_lock); 1817 return (EINVAL); 1818 } 1819 1820 ASSERT(cp->cpu_lpl != NULL); 1821 1822 *lp = cp->cpu_lpl->lpl_loadavg; 1823 1824 mutex_exit(&cpu_lock); 1825 1826 return (0); 1827 } 1828 1829 void 1830 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime) 1831 { 1832 lgrp_t *lgrp; 1833 int i; 1834 1835 for (i = 0; i <= lgrp_alloc_max; i++) { 1836 lgrp = lgrp_table[i]; 1837 1838 if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime)) 1839 lgrp->lgrp_latency = (int)newtime; 1840 } 1841 } 1842 1843 /* 1844 * Add a resource named by lpl_leaf to rset of lpl_target 1845 * 1846 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1847 * resource. It is adjusted here, as this is presently the only place that we 1848 * can be certain a resource addition has succeeded. 1849 * 1850 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1851 * list in order until it reaches a NULL. (This list is required to be NULL 1852 * terminated, too). This is done so that we can mark start pos + 1, so that 1853 * each lpl is traversed sequentially, but in a different order. We hope this 1854 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 
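 *
 * Illustrative example (not part of the original comment): if a non-leaf
 * lpl's rset holds the leaves with lgroup IDs 1, 3 and 5, the array is kept
 * sorted and NULL-terminated as
 *
 *	lpl_rset[] = { lpl_1, lpl_3, lpl_5, NULL, ... }
 *
 * and the hint handed to a child lpl (see lpl_child_update() below) is
 * simply an index into this array, so different consumers start their
 * sequential walk at different offsets rather than all beginning at lpl_1.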
1855 */ 1856 1857 void 1858 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1859 { 1860 int i; 1861 int entry_slot = 0; 1862 1863 /* return if leaf is already present */ 1864 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1865 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1866 return; 1867 } 1868 1869 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1870 lpl_leaf->lpl_lgrpid) { 1871 break; 1872 } 1873 } 1874 1875 /* insert leaf, update counts */ 1876 entry_slot = i; 1877 i = lpl_target->lpl_nrset++; 1878 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1879 panic("More leaf lgrps in system than are supported!\n"); 1880 } 1881 1882 /* 1883 * Start at the end of the rset array and work backwards towards the 1884 * slot into which the new lpl will be inserted. This effectively 1885 * preserves the current ordering by scooting everybody over one entry, 1886 * and placing the new entry into the space created. 1887 */ 1888 1889 while (i-- > entry_slot) { 1890 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1891 } 1892 1893 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1894 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1895 } 1896 1897 /* 1898 * Update each of lpl_parent's children with a proper hint and 1899 * a reference to their parent. 1900 * The lgrp topology is used as the reference since it is fully 1901 * consistent and correct at this point. 1902 * 1903 * Each child's hint will reference an element in lpl_parent's 1904 * rset that designates where the child should start searching 1905 * for CPU resources. The hint selected is the highest order leaf present 1906 * in the child's lineage. 1907 * 1908 * This should be called after any potential change in lpl_parent's 1909 * rset. 1910 */ 1911 static void 1912 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1913 { 1914 klgrpset_t children, leaves; 1915 lpl_t *lpl; 1916 int hint; 1917 int i, j; 1918 1919 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1920 if (klgrpset_isempty(children)) 1921 return; /* nothing to do */ 1922 1923 for (i = 0; i <= lgrp_alloc_max; i++) { 1924 if (klgrpset_ismember(children, i)) { 1925 1926 /* 1927 * Given the set of leaves in this child's lineage, 1928 * find the highest order leaf present in the parent's 1929 * rset. Select this as the hint for the child. 1930 */ 1931 leaves = lgrp_table[i]->lgrp_leaves; 1932 hint = 0; 1933 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1934 lpl = lpl_parent->lpl_rset[j]; 1935 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1936 hint = j; 1937 } 1938 cp->cp_lgrploads[i].lpl_hint = hint; 1939 1940 /* 1941 * (Re)set the parent. It may be incorrect if 1942 * lpl_parent is new in the topology. 1943 */ 1944 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1945 } 1946 } 1947 } 1948 1949 /* 1950 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1951 * 1952 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1953 * resource. The values are adjusted here, as this is the only place that we can 1954 * be certain a resource was successfully deleted. 
1955 */ 1956 void 1957 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1958 { 1959 int i; 1960 1961 /* find leaf in intermediate node */ 1962 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1963 if (lpl_target->lpl_rset[i] == lpl_leaf) 1964 break; 1965 } 1966 1967 /* return if leaf not found */ 1968 if (lpl_target->lpl_rset[i] != lpl_leaf) 1969 return; 1970 1971 /* prune leaf, compress array */ 1972 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1973 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1974 lpl_target->lpl_ncpu--; 1975 do { 1976 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1977 } while (i++ < lpl_target->lpl_nrset); 1978 } 1979 1980 /* 1981 * Check to see if the resource set of the target lpl contains the 1982 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1983 */ 1984 1985 int 1986 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1987 { 1988 int i; 1989 1990 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1991 if (lpl_target->lpl_rset[i] == lpl_leaf) 1992 return (1); 1993 } 1994 1995 return (0); 1996 } 1997 1998 /* 1999 * Called when we change cpu lpl membership. This increments or decrements the 2000 * per-cpu counter in every lpl in which our leaf appears. 2001 */ 2002 void 2003 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 2004 { 2005 cpupart_t *cpupart; 2006 lgrp_t *lgrp_leaf; 2007 lgrp_t *lgrp_cur; 2008 lpl_t *lpl_leaf; 2009 lpl_t *lpl_cur; 2010 int i; 2011 2012 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 2013 2014 cpupart = cp->cpu_part; 2015 lpl_leaf = cp->cpu_lpl; 2016 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 2017 2018 for (i = 0; i <= lgrp_alloc_max; i++) { 2019 lgrp_cur = lgrp_table[i]; 2020 2021 /* 2022 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 2023 * for the cpu in question, or if the current lgrp and leaf 2024 * don't share the same resources. 2025 */ 2026 2027 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 2028 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 2029 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 2030 continue; 2031 2032 2033 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2034 2035 if (lpl_cur->lpl_nrset > 0) { 2036 if (act == LPL_INCREMENT) { 2037 lpl_cur->lpl_ncpu++; 2038 } else if (act == LPL_DECREMENT) { 2039 lpl_cur->lpl_ncpu--; 2040 } 2041 } 2042 } 2043 } 2044 2045 /* 2046 * Initialize lpl with given resources and specified lgrp 2047 */ 2048 2049 void 2050 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2051 { 2052 lpl->lpl_lgrpid = lgrp->lgrp_id; 2053 lpl->lpl_loadavg = 0; 2054 if (lpl == lpl_leaf) 2055 lpl->lpl_ncpu = 1; 2056 else 2057 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2058 lpl->lpl_nrset = 1; 2059 lpl->lpl_rset[0] = lpl_leaf; 2060 lpl->lpl_lgrp = lgrp; 2061 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2062 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2063 } 2064 2065 /* 2066 * Clear an unused lpl 2067 */ 2068 2069 void 2070 lpl_clear(lpl_t *lpl) 2071 { 2072 lgrp_id_t lid; 2073 2074 /* save lid for debugging purposes */ 2075 lid = lpl->lpl_lgrpid; 2076 bzero(lpl, sizeof (lpl_t)); 2077 lpl->lpl_lgrpid = lid; 2078 } 2079 2080 /* 2081 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2082 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2083 * make full use of all of the lgroup topology, but this checks to make sure 2084 * that for the parts that it does use, it has correctly understood the 2085 * relationships that exist. 
This function returns 2086 * 0 if the topology is correct, and a non-zero error code, for non-debug 2087 * kernels if incorrect. Asserts are spread throughout the code to aid in 2088 * debugging on a DEBUG kernel. 2089 */ 2090 int 2091 lpl_topo_verify(cpupart_t *cpupart) 2092 { 2093 lgrp_t *lgrp; 2094 lpl_t *lpl; 2095 klgrpset_t rset; 2096 klgrpset_t cset; 2097 cpu_t *cpu; 2098 cpu_t *cp_start; 2099 int i; 2100 int j; 2101 int sum; 2102 2103 /* topology can't be incorrect if it doesn't exist */ 2104 if (!lgrp_topo_initialized || !lgrp_initialized) 2105 return (LPL_TOPO_CORRECT); 2106 2107 ASSERT(cpupart != NULL); 2108 2109 for (i = 0; i <= lgrp_alloc_max; i++) { 2110 lgrp = lgrp_table[i]; 2111 lpl = NULL; 2112 /* make sure lpls are allocated */ 2113 ASSERT(cpupart->cp_lgrploads); 2114 if (!cpupart->cp_lgrploads) 2115 return (LPL_TOPO_PART_HAS_NO_LPL); 2116 2117 lpl = &cpupart->cp_lgrploads[i]; 2118 /* make sure our index is good */ 2119 ASSERT(i < cpupart->cp_nlgrploads); 2120 2121 /* if lgroup doesn't exist, make sure lpl is empty */ 2122 if (!LGRP_EXISTS(lgrp)) { 2123 ASSERT(lpl->lpl_ncpu == 0); 2124 if (lpl->lpl_ncpu > 0) { 2125 return (LPL_TOPO_CPUS_NOT_EMPTY); 2126 } else { 2127 continue; 2128 } 2129 } 2130 2131 /* verify that lgroup and lpl are identically numbered */ 2132 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2133 2134 /* if lgroup isn't in our partition, make sure lpl is empty */ 2135 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2136 cpupart->cp_lgrpset)) { 2137 ASSERT(lpl->lpl_ncpu == 0); 2138 if (lpl->lpl_ncpu > 0) { 2139 return (LPL_TOPO_CPUS_NOT_EMPTY); 2140 } 2141 /* 2142 * lpl is empty, and lgroup isn't in partition. verify 2143 * that lpl doesn't show up in anyone else's rsets (in 2144 * this partition, anyway) 2145 */ 2146 2147 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2148 lpl_t *i_lpl; /* lpl we're iterating over */ 2149 2150 i_lpl = &cpupart->cp_lgrploads[j]; 2151 2152 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2153 if (lpl_rset_contains(i_lpl, lpl)) { 2154 return (LPL_TOPO_LPL_ORPHANED); 2155 } 2156 } 2157 /* lgroup is empty, and everything is ok. continue */ 2158 continue; 2159 } 2160 2161 2162 /* lgroup is in this partition, now check it against lpl */ 2163 2164 /* do both have matching lgrps? */ 2165 ASSERT(lgrp == lpl->lpl_lgrp); 2166 if (lgrp != lpl->lpl_lgrp) { 2167 return (LPL_TOPO_LGRP_MISMATCH); 2168 } 2169 2170 /* do the parent lgroups exist and do they match? */ 2171 if (lgrp->lgrp_parent) { 2172 ASSERT(lpl->lpl_parent); 2173 ASSERT(lgrp->lgrp_parent->lgrp_id == 2174 lpl->lpl_parent->lpl_lgrpid); 2175 2176 if (!lpl->lpl_parent) { 2177 return (LPL_TOPO_MISSING_PARENT); 2178 } else if (lgrp->lgrp_parent->lgrp_id != 2179 lpl->lpl_parent->lpl_lgrpid) { 2180 return (LPL_TOPO_PARENT_MISMATCH); 2181 } 2182 } 2183 2184 /* only leaf lgroups keep a cpucnt, only check leaves */ 2185 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2186 2187 /* verify that lgrp is also a leaf */ 2188 ASSERT((lgrp->lgrp_childcnt == 0) && 2189 (klgrpset_ismember(lgrp->lgrp_leaves, 2190 lpl->lpl_lgrpid))); 2191 2192 if ((lgrp->lgrp_childcnt > 0) || 2193 (!klgrpset_ismember(lgrp->lgrp_leaves, 2194 lpl->lpl_lgrpid))) { 2195 return (LPL_TOPO_LGRP_NOT_LEAF); 2196 } 2197 2198 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2199 (lpl->lpl_ncpu > 0)); 2200 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2201 (lpl->lpl_ncpu <= 0)) { 2202 return (LPL_TOPO_BAD_CPUCNT); 2203 } 2204 2205 /* 2206 * Check that lpl_ncpu also matches the number of 2207 * cpus in the lpl's linked list. 
This only exists in 2208 * leaves, but they should always match. 2209 */ 2210 j = 0; 2211 cpu = cp_start = lpl->lpl_cpus; 2212 while (cpu != NULL) { 2213 j++; 2214 2215 /* check to make sure cpu's lpl is leaf lpl */ 2216 ASSERT(cpu->cpu_lpl == lpl); 2217 if (cpu->cpu_lpl != lpl) { 2218 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2219 } 2220 2221 /* check next cpu */ 2222 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2223 continue; 2224 } else { 2225 cpu = NULL; 2226 } 2227 } 2228 2229 ASSERT(j == lpl->lpl_ncpu); 2230 if (j != lpl->lpl_ncpu) { 2231 return (LPL_TOPO_LPL_BAD_NCPU); 2232 } 2233 2234 /* 2235 * Also, check that leaf lpl is contained in all 2236 * intermediate lpls that name the leaf as a descendant 2237 */ 2238 2239 for (j = 0; j <= lgrp_alloc_max; j++) { 2240 klgrpset_t intersect; 2241 lgrp_t *lgrp_cand; 2242 lpl_t *lpl_cand; 2243 2244 lgrp_cand = lgrp_table[j]; 2245 intersect = klgrpset_intersects( 2246 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2247 cpupart->cp_lgrpset); 2248 2249 if (!LGRP_EXISTS(lgrp_cand) || 2250 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2251 cpupart->cp_lgrpset) || 2252 (intersect == 0)) 2253 continue; 2254 2255 lpl_cand = 2256 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2257 2258 if (klgrpset_ismember(intersect, 2259 lgrp->lgrp_id)) { 2260 ASSERT(lpl_rset_contains(lpl_cand, 2261 lpl)); 2262 2263 if (!lpl_rset_contains(lpl_cand, lpl)) { 2264 return (LPL_TOPO_RSET_MSSNG_LF); 2265 } 2266 } 2267 } 2268 2269 } else { /* non-leaf specific checks */ 2270 2271 /* 2272 * Non-leaf lpls should have lpl_cpus == NULL 2273 * verify that this is so 2274 */ 2275 ASSERT(lpl->lpl_cpus == NULL); 2276 if (lpl->lpl_cpus != NULL) { 2277 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2278 } 2279 2280 /* 2281 * verify that the sum of the cpus in the leaf resources 2282 * is equal to the total ncpu in the intermediate 2283 */ 2284 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2285 sum += lpl->lpl_rset[j]->lpl_ncpu; 2286 } 2287 2288 ASSERT(sum == lpl->lpl_ncpu); 2289 if (sum != lpl->lpl_ncpu) { 2290 return (LPL_TOPO_LPL_BAD_NCPU); 2291 } 2292 } 2293 2294 /* 2295 * check on lpl_hint. Don't check root, since it has no parent. 2296 */ 2297 if (lpl->lpl_parent != NULL) { 2298 int hint; 2299 lpl_t *hint_lpl; 2300 2301 /* make sure hint is within limits of nrset */ 2302 hint = lpl->lpl_hint; 2303 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2304 if (lpl->lpl_parent->lpl_nrset < hint) { 2305 return (LPL_TOPO_BOGUS_HINT); 2306 } 2307 2308 /* make sure hint points to valid lpl */ 2309 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2310 ASSERT(hint_lpl->lpl_ncpu > 0); 2311 if (hint_lpl->lpl_ncpu <= 0) { 2312 return (LPL_TOPO_BOGUS_HINT); 2313 } 2314 } 2315 2316 /* 2317 * Check the rset of the lpl in question. Make sure that each 2318 * rset contains a subset of the resources in 2319 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2320 * sure that each rset doesn't include resources that are 2321 * outside of that set. (Which would be resources somehow not 2322 * accounted for). 
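 *
 * As a purely illustrative example: if the rset built below names lgroups
 * {1, 3} and lgrp_set[LGRP_RSRC_CPU] is {1, 3, 5}, the klgrpset_diff()
 * leaves the empty set and the check passes; any bit left over would mean
 * the lpl names a CPU resource that its lgroup (or the partition) does not
 * actually contain.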
2323 */ 2324 2325 klgrpset_clear(rset); 2326 for (j = 0; j < lpl->lpl_nrset; j++) { 2327 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2328 } 2329 klgrpset_copy(cset, rset); 2330 /* make sure lpl rset matches lgrp rset */ 2331 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2332 /* make sure rset is contained with in partition, too */ 2333 klgrpset_diff(cset, cpupart->cp_lgrpset); 2334 2335 ASSERT(klgrpset_isempty(rset) && 2336 klgrpset_isempty(cset)); 2337 if (!klgrpset_isempty(rset) || 2338 !klgrpset_isempty(cset)) { 2339 return (LPL_TOPO_RSET_MISMATCH); 2340 } 2341 2342 /* 2343 * check to make sure lpl_nrset matches the number of rsets 2344 * contained in the lpl 2345 */ 2346 2347 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2348 j++); 2349 2350 ASSERT(j == lpl->lpl_nrset); 2351 if (j != lpl->lpl_nrset) { 2352 return (LPL_TOPO_BAD_RSETCNT); 2353 } 2354 2355 } 2356 return (LPL_TOPO_CORRECT); 2357 } 2358 2359 /* 2360 * Flatten lpl topology to given number of levels. This is presently only 2361 * implemented for a flatten to 2 levels, which will prune out the intermediates 2362 * and home the leaf lpls to the root lpl. 2363 */ 2364 int 2365 lpl_topo_flatten(int levels) 2366 { 2367 int i; 2368 uint_t sum; 2369 lgrp_t *lgrp_cur; 2370 lpl_t *lpl_cur; 2371 lpl_t *lpl_root; 2372 cpupart_t *cp; 2373 2374 if (levels != 2) 2375 return (0); 2376 2377 /* called w/ cpus paused - grab no locks! */ 2378 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2379 !lgrp_initialized); 2380 2381 cp = cp_list_head; 2382 do { 2383 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2384 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2385 2386 for (i = 0; i <= lgrp_alloc_max; i++) { 2387 lgrp_cur = lgrp_table[i]; 2388 lpl_cur = &cp->cp_lgrploads[i]; 2389 2390 if ((lgrp_cur == lgrp_root) || 2391 (!LGRP_EXISTS(lgrp_cur) && 2392 (lpl_cur->lpl_ncpu == 0))) 2393 continue; 2394 2395 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2396 /* 2397 * this should be a deleted intermediate, so 2398 * clear it 2399 */ 2400 lpl_clear(lpl_cur); 2401 } else if ((lpl_cur->lpl_nrset == 1) && 2402 (lpl_cur->lpl_rset[0] == lpl_cur) && 2403 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2404 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2405 /* 2406 * this is a leaf whose parent was deleted, or 2407 * whose parent had their lgrp deleted. (And 2408 * whose parent will soon be deleted). Point 2409 * this guy back to the root lpl. 2410 */ 2411 lpl_cur->lpl_parent = lpl_root; 2412 lpl_rset_add(lpl_root, lpl_cur); 2413 } 2414 2415 } 2416 2417 /* 2418 * Now that we're done, make sure the count on the root lpl is 2419 * correct, and update the hints of the children for the sake of 2420 * thoroughness 2421 */ 2422 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2423 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2424 } 2425 lpl_root->lpl_ncpu = sum; 2426 lpl_child_update(lpl_root, cp); 2427 2428 cp = cp->cp_next; 2429 } while (cp != cp_list_head); 2430 2431 return (levels); 2432 } 2433 2434 /* 2435 * Insert a lpl into the resource hierarchy and create any additional lpls that 2436 * are necessary to represent the varying states of locality for the cpu 2437 * resoruces newly added to the partition. 2438 * 2439 * This routine is clever enough that it can correctly add resources from the 2440 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2441 * those for which the lpl is a leaf as opposed to simply a named equally local 2442 * resource). 
The one special case that needs additional processing is when a 2443 * new intermediate lpl is introduced. Since the main loop only traverses 2444 * looking to add the leaf resource where it does not yet exist, additional work 2445 * is necessary to add other leaf resources that may need to exist in the newly 2446 * created intermediate. This is performed by the second inner loop, and is 2447 * only done when the check for more than one overlapping resource succeeds. 2448 */ 2449 2450 void 2451 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2452 { 2453 int i; 2454 int j; 2455 int hint; 2456 int rset_num_intersect; 2457 lgrp_t *lgrp_cur; 2458 lpl_t *lpl_cur; 2459 lpl_t *lpl_parent; 2460 lgrp_id_t parent_id; 2461 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2462 2463 for (i = 0; i <= lgrp_alloc_max; i++) { 2464 lgrp_cur = lgrp_table[i]; 2465 2466 /* 2467 * Don't insert if the lgrp isn't there, if the leaf isn't 2468 * contained within the current lgrp, or if the current lgrp has 2469 * no leaves in this partition 2470 */ 2471 2472 if (!LGRP_EXISTS(lgrp_cur) || 2473 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2474 lpl_leaf->lpl_lgrpid) || 2475 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2476 cpupart->cp_lgrpset)) 2477 continue; 2478 2479 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2480 if (lgrp_cur->lgrp_parent != NULL) { 2481 /* if lgrp has a parent, assign it properly */ 2482 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2483 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2484 } else { 2485 /* if not, make sure parent ptr gets set to null */ 2486 lpl_parent = NULL; 2487 } 2488 2489 if (lpl_cur == lpl_leaf) { 2490 /* 2491 * Almost all leaf state was initialized elsewhere. The 2492 * only thing left to do is to set the parent. 2493 */ 2494 lpl_cur->lpl_parent = lpl_parent; 2495 continue; 2496 } 2497 2498 /* 2499 * Initialize intermediate lpl 2500 * Save this lpl's hint though. Since we're changing this 2501 * lpl's resources, we need to update the hint in this lpl's 2502 * children, but the hint in this lpl is unaffected and 2503 * should be preserved. 2504 */ 2505 hint = lpl_cur->lpl_hint; 2506 2507 lpl_clear(lpl_cur); 2508 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2509 2510 lpl_cur->lpl_hint = hint; 2511 lpl_cur->lpl_parent = lpl_parent; 2512 2513 /* does new lpl need to be populated with other resources? */ 2514 rset_intersect = 2515 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2516 cpupart->cp_lgrpset); 2517 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2518 2519 if (rset_num_intersect > 1) { 2520 /* 2521 * If so, figure out what lpls have resources that 2522 * intersect this one, and add them. 2523 */ 2524 for (j = 0; j <= lgrp_alloc_max; j++) { 2525 lgrp_t *lgrp_cand; /* candidate lgrp */ 2526 lpl_t *lpl_cand; /* candidate lpl */ 2527 2528 lgrp_cand = lgrp_table[j]; 2529 if (!LGRP_EXISTS(lgrp_cand) || 2530 !klgrpset_ismember(rset_intersect, 2531 lgrp_cand->lgrp_id)) 2532 continue; 2533 lpl_cand = 2534 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2535 lpl_rset_add(lpl_cur, lpl_cand); 2536 } 2537 } 2538 /* 2539 * This lpl's rset has changed. Update the hint in it's 2540 * children. 2541 */ 2542 lpl_child_update(lpl_cur, cpupart); 2543 } 2544 } 2545 2546 /* 2547 * remove a lpl from the hierarchy of resources, clearing its state when 2548 * finished. If the lpls at the intermediate levels of the hierarchy have no 2549 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2550 * delete them as well. 
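 *
 * Condensed sketch of the per-lgroup decision made in the loop below (this
 * is a restatement, not additional code):
 *
 *	lpl_rset_del(lpl_cur, lpl_leaf);
 *	if (lpl_cur->lpl_nrset == 0 ||
 *	    !klgrpset_intersects(lgrp_cur->lgrp_leaves, cpupart->cp_lgrpset))
 *		lpl_clear(lpl_cur);		(nothing left to represent)
 *	else
 *		lpl_child_update(lpl_cur, cpupart);	(its rset changed)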
2551 */ 2552 2553 void 2554 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2555 { 2556 int i; 2557 lgrp_t *lgrp_cur; 2558 lpl_t *lpl_cur; 2559 klgrpset_t leaf_intersect; /* intersection of leaves */ 2560 2561 for (i = 0; i <= lgrp_alloc_max; i++) { 2562 lgrp_cur = lgrp_table[i]; 2563 2564 /* 2565 * Don't attempt to remove from lgrps that aren't there, that 2566 * don't contain our leaf, or from the leaf itself. (We do that 2567 * later) 2568 */ 2569 2570 if (!LGRP_EXISTS(lgrp_cur)) 2571 continue; 2572 2573 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2574 2575 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2576 lpl_leaf->lpl_lgrpid) || 2577 (lpl_cur == lpl_leaf)) { 2578 continue; 2579 } 2580 2581 /* 2582 * This is a slightly sleazy simplification in that we have 2583 * already marked the cp_lgrpset as no longer containing the 2584 * leaf we've deleted. Any lpls that pass the above checks 2585 * based upon lgrp membership but not necessarily cpu-part 2586 * membership also get cleared by the checks below. Currently 2587 * this is harmless, as the lpls should be empty anyway. 2588 * 2589 * In particular, we want to preserve lpls that have additional 2590 * leaf resources, even though we don't yet have a processor 2591 * architecture that represents resources this way. 2592 */ 2593 2594 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2595 cpupart->cp_lgrpset); 2596 2597 lpl_rset_del(lpl_cur, lpl_leaf); 2598 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2599 lpl_clear(lpl_cur); 2600 } else { 2601 /* 2602 * Update this lpl's children 2603 */ 2604 lpl_child_update(lpl_cur, cpupart); 2605 } 2606 } 2607 lpl_clear(lpl_leaf); 2608 } 2609 2610 /* 2611 * add a cpu to a partition in terms of lgrp load avg bookeeping 2612 * 2613 * The lpl (cpu partition load average information) is now arranged in a 2614 * hierarchical fashion whereby resources that are closest, ie. most local, to 2615 * the cpu in question are considered to be leaves in a tree of resources. 2616 * There are two general cases for cpu additon: 2617 * 2618 * 1. A lpl structure that contains resources already in the hierarchy tree. 2619 * In this case, all of the associated lpl relationships have been defined, and 2620 * all that is necessary is that we link the new cpu into the per-lpl list of 2621 * cpus, and increment the ncpu count of all places where this cpu resource will 2622 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2623 * pushing is accomplished by this routine. 2624 * 2625 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2626 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2627 * construct the hierarchy of state necessary to name it's more distant 2628 * resources, if they should exist. The leaf structure is initialized by this 2629 * routine, as is the cpu-partition state for the lgrp membership. This routine 2630 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2631 * and builds all of the "ancestoral" state necessary to identify resources at 2632 * differing levels of locality. 
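 *
 * Schematically (a restatement of the code below, not additional logic):
 *
 *	case 1 (leaf already populated):
 *		lpl_leaf->lpl_ncpu++ plus lpl_cpu_adjcnt(LPL_INCREMENT, cp)
 *		for every other lpl that shares this leaf's CPU resources;
 *		cp is linked into the lpl_leaf->lpl_cpus ring.
 *
 *	case 2 (first cpu of this leaf in the partition):
 *		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
 *		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
 *		lpl_leaf_insert(lpl_leaf, cpupart);
 *		cp becomes a ring of one in lpl_leaf->lpl_cpus.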
2633 */ 2634 void 2635 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2636 { 2637 cpupart_t *cpupart; 2638 lgrp_t *lgrp_leaf; 2639 lpl_t *lpl_leaf; 2640 2641 /* called sometimes w/ cpus paused - grab no locks */ 2642 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2643 2644 cpupart = cp->cpu_part; 2645 lgrp_leaf = lgrp_table[lgrpid]; 2646 2647 /* don't add non-existent lgrp */ 2648 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2649 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2650 cp->cpu_lpl = lpl_leaf; 2651 2652 /* only leaf lpls contain cpus */ 2653 2654 if (lpl_leaf->lpl_ncpu++ == 0) { 2655 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2656 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2657 lpl_leaf_insert(lpl_leaf, cpupart); 2658 } else { 2659 /* 2660 * the lpl should already exist in the parent, so just update 2661 * the count of available CPUs 2662 */ 2663 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2664 } 2665 2666 /* link cpu into list of cpus in lpl */ 2667 2668 if (lpl_leaf->lpl_cpus) { 2669 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2670 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2671 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2672 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2673 } else { 2674 /* 2675 * We increment ncpu immediately after we create a new leaf 2676 * lpl, so assert that ncpu == 1 for the case where we don't 2677 * have any cpu pointers yet. 2678 */ 2679 ASSERT(lpl_leaf->lpl_ncpu == 1); 2680 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2681 } 2682 2683 } 2684 2685 2686 /* 2687 * remove a cpu from a partition in terms of lgrp load avg bookkeeping 2688 * 2689 * The lpl (cpu partition load average information) is now arranged in a 2690 * hierarchical fashion whereby resources that are closest, ie. most local, to 2691 * the cpu in question are considered to be leaves in a tree of resources. 2692 * There are two removal cases in question: 2693 * 2694 * 1. Removal of the resource in the leaf leaves other resources remaining in 2695 * that leaf. (Another cpu still exists at this level of locality). In this 2696 * case, the count of available cpus is decremented in all associated lpls by 2697 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 2698 * from the per-cpu lpl list. 2699 * 2700 * 2. Removal of the resource results in the lpl containing no resources. (It's 2701 * empty.) In this case, all of what has occurred for the first step must take 2702 * place; however, additionally we must remove the lpl structure itself, prune 2703 * out any stranded lpls that do not directly name a leaf resource, and mark the 2704 * cpu partition in question as no longer containing resources from the lgrp of 2705 * the lpl that has been deleted. Cpu-partition changes are handled by this 2706 * method, but the lpl_leaf_remove function deals with the details of pruning 2707 * out the empty lpl and any of its orphaned direct ancestors.
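 *
 * Schematically (a restatement of the code below, not additional logic):
 *
 *	case 1 (other cpus remain in the leaf):
 *		--lpl->lpl_ncpu, cp is unlinked from the lpl_cpus ring, and
 *		lpl_cpu_adjcnt(LPL_DECREMENT, cp) drops the count in every
 *		other lpl that shares this leaf's CPU resources.
 *
 *	case 2 (leaf becomes empty):
 *		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
 *		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
 *
 * In both cases cp->cpu_lpl is cleared on the way out.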
2708 */ 2709 void 2710 lgrp_part_del_cpu(cpu_t *cp) 2711 { 2712 lpl_t *lpl; 2713 lpl_t *leaf_lpl; 2714 lgrp_t *lgrp_leaf; 2715 2716 /* called sometimes w/ cpus paused - grab no locks */ 2717 2718 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2719 2720 lpl = leaf_lpl = cp->cpu_lpl; 2721 lgrp_leaf = leaf_lpl->lpl_lgrp; 2722 2723 /* don't delete a leaf that isn't there */ 2724 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2725 2726 /* no double-deletes */ 2727 ASSERT(lpl->lpl_ncpu); 2728 if (--lpl->lpl_ncpu == 0) { 2729 /* 2730 * This was the last cpu in this lgroup for this partition, 2731 * clear its bit in the partition's lgroup bitmask 2732 */ 2733 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2734 2735 /* eliminate remaning lpl link pointers in cpu, lpl */ 2736 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2737 2738 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2739 } else { 2740 2741 /* unlink cpu from lists of cpus in lpl */ 2742 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2743 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2744 if (lpl->lpl_cpus == cp) { 2745 lpl->lpl_cpus = cp->cpu_next_lpl; 2746 } 2747 2748 /* 2749 * Update the cpu count in the lpls associated with parent 2750 * lgroups. 2751 */ 2752 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2753 2754 } 2755 /* clear cpu's lpl ptr when we're all done */ 2756 cp->cpu_lpl = NULL; 2757 } 2758 2759 /* 2760 * Recompute load average for the specified partition/lgrp fragment. 2761 * 2762 * We rely on the fact that this routine is called from the clock thread 2763 * at a point before the clock thread can block (i.e. before its first 2764 * lock request). Since the clock thread can not be preempted (since it 2765 * runs at highest priority), we know that cpu partitions can not change 2766 * (since doing so would require either the repartition requester or the 2767 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2768 * without grabbing cpu_lock. 2769 */ 2770 void 2771 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2772 { 2773 uint_t ncpu; 2774 int64_t old, new, f; 2775 2776 /* 2777 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2778 */ 2779 static short expval[] = { 2780 0, 3196, 1618, 1083, 2781 814, 652, 543, 466, 2782 408, 363, 326, 297, 2783 272, 251, 233, 218, 2784 204, 192, 181, 172, 2785 163, 155, 148, 142, 2786 136, 130, 125, 121, 2787 116, 112, 109, 105 2788 }; 2789 2790 /* ASSERT (called from clock level) */ 2791 2792 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2793 ((ncpu = lpl->lpl_ncpu) == 0)) { 2794 return; 2795 } 2796 2797 for (;;) { 2798 2799 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2800 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2801 else 2802 f = expval[ncpu]; 2803 2804 /* 2805 * Modify the load average atomically to avoid losing 2806 * anticipatory load updates (see lgrp_move_thread()). 2807 */ 2808 if (ageflag) { 2809 /* 2810 * We're supposed to both update and age the load. 2811 * This happens 10 times/sec. per cpu. We do a 2812 * little hoop-jumping to avoid integer overflow. 
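 *
 * Ignoring the truncation introduced by the shifts, the update below is
 * approximately (a sketch of the arithmetic, not additional code):
 *
 *	new = old - (old * f) / 2^16 + (nrcpus * f) / 2^7;
 *
 * i.e. the existing load decays by roughly a factor of (1 - f/65536)
 * each tick while the current run queue length (nrcpus) is mixed in, so
 * the value settles toward a level proportional to nrcpus.  Splitting
 * old into its upper and lower 16-bit halves (q and r below) before
 * multiplying by f is the hoop-jumping that keeps the intermediate
 * products from overflowing.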
2813 */ 2814 int64_t q, r; 2815 2816 do { 2817 old = new = lpl->lpl_loadavg; 2818 q = (old >> 16) << 7; 2819 r = (old & 0xffff) << 7; 2820 new += ((long long)(nrcpus - q) * f - 2821 ((r * f) >> 16)) >> 7; 2822 2823 /* 2824 * Check for overflow 2825 */ 2826 if (new > LGRP_LOADAVG_MAX) 2827 new = LGRP_LOADAVG_MAX; 2828 else if (new < 0) 2829 new = 0; 2830 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2831 new) != old); 2832 } else { 2833 /* 2834 * We're supposed to update the load, but not age it. 2835 * This option is used to update the load (which either 2836 * has already been aged in this 1/10 sec. interval or 2837 * soon will be) to account for a remotely executing 2838 * thread. 2839 */ 2840 do { 2841 old = new = lpl->lpl_loadavg; 2842 new += f; 2843 /* 2844 * Check for overflow 2845 * Underflow not possible here 2846 */ 2847 if (new < old) 2848 new = LGRP_LOADAVG_MAX; 2849 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2850 new) != old); 2851 } 2852 2853 /* 2854 * Do the same for this lpl's parent 2855 */ 2856 if ((lpl = lpl->lpl_parent) == NULL) 2857 break; 2858 ncpu = lpl->lpl_ncpu; 2859 } 2860 } 2861 2862 /* 2863 * Initialize lpl topology in the target based on topology currently present in 2864 * lpl_bootstrap. 2865 * 2866 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2867 * initialize cp_default list of lpls. Up to this point all topology operations 2868 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2869 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2870 * `target' points to the list of lpls in cp_default and `size' is the size of 2871 * this list. 2872 * 2873 * This function walks the lpl topology in lpl_bootstrap and does for things: 2874 * 2875 * 1) Copies all fields from lpl_bootstrap to the target. 2876 * 2877 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2878 * 2879 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2880 * instead of lpl_bootstrap. 2881 * 2882 * 4) Updates pointers in the resource list of the target to point to the lpls 2883 * in the target list instead of lpl_bootstrap. 2884 * 2885 * After lpl_topo_bootstrap() completes, target contains the same information 2886 * that would be present there if it were used during boot instead of 2887 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2888 * and it is bzeroed. 2889 */ 2890 void 2891 lpl_topo_bootstrap(lpl_t *target, int size) 2892 { 2893 lpl_t *lpl = lpl_bootstrap; 2894 lpl_t *target_lpl = target; 2895 int howmany; 2896 int id; 2897 int i; 2898 2899 /* 2900 * The only target that should be passed here is cp_default lpl list. 2901 */ 2902 ASSERT(target == cp_default.cp_lgrploads); 2903 ASSERT(size == cp_default.cp_nlgrploads); 2904 ASSERT(!lgrp_topo_initialized); 2905 ASSERT(ncpus == 1); 2906 2907 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2908 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2909 /* 2910 * Copy all fields from lpl. 2911 */ 2912 2913 *target_lpl = *lpl; 2914 2915 /* 2916 * Substitute CPU0 lpl pointer with one relative to target. 2917 */ 2918 if (lpl->lpl_cpus == CPU) { 2919 ASSERT(CPU->cpu_lpl == lpl); 2920 CPU->cpu_lpl = target_lpl; 2921 } 2922 2923 /* 2924 * Substitute parent information with parent relative to target. 
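 *
 * The relocation below is plain pointer rebasing (sketch; the same idea
 * is applied to the rset entries afterwards):
 *
 *	offset = (uintptr_t)lpl->lpl_parent - (uintptr_t)lpl_bootstrap;
 *	target_lpl->lpl_parent = (lpl_t *)((uintptr_t)target + offset);
 *
 * so an entry that pointed at the i'th bootstrap lpl ends up pointing at
 * the i'th lpl of the target list, preserving the topology's shape.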
2925 */ 2926 if (lpl->lpl_parent != NULL) 2927 target_lpl->lpl_parent = (lpl_t *) 2928 (((uintptr_t)lpl->lpl_parent - 2929 (uintptr_t)lpl_bootstrap) + 2930 (uintptr_t)target); 2931 2932 /* 2933 * Walk over resource set substituting pointers relative to 2934 * lpl_bootstrap to pointers relative to target. 2935 */ 2936 ASSERT(lpl->lpl_nrset <= 1); 2937 2938 for (id = 0; id < lpl->lpl_nrset; id++) { 2939 if (lpl->lpl_rset[id] != NULL) { 2940 target_lpl->lpl_rset[id] = 2941 (lpl_t *) 2942 (((uintptr_t)lpl->lpl_rset[id] - 2943 (uintptr_t)lpl_bootstrap) + 2944 (uintptr_t)target); 2945 } 2946 } 2947 } 2948 2949 /* 2950 * Topology information in lpl_bootstrap is no longer needed. 2951 */ 2952 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2953 } 2954 2955 /* 2956 * If the lowest load among the lgroups a process' threads are currently 2957 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2958 * expanding the process to a new lgroup. 2959 */ 2960 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2961 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2962 2963 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2964 ((lgrp_expand_proc_thresh) / (ncpu)) 2965 2966 /* 2967 * A process will be expanded to a new lgroup only if the difference between 2968 * the lowest load on the lgroups the process' thread's are currently spread 2969 * across and the lowest load on the other lgroups in the process' partition 2970 * is greater than lgrp_expand_proc_diff. 2971 */ 2972 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2973 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2974 2975 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2976 ((lgrp_expand_proc_diff) / (ncpu)) 2977 2978 /* 2979 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2980 * be present due to impreciseness of the load average decay algorithm. 2981 * 2982 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2983 * tolerance is scaled by the number of cpus in the lgroup just like 2984 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2985 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2986 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2987 */ 2988 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2989 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2990 ((lgrp_loadavg_tolerance) / ncpu) 2991 2992 /* 2993 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2994 * average is above this threshold 2995 */ 2996 uint32_t lgrp_load_thresh = UINT32_MAX; 2997 2998 /* 2999 * lgrp_choose() will try to skip any lgroups with less memory 3000 * than this free when choosing a home lgroup 3001 */ 3002 pgcnt_t lgrp_mem_free_thresh = 0; 3003 3004 /* 3005 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 3006 * one based on one of the following policies: 3007 * - Random selection 3008 * - Pseudo round robin placement 3009 * - Longest time since a thread was last placed 3010 */ 3011 #define LGRP_CHOOSE_RANDOM 1 3012 #define LGRP_CHOOSE_RR 2 3013 #define LGRP_CHOOSE_TIME 3 3014 3015 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 3016 3017 /* 3018 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 3019 * be bound to a CPU or processor set. 3020 * 3021 * Arguments: 3022 * t The thread 3023 * cpupart The partition the thread belongs to. 
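 *
 * Usage sketch (the caller below is hypothetical; the real callers are
 * the fork()/LWP creation and rehoming paths, which already hold the
 * required locks and immediately home the thread to the result):
 *
 *	lpl_t	*home;
 *
 *	thread_lock(t);
 *	home = lgrp_choose(t, t->t_cpupart);
 *	lgrp_move_thread(t, home, 1);
 *	thread_unlock(t);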
3024 * 3025 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3026 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3027 * partitions changing out from under us and assumes that given thread is 3028 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3029 * disabled, so don't grab any locks because we should never block under 3030 * those conditions. 3031 */ 3032 lpl_t * 3033 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 3034 { 3035 lgrp_load_t bestload, bestrload; 3036 int lgrpid_offset, lgrp_count; 3037 lgrp_id_t lgrpid, lgrpid_start; 3038 lpl_t *lpl, *bestlpl, *bestrlpl; 3039 klgrpset_t lgrpset; 3040 proc_t *p; 3041 3042 ASSERT(t != NULL); 3043 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3044 THREAD_LOCK_HELD(t)); 3045 ASSERT(cpupart != NULL); 3046 3047 p = t->t_procp; 3048 3049 /* A process should always be in an active partition */ 3050 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3051 3052 bestlpl = bestrlpl = NULL; 3053 bestload = bestrload = LGRP_LOADAVG_MAX; 3054 lgrpset = cpupart->cp_lgrpset; 3055 3056 switch (lgrp_choose_policy) { 3057 case LGRP_CHOOSE_RR: 3058 lgrpid = cpupart->cp_lgrp_hint; 3059 do { 3060 if (++lgrpid > lgrp_alloc_max) 3061 lgrpid = 0; 3062 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3063 3064 break; 3065 default: 3066 case LGRP_CHOOSE_TIME: 3067 case LGRP_CHOOSE_RANDOM: 3068 klgrpset_nlgrps(lgrpset, lgrp_count); 3069 lgrpid_offset = 3070 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3071 for (lgrpid = 0; ; lgrpid++) { 3072 if (klgrpset_ismember(lgrpset, lgrpid)) { 3073 if (--lgrpid_offset == 0) 3074 break; 3075 } 3076 } 3077 break; 3078 } 3079 3080 lgrpid_start = lgrpid; 3081 3082 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3083 lgrp_id_t, cpupart->cp_lgrp_hint); 3084 3085 /* 3086 * Use lgroup affinities (if any) to choose best lgroup 3087 * 3088 * NOTE: Assumes that thread is protected from going away and its 3089 * lgroup affinities won't change (ie. p_lock, or 3090 * thread_lock() being held and/or CPUs paused) 3091 */ 3092 if (t->t_lgrp_affinity) { 3093 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 3094 if (lpl != NULL) 3095 return (lpl); 3096 } 3097 3098 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3099 3100 do { 3101 pgcnt_t npgs; 3102 3103 /* 3104 * Skip any lgroups outside of thread's pset 3105 */ 3106 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3107 if (++lgrpid > lgrp_alloc_max) 3108 lgrpid = 0; /* wrap the search */ 3109 continue; 3110 } 3111 3112 /* 3113 * Skip any non-leaf lgroups 3114 */ 3115 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3116 continue; 3117 3118 /* 3119 * Skip any lgroups without enough free memory 3120 * (when threshold set to nonzero positive value) 3121 */ 3122 if (lgrp_mem_free_thresh > 0) { 3123 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3124 if (npgs < lgrp_mem_free_thresh) { 3125 if (++lgrpid > lgrp_alloc_max) 3126 lgrpid = 0; /* wrap the search */ 3127 continue; 3128 } 3129 } 3130 3131 lpl = &cpupart->cp_lgrploads[lgrpid]; 3132 if (klgrpset_isempty(p->p_lgrpset) || 3133 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3134 /* 3135 * Either this is a new process or the process already 3136 * has threads on this lgrp, so this is a preferred 3137 * lgroup for the thread. 
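 *
 * (bestlpl/bestload track the best candidate among lgroups this process
 * already spans; bestrlpl/bestrload, maintained in the else branch below,
 * track the best "remote" candidate that would spread the process out.
 * After the scan, a remote leaf is chosen only when no preferred leaf was
 * usable or the load gap is large: with the default tunables and 1-CPU
 * leaves, bestload must exceed 62250 and the remote leaf must be more
 * than 60000 lighter.)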
3138 */ 3139 if (bestlpl == NULL || 3140 lpl_pick(lpl, bestlpl)) { 3141 bestload = lpl->lpl_loadavg; 3142 bestlpl = lpl; 3143 } 3144 } else { 3145 /* 3146 * The process doesn't have any threads on this lgrp, 3147 * but we're willing to consider this lgrp if the load 3148 * difference is big enough to justify splitting up 3149 * the process' threads. 3150 */ 3151 if (bestrlpl == NULL || 3152 lpl_pick(lpl, bestrlpl)) { 3153 bestrload = lpl->lpl_loadavg; 3154 bestrlpl = lpl; 3155 } 3156 } 3157 if (++lgrpid > lgrp_alloc_max) 3158 lgrpid = 0; /* wrap the search */ 3159 } while (lgrpid != lgrpid_start); 3160 3161 /* 3162 * Return root lgroup if threshold isn't set to maximum value and 3163 * lowest lgroup load average more than a certain threshold 3164 */ 3165 if (lgrp_load_thresh != UINT32_MAX && 3166 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3167 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3168 3169 /* 3170 * If all the lgroups over which the thread's process is spread are 3171 * heavily loaded, or otherwise undesirable, we'll consider placing 3172 * the thread on one of the other leaf lgroups in the thread's 3173 * partition. 3174 */ 3175 if ((bestlpl == NULL) || 3176 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3177 (bestrload < bestload) && /* paranoid about wraparound */ 3178 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3179 bestload))) { 3180 bestlpl = bestrlpl; 3181 } 3182 3183 if (bestlpl == NULL) { 3184 /* 3185 * No lgroup looked particularly good, but we still 3186 * have to pick something. Go with the randomly selected 3187 * legal lgroup we started with above. 3188 */ 3189 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3190 } 3191 3192 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3193 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3194 3195 ASSERT(bestlpl->lpl_ncpu > 0); 3196 return (bestlpl); 3197 } 3198 3199 /* 3200 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3201 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3202 */ 3203 static int 3204 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3205 { 3206 lgrp_load_t l1, l2; 3207 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3208 3209 l1 = lpl1->lpl_loadavg; 3210 l2 = lpl2->lpl_loadavg; 3211 3212 if ((l1 + tolerance < l2) && (l1 < l2)) { 3213 /* lpl1 is significantly less loaded than lpl2 */ 3214 return (1); 3215 } 3216 3217 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3218 l1 + tolerance >= l2 && l1 < l2 && 3219 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3220 /* 3221 * lpl1's load is within the tolerance of lpl2. We're 3222 * willing to consider it be to better however if 3223 * it has been longer since we last homed a thread there 3224 */ 3225 return (1); 3226 } 3227 3228 return (0); 3229 } 3230 3231 /* 3232 * An LWP is expected to be assigned to an lgroup for at least this long 3233 * for its anticipatory load to be justified. NOTE that this value should 3234 * not be set extremely huge (say, larger than 100 years), to avoid problems 3235 * with overflow in the calculation that uses it. 3236 */ 3237 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3238 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3239 3240 /* 3241 * Routine to change a thread's lgroup affiliation. This routine updates 3242 * the thread's kthread_t struct and its process' proc_t struct to note the 3243 * thread's new lgroup affiliation, and its lgroup affinities. 
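 *
 * Illustrative sketch of the anticipatory load bookkeeping described
 * below (not additional code): when a thread is homed to a new lgroup,
 *
 *	lpl->lpl_loadavg += LGRP_LOADAVG_MAX_EFFECT(lpl->lpl_ncpu);
 *
 * is applied to every lpl from the new leaf up to the root, and the same
 * per-level amount is backed out of the old chain if the thread moves
 * again before that load has had roughly lgrp_min_nsec to age away.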
3244 * 3245 * Note that this is the only routine that modifies a thread's t_lpl field, 3246 * and that adds in or removes anticipatory load. 3247 * 3248 * If the thread is exiting, newlpl is NULL. 3249 * 3250 * Locking: 3251 * The following lock must be held on entry: 3252 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3253 * doesn't get removed from t's partition 3254 * 3255 * This routine is not allowed to grab any locks, since it may be called 3256 * with cpus paused (such as from cpu_offline). 3257 */ 3258 void 3259 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3260 { 3261 proc_t *p; 3262 lpl_t *lpl, *oldlpl; 3263 lgrp_id_t oldid; 3264 kthread_t *tp; 3265 uint_t ncpu; 3266 lgrp_load_t old, new; 3267 3268 ASSERT(t); 3269 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3270 THREAD_LOCK_HELD(t)); 3271 3272 /* 3273 * If not changing lpls, just return 3274 */ 3275 if ((oldlpl = t->t_lpl) == newlpl) 3276 return; 3277 3278 /* 3279 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3280 * associated with process 0 rather than with its original process). 3281 */ 3282 if (t->t_proc_flag & TP_LWPEXIT) { 3283 if (newlpl != NULL) { 3284 t->t_lpl = newlpl; 3285 } 3286 return; 3287 } 3288 3289 p = ttoproc(t); 3290 3291 /* 3292 * If the thread had a previous lgroup, update its process' p_lgrpset 3293 * to account for it being moved from its old lgroup. 3294 */ 3295 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3296 (p->p_tlist != NULL)) { 3297 oldid = oldlpl->lpl_lgrpid; 3298 3299 if (newlpl != NULL) 3300 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3301 3302 if ((do_lgrpset_delete) && 3303 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3304 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3305 /* 3306 * Check if a thread other than the thread 3307 * that's moving is assigned to the same 3308 * lgroup as the thread that's moving. Note 3309 * that we have to compare lgroup IDs, rather 3310 * than simply comparing t_lpl's, since the 3311 * threads may belong to different partitions 3312 * but be assigned to the same lgroup. 3313 */ 3314 ASSERT(tp->t_lpl != NULL); 3315 3316 if ((tp != t) && 3317 (tp->t_lpl->lpl_lgrpid == oldid)) { 3318 /* 3319 * Another thread is assigned to the 3320 * same lgroup as the thread that's 3321 * moving, p_lgrpset doesn't change. 3322 */ 3323 break; 3324 } else if (tp == p->p_tlist) { 3325 /* 3326 * No other thread is assigned to the 3327 * same lgroup as the exiting thread, 3328 * clear the lgroup's bit in p_lgrpset. 3329 */ 3330 klgrpset_del(p->p_lgrpset, oldid); 3331 break; 3332 } 3333 } 3334 } 3335 3336 /* 3337 * If this thread was assigned to its old lgroup for such a 3338 * short amount of time that the anticipatory load that was 3339 * added on its behalf has aged very little, remove that 3340 * anticipatory load. 3341 */ 3342 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3343 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3344 lpl = oldlpl; 3345 for (;;) { 3346 do { 3347 old = new = lpl->lpl_loadavg; 3348 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3349 if (new > old) { 3350 /* 3351 * this can happen if the load 3352 * average was aged since we 3353 * added in the anticipatory 3354 * load 3355 */ 3356 new = 0; 3357 } 3358 } while (cas32( 3359 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3360 new) != old); 3361 3362 lpl = lpl->lpl_parent; 3363 if (lpl == NULL) 3364 break; 3365 3366 ncpu = lpl->lpl_ncpu; 3367 ASSERT(ncpu > 0); 3368 } 3369 } 3370 } 3371 /* 3372 * If the thread has a new lgroup (i.e. 
it's not exiting), update its 3373 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3374 * to its new lgroup to account for its move to its new lgroup. 3375 */ 3376 if (newlpl != NULL) { 3377 /* 3378 * This thread is moving to a new lgroup 3379 */ 3380 t->t_lpl = newlpl; 3381 3382 /* 3383 * Reflect move in load average of new lgroup 3384 * unless it is root lgroup 3385 */ 3386 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3387 return; 3388 3389 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3390 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3391 } 3392 3393 /* 3394 * It'll take some time for the load on the new lgroup 3395 * to reflect this thread's placement on it. We'd 3396 * like not, however, to have all threads between now 3397 * and then also piling on to this lgroup. To avoid 3398 * this pileup, we anticipate the load this thread 3399 * will generate on its new lgroup. The goal is to 3400 * make the lgroup's load appear as though the thread 3401 * had been there all along. We're very conservative 3402 * in calculating this anticipatory load, we assume 3403 * the worst case case (100% CPU-bound thread). This 3404 * may be modified in the future to be more accurate. 3405 */ 3406 lpl = newlpl; 3407 for (;;) { 3408 ncpu = lpl->lpl_ncpu; 3409 ASSERT(ncpu > 0); 3410 do { 3411 old = new = lpl->lpl_loadavg; 3412 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3413 /* 3414 * Check for overflow 3415 * Underflow not possible here 3416 */ 3417 if (new < old) 3418 new = UINT32_MAX; 3419 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3420 new) != old); 3421 3422 lpl = lpl->lpl_parent; 3423 if (lpl == NULL) 3424 break; 3425 } 3426 t->t_anttime = gethrtime(); 3427 } 3428 } 3429 3430 /* 3431 * Return lgroup memory allocation policy given advice from madvise(3C) 3432 */ 3433 lgrp_mem_policy_t 3434 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3435 { 3436 switch (advice) { 3437 case MADV_ACCESS_LWP: 3438 return (LGRP_MEM_POLICY_NEXT); 3439 case MADV_ACCESS_MANY: 3440 return (LGRP_MEM_POLICY_RANDOM); 3441 default: 3442 return (lgrp_mem_policy_default(size, type)); 3443 } 3444 } 3445 3446 /* 3447 * Figure out default policy 3448 */ 3449 lgrp_mem_policy_t 3450 lgrp_mem_policy_default(size_t size, int type) 3451 { 3452 cpupart_t *cp; 3453 lgrp_mem_policy_t policy; 3454 size_t pset_mem_size; 3455 3456 /* 3457 * Randomly allocate memory across lgroups for shared memory 3458 * beyond a certain threshold 3459 */ 3460 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3461 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3462 /* 3463 * Get total memory size of current thread's pset 3464 */ 3465 kpreempt_disable(); 3466 cp = curthread->t_cpupart; 3467 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3468 kpreempt_enable(); 3469 3470 /* 3471 * Choose policy to randomly allocate memory across 3472 * lgroups in pset if it will fit and is not default 3473 * partition. Otherwise, allocate memory randomly 3474 * across machine. 3475 */ 3476 if (lgrp_mem_pset_aware && size < pset_mem_size) 3477 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3478 else 3479 policy = LGRP_MEM_POLICY_RANDOM; 3480 } else 3481 /* 3482 * Apply default policy for private memory and 3483 * shared memory under the respective random 3484 * threshold. 
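 *
 * In summary, the decision is (illustrative restatement):
 *
 *	MAP_SHARED and size > lgrp_shm_random_thresh, or
 *	private (not MAP_SHARED) and size > lgrp_privm_random_thresh:
 *		LGRP_MEM_POLICY_RANDOM_PSET if lgrp_mem_pset_aware and the
 *		allocation fits in the pset, else LGRP_MEM_POLICY_RANDOM
 *	everything else:
 *		lgrp_mem_default_policy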
3485 */ 3486 policy = lgrp_mem_default_policy; 3487 3488 return (policy); 3489 } 3490 3491 /* 3492 * Get memory allocation policy for this segment 3493 */ 3494 lgrp_mem_policy_info_t * 3495 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3496 { 3497 lgrp_mem_policy_info_t *policy_info; 3498 extern struct seg_ops segspt_ops; 3499 extern struct seg_ops segspt_shmops; 3500 3501 /* 3502 * This is for binary compatibility to protect against third party 3503 * segment drivers which haven't recompiled to allow for 3504 * SEGOP_GETPOLICY() 3505 */ 3506 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3507 seg->s_ops != &segspt_shmops) 3508 return (NULL); 3509 3510 policy_info = NULL; 3511 if (seg->s_ops->getpolicy != NULL) 3512 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3513 3514 return (policy_info); 3515 } 3516 3517 /* 3518 * Set policy for allocating private memory given desired policy, policy info, 3519 * size in bytes of memory that policy is being applied. 3520 * Return 0 if policy wasn't set already and 1 if policy was set already 3521 */ 3522 int 3523 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3524 lgrp_mem_policy_info_t *policy_info, size_t size) 3525 { 3526 3527 ASSERT(policy_info != NULL); 3528 3529 if (policy == LGRP_MEM_POLICY_DEFAULT) 3530 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3531 3532 /* 3533 * Policy set already? 3534 */ 3535 if (policy == policy_info->mem_policy) 3536 return (1); 3537 3538 /* 3539 * Set policy 3540 */ 3541 policy_info->mem_policy = policy; 3542 policy_info->mem_reserved = 0; 3543 3544 return (0); 3545 } 3546 3547 3548 /* 3549 * Get shared memory allocation policy with given tree and offset 3550 */ 3551 lgrp_mem_policy_info_t * 3552 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3553 u_offset_t vn_off) 3554 { 3555 u_offset_t off; 3556 lgrp_mem_policy_info_t *policy_info; 3557 lgrp_shm_policy_seg_t *policy_seg; 3558 lgrp_shm_locality_t *shm_locality; 3559 avl_tree_t *tree; 3560 avl_index_t where; 3561 3562 /* 3563 * Get policy segment tree from anon_map or vnode and use specified 3564 * anon index or vnode offset as offset 3565 * 3566 * Assume that no lock needs to be held on anon_map or vnode, since 3567 * they should be protected by their reference count which must be 3568 * nonzero for an existing segment 3569 */ 3570 if (amp) { 3571 ASSERT(amp->refcnt != 0); 3572 shm_locality = amp->locality; 3573 if (shm_locality == NULL) 3574 return (NULL); 3575 tree = shm_locality->loc_tree; 3576 off = ptob(anon_index); 3577 } else if (vp) { 3578 shm_locality = vp->v_locality; 3579 if (shm_locality == NULL) 3580 return (NULL); 3581 ASSERT(shm_locality->loc_count != 0); 3582 tree = shm_locality->loc_tree; 3583 off = vn_off; 3584 } 3585 3586 if (tree == NULL) 3587 return (NULL); 3588 3589 /* 3590 * Lookup policy segment for offset into shared object and return 3591 * policy info 3592 */ 3593 rw_enter(&shm_locality->loc_lock, RW_READER); 3594 policy_info = NULL; 3595 policy_seg = avl_find(tree, &off, &where); 3596 if (policy_seg) 3597 policy_info = &policy_seg->shm_policy; 3598 rw_exit(&shm_locality->loc_lock); 3599 3600 return (policy_info); 3601 } 3602 3603 /* 3604 * Default memory allocation policy for kernel segmap pages 3605 */ 3606 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3607 3608 /* 3609 * Return lgroup to use for allocating memory 3610 * given the segment and address 3611 * 3612 * There isn't any mutual exclusion that exists between calls 3613 * to this routine and DR, so this 
routine and whomever calls it 3614 * should be mindful of the possibility that the lgrp returned 3615 * may be deleted. If this happens, dereferences of the lgrp 3616 * pointer will still be safe, but the resources in the lgrp will 3617 * be gone, and LGRP_EXISTS() will no longer be true. 3618 */ 3619 lgrp_t * 3620 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3621 { 3622 int i; 3623 lgrp_t *lgrp; 3624 klgrpset_t lgrpset; 3625 int lgrps_spanned; 3626 unsigned long off; 3627 lgrp_mem_policy_t policy; 3628 lgrp_mem_policy_info_t *policy_info; 3629 ushort_t random; 3630 int stat = 0; 3631 extern struct seg *segkmap; 3632 3633 /* 3634 * Just return null if the lgrp framework hasn't finished 3635 * initializing or if this is a UMA machine. 3636 */ 3637 if (nlgrps == 1 || !lgrp_initialized) 3638 return (lgrp_root); 3639 3640 /* 3641 * Get memory allocation policy for this segment 3642 */ 3643 policy = lgrp_mem_default_policy; 3644 if (seg != NULL) { 3645 if (seg->s_as == &kas) { 3646 if (seg == segkmap) 3647 policy = lgrp_segmap_default_policy; 3648 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3649 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3650 policy = LGRP_MEM_POLICY_RANDOM; 3651 } else { 3652 policy_info = lgrp_mem_policy_get(seg, vaddr); 3653 if (policy_info != NULL) 3654 policy = policy_info->mem_policy; 3655 } 3656 } 3657 lgrpset = 0; 3658 3659 /* 3660 * Initialize lgroup to home by default 3661 */ 3662 lgrp = lgrp_home_lgrp(); 3663 3664 /* 3665 * When homing threads on root lgrp, override default memory 3666 * allocation policies with root lgroup memory allocation policy 3667 */ 3668 if (lgrp == lgrp_root) 3669 policy = lgrp_mem_policy_root; 3670 3671 /* 3672 * Implement policy 3673 */ 3674 switch (policy) { 3675 case LGRP_MEM_POLICY_NEXT_CPU: 3676 3677 /* 3678 * Return lgroup of current CPU which faulted on memory 3679 * If the CPU isn't currently in an lgrp, then opt to 3680 * allocate from the root. 3681 * 3682 * Kernel preemption needs to be disabled here to prevent 3683 * the current CPU from going away before lgrp is found. 3684 */ 3685 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3686 lgrp = lgrp_root; 3687 } else { 3688 kpreempt_disable(); 3689 lgrp = lgrp_cpu_to_lgrp(CPU); 3690 kpreempt_enable(); 3691 } 3692 break; 3693 3694 case LGRP_MEM_POLICY_NEXT: 3695 case LGRP_MEM_POLICY_DEFAULT: 3696 default: 3697 3698 /* 3699 * Just return current thread's home lgroup 3700 * for default policy (next touch) 3701 * If the thread is homed to the root, 3702 * then the default policy is random across lgroups. 3703 * Fallthrough to the random case. 3704 */ 3705 if (lgrp != lgrp_root) { 3706 if (policy == LGRP_MEM_POLICY_NEXT) 3707 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3708 else 3709 lgrp_stat_add(lgrp->lgrp_id, 3710 LGRP_NUM_DEFAULT, 1); 3711 break; 3712 } 3713 /* LINTED fallthrough on case statement */ 3714 case LGRP_MEM_POLICY_RANDOM: 3715 3716 /* 3717 * Return a random leaf lgroup with memory 3718 */ 3719 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3720 /* 3721 * Count how many lgroups are spanned 3722 */ 3723 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3724 3725 /* 3726 * There may be no memnodes in the root lgroup during DR copy 3727 * rename on a system with only two boards (memnodes) 3728 * configured. In this case just return the root lgrp. 
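 *
 * The random pick below amounts to (sketch): derive a small pseudo-random
 * number from gethrtime(), take it modulo lgrps_spanned, and return the
 * lgroup at that offset among the lgroups with memory, scanning ids
 * 0..lgrp_alloc_max.  The high resolution clock is used as a cheap
 * pseudo-random source rather than a real random number generator.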
3729 */ 3730 if (lgrps_spanned == 0) { 3731 lgrp = lgrp_root; 3732 break; 3733 } 3734 3735 /* 3736 * Pick a random offset within lgroups spanned 3737 * and return lgroup at that offset 3738 */ 3739 random = (ushort_t)gethrtime() >> 4; 3740 off = random % lgrps_spanned; 3741 ASSERT(off <= lgrp_alloc_max); 3742 3743 for (i = 0; i <= lgrp_alloc_max; i++) { 3744 if (!klgrpset_ismember(lgrpset, i)) 3745 continue; 3746 if (off) 3747 off--; 3748 else { 3749 lgrp = lgrp_table[i]; 3750 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3751 1); 3752 break; 3753 } 3754 } 3755 break; 3756 3757 case LGRP_MEM_POLICY_RANDOM_PROC: 3758 3759 /* 3760 * Grab copy of bitmask of lgroups spanned by 3761 * this process 3762 */ 3763 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3764 stat = LGRP_NUM_RANDOM_PROC; 3765 3766 /* LINTED fallthrough on case statement */ 3767 case LGRP_MEM_POLICY_RANDOM_PSET: 3768 3769 if (!stat) 3770 stat = LGRP_NUM_RANDOM_PSET; 3771 3772 if (klgrpset_isempty(lgrpset)) { 3773 /* 3774 * Grab copy of bitmask of lgroups spanned by 3775 * this processor set 3776 */ 3777 kpreempt_disable(); 3778 klgrpset_copy(lgrpset, 3779 curthread->t_cpupart->cp_lgrpset); 3780 kpreempt_enable(); 3781 } 3782 3783 /* 3784 * Count how many lgroups are spanned 3785 */ 3786 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3787 ASSERT(lgrps_spanned <= nlgrps); 3788 3789 /* 3790 * Probably lgrps_spanned should be always non-zero, but to be 3791 * on the safe side we return lgrp_root if it is empty. 3792 */ 3793 if (lgrps_spanned == 0) { 3794 lgrp = lgrp_root; 3795 break; 3796 } 3797 3798 /* 3799 * Pick a random offset within lgroups spanned 3800 * and return lgroup at that offset 3801 */ 3802 random = (ushort_t)gethrtime() >> 4; 3803 off = random % lgrps_spanned; 3804 ASSERT(off <= lgrp_alloc_max); 3805 3806 for (i = 0; i <= lgrp_alloc_max; i++) { 3807 if (!klgrpset_ismember(lgrpset, i)) 3808 continue; 3809 if (off) 3810 off--; 3811 else { 3812 lgrp = lgrp_table[i]; 3813 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3814 1); 3815 break; 3816 } 3817 } 3818 break; 3819 3820 case LGRP_MEM_POLICY_ROUNDROBIN: 3821 3822 /* 3823 * Use offset within segment to determine 3824 * offset from home lgroup to choose for 3825 * next lgroup to allocate memory from 3826 */ 3827 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3828 (lgrp_alloc_max + 1); 3829 3830 kpreempt_disable(); 3831 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3832 i = lgrp->lgrp_id; 3833 kpreempt_enable(); 3834 3835 while (off > 0) { 3836 i = (i + 1) % (lgrp_alloc_max + 1); 3837 lgrp = lgrp_table[i]; 3838 if (klgrpset_ismember(lgrpset, i)) 3839 off--; 3840 } 3841 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3842 3843 break; 3844 } 3845 3846 ASSERT(lgrp != NULL); 3847 return (lgrp); 3848 } 3849 3850 /* 3851 * Return the number of pages in an lgroup 3852 * 3853 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3854 * could cause tests that rely on the numat driver to fail.... 
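 *
 * Example uses from this file: lgrp_choose() screens out memory-poor
 * lgroups with
 *
 *	npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
 *	if (npgs < lgrp_mem_free_thresh)
 *		(the lgroup is skipped)
 *
 * and the kstat code reports LGRP_MEM_SIZE_INSTALL/AVAIL/FREE per lgroup.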
3855 */ 3856 pgcnt_t 3857 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3858 { 3859 lgrp_t *lgrp; 3860 3861 lgrp = lgrp_table[lgrpid]; 3862 if (!LGRP_EXISTS(lgrp) || 3863 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3864 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3865 return (0); 3866 3867 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3868 } 3869 3870 /* 3871 * Initialize lgroup shared memory allocation policy support 3872 */ 3873 void 3874 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3875 { 3876 lgrp_shm_locality_t *shm_locality; 3877 3878 /* 3879 * Initialize locality field in anon_map 3880 * Don't need any locks because this is called when anon_map is 3881 * allocated, but not used anywhere yet. 3882 */ 3883 if (amp) { 3884 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3885 if (amp->locality == NULL) { 3886 /* 3887 * Allocate and initialize shared memory locality info 3888 * and set anon_map locality pointer to it 3889 * Drop lock across kmem_alloc(KM_SLEEP) 3890 */ 3891 ANON_LOCK_EXIT(&->a_rwlock); 3892 shm_locality = kmem_alloc(sizeof (*shm_locality), 3893 KM_SLEEP); 3894 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3895 NULL); 3896 shm_locality->loc_count = 1; /* not used for amp */ 3897 shm_locality->loc_tree = NULL; 3898 3899 /* 3900 * Reacquire lock and check to see whether anyone beat 3901 * us to initializing the locality info 3902 */ 3903 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3904 if (amp->locality != NULL) { 3905 rw_destroy(&shm_locality->loc_lock); 3906 kmem_free(shm_locality, 3907 sizeof (*shm_locality)); 3908 } else 3909 amp->locality = shm_locality; 3910 } 3911 ANON_LOCK_EXIT(&->a_rwlock); 3912 return; 3913 } 3914 3915 /* 3916 * Allocate shared vnode policy info if vnode is not locality aware yet 3917 */ 3918 mutex_enter(&vp->v_lock); 3919 if ((vp->v_flag & V_LOCALITY) == 0) { 3920 /* 3921 * Allocate and initialize shared memory locality info 3922 */ 3923 mutex_exit(&vp->v_lock); 3924 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3925 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3926 shm_locality->loc_count = 1; 3927 shm_locality->loc_tree = NULL; 3928 3929 /* 3930 * Point vnode locality field at shared vnode policy info 3931 * and set locality aware flag in vnode 3932 */ 3933 mutex_enter(&vp->v_lock); 3934 if ((vp->v_flag & V_LOCALITY) == 0) { 3935 vp->v_locality = shm_locality; 3936 vp->v_flag |= V_LOCALITY; 3937 } else { 3938 /* 3939 * Lost race so free locality info and increment count. 
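 *
 * (This is the usual drop-the-lock-to-allocate pattern: v_lock should not
 * be held across a kmem_alloc(KM_SLEEP) that may block, so after the lock
 * is reacquired the V_LOCALITY flag must be rechecked and, as here, the
 * loser of the race frees its allocation and shares the winner's.)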
3940 */ 3941 rw_destroy(&shm_locality->loc_lock); 3942 kmem_free(shm_locality, sizeof (*shm_locality)); 3943 shm_locality = vp->v_locality; 3944 shm_locality->loc_count++; 3945 } 3946 mutex_exit(&vp->v_lock); 3947 3948 return; 3949 } 3950 3951 /* 3952 * Increment reference count of number of segments mapping this vnode 3953 * shared 3954 */ 3955 shm_locality = vp->v_locality; 3956 shm_locality->loc_count++; 3957 mutex_exit(&vp->v_lock); 3958 } 3959 3960 /* 3961 * Destroy the given shared memory policy segment tree 3962 */ 3963 void 3964 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3965 { 3966 lgrp_shm_policy_seg_t *cur; 3967 lgrp_shm_policy_seg_t *next; 3968 3969 if (tree == NULL) 3970 return; 3971 3972 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3973 while (cur != NULL) { 3974 next = AVL_NEXT(tree, cur); 3975 avl_remove(tree, cur); 3976 kmem_free(cur, sizeof (*cur)); 3977 cur = next; 3978 } 3979 kmem_free(tree, sizeof (avl_tree_t)); 3980 } 3981 3982 /* 3983 * Uninitialize lgroup shared memory allocation policy support 3984 */ 3985 void 3986 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3987 { 3988 lgrp_shm_locality_t *shm_locality; 3989 3990 /* 3991 * For anon_map, deallocate shared memory policy tree and 3992 * zero locality field 3993 * Don't need any locks because anon_map is being freed 3994 */ 3995 if (amp) { 3996 if (amp->locality == NULL) 3997 return; 3998 shm_locality = amp->locality; 3999 shm_locality->loc_count = 0; /* not really used for amp */ 4000 rw_destroy(&shm_locality->loc_lock); 4001 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4002 kmem_free(shm_locality, sizeof (*shm_locality)); 4003 amp->locality = 0; 4004 return; 4005 } 4006 4007 /* 4008 * For vnode, decrement reference count of segments mapping this vnode 4009 * shared and delete locality info if reference count drops to 0 4010 */ 4011 mutex_enter(&vp->v_lock); 4012 shm_locality = vp->v_locality; 4013 shm_locality->loc_count--; 4014 4015 if (shm_locality->loc_count == 0) { 4016 rw_destroy(&shm_locality->loc_lock); 4017 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4018 kmem_free(shm_locality, sizeof (*shm_locality)); 4019 vp->v_locality = 0; 4020 vp->v_flag &= ~V_LOCALITY; 4021 } 4022 mutex_exit(&vp->v_lock); 4023 } 4024 4025 /* 4026 * Compare two shared memory policy segments 4027 * Used by AVL tree code for searching 4028 */ 4029 int 4030 lgrp_shm_policy_compar(const void *x, const void *y) 4031 { 4032 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 4033 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 4034 4035 if (a->shm_off < b->shm_off) 4036 return (-1); 4037 if (a->shm_off >= b->shm_off + b->shm_size) 4038 return (1); 4039 return (0); 4040 } 4041 4042 /* 4043 * Concatenate seg1 with seg2 and remove seg2 4044 */ 4045 static int 4046 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 4047 lgrp_shm_policy_seg_t *seg2) 4048 { 4049 if (!seg1 || !seg2 || 4050 seg1->shm_off + seg1->shm_size != seg2->shm_off || 4051 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 4052 return (-1); 4053 4054 seg1->shm_size += seg2->shm_size; 4055 avl_remove(tree, seg2); 4056 kmem_free(seg2, sizeof (*seg2)); 4057 return (0); 4058 } 4059 4060 /* 4061 * Split segment at given offset and return rightmost (uppermost) segment 4062 * Assumes that there are no overlapping segments 4063 */ 4064 static lgrp_shm_policy_seg_t * 4065 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4066 u_offset_t off) 4067 { 4068 lgrp_shm_policy_seg_t *newseg; 4069 
avl_index_t where; 4070 4071 ASSERT(seg != NULL); 4072 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4073 4074 if (!seg || off < seg->shm_off || off > seg->shm_off + 4075 seg->shm_size) 4076 return (NULL); 4077 4078 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4079 return (seg); 4080 4081 /* 4082 * Adjust size of left segment and allocate new (right) segment 4083 */ 4084 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4085 newseg->shm_policy = seg->shm_policy; 4086 newseg->shm_off = off; 4087 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4088 seg->shm_size = off - seg->shm_off; 4089 4090 /* 4091 * Find where to insert new segment in AVL tree and insert it 4092 */ 4093 (void) avl_find(tree, &off, &where); 4094 avl_insert(tree, newseg, where); 4095 4096 return (newseg); 4097 } 4098 4099 /* 4100 * Set shared memory allocation policy on specified shared object at given 4101 * offset and length 4102 * 4103 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4104 * -1 if can't set policy. 4105 */ 4106 int 4107 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, 4108 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) 4109 { 4110 u_offset_t eoff; 4111 lgrp_shm_policy_seg_t *next; 4112 lgrp_shm_policy_seg_t *newseg; 4113 u_offset_t off; 4114 u_offset_t oldeoff; 4115 lgrp_shm_policy_seg_t *prev; 4116 int retval; 4117 lgrp_shm_policy_seg_t *seg; 4118 lgrp_shm_locality_t *shm_locality; 4119 avl_tree_t *tree; 4120 avl_index_t where; 4121 4122 ASSERT(amp || vp); 4123 ASSERT((len & PAGEOFFSET) == 0); 4124 4125 if (len == 0) 4126 return (-1); 4127 4128 retval = 0; 4129 4130 /* 4131 * Get locality info and starting offset into shared object 4132 * Try anon map first and then vnode 4133 * Assume that no locks need to be held on anon_map or vnode, since 4134 * it should be protected by its reference count which must be nonzero 4135 * for an existing segment. 4136 */ 4137 if (amp) { 4138 /* 4139 * Get policy info from anon_map 4140 * 4141 */ 4142 ASSERT(amp->refcnt != 0); 4143 if (amp->locality == NULL) 4144 lgrp_shm_policy_init(amp, NULL); 4145 shm_locality = amp->locality; 4146 off = ptob(anon_index); 4147 } else if (vp) { 4148 /* 4149 * Get policy info from vnode 4150 */ 4151 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) 4152 lgrp_shm_policy_init(NULL, vp); 4153 shm_locality = vp->v_locality; 4154 ASSERT(shm_locality->loc_count != 0); 4155 off = vn_off; 4156 } else 4157 return (-1); 4158 4159 ASSERT((off & PAGEOFFSET) == 0); 4160 4161 /* 4162 * Figure out default policy 4163 */ 4164 if (policy == LGRP_MEM_POLICY_DEFAULT) 4165 policy = lgrp_mem_policy_default(len, MAP_SHARED); 4166 4167 /* 4168 * Create AVL tree if there isn't one yet 4169 * and set locality field to point at it 4170 */ 4171 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4172 tree = shm_locality->loc_tree; 4173 if (!tree) { 4174 rw_exit(&shm_locality->loc_lock); 4175 4176 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 4177 4178 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4179 if (shm_locality->loc_tree == NULL) { 4180 avl_create(tree, lgrp_shm_policy_compar, 4181 sizeof (lgrp_shm_policy_seg_t), 4182 offsetof(lgrp_shm_policy_seg_t, shm_tree)); 4183 shm_locality->loc_tree = tree; 4184 } else { 4185 /* 4186 * Another thread managed to set up the tree 4187 * before we could. Free the tree we allocated 4188 * and use the one that's already there. 
4189 */ 4190 kmem_free(tree, sizeof (*tree)); 4191 tree = shm_locality->loc_tree; 4192 } 4193 } 4194 4195 /* 4196 * Set policy 4197 * 4198 * Need to maintain hold on writer's lock to keep tree from 4199 * changing out from under us 4200 */ 4201 while (len != 0) { 4202 /* 4203 * Find policy segment for specified offset into shared object 4204 */ 4205 seg = avl_find(tree, &off, &where); 4206 4207 /* 4208 * Didn't find any existing segment that contains specified 4209 * offset, so allocate new segment, insert it, and concatenate 4210 * with adjacent segments if possible 4211 */ 4212 if (seg == NULL) { 4213 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), 4214 KM_SLEEP); 4215 newseg->shm_policy.mem_policy = policy; 4216 newseg->shm_policy.mem_reserved = 0; 4217 newseg->shm_off = off; 4218 avl_insert(tree, newseg, where); 4219 4220 /* 4221 * Check to see whether new segment overlaps with next 4222 * one, set length of new segment accordingly, and 4223 * calculate remaining length and next offset 4224 */ 4225 seg = AVL_NEXT(tree, newseg); 4226 if (seg == NULL || off + len <= seg->shm_off) { 4227 newseg->shm_size = len; 4228 len = 0; 4229 } else { 4230 newseg->shm_size = seg->shm_off - off; 4231 off = seg->shm_off; 4232 len -= newseg->shm_size; 4233 } 4234 4235 /* 4236 * Try to concatenate new segment with next and 4237 * previous ones, since they might have the same policy 4238 * now. Grab previous and next segments first because 4239 * they will change on concatenation. 4240 */ 4241 prev = AVL_PREV(tree, newseg); 4242 next = AVL_NEXT(tree, newseg); 4243 (void) lgrp_shm_policy_concat(tree, newseg, next); 4244 (void) lgrp_shm_policy_concat(tree, prev, newseg); 4245 4246 continue; 4247 } 4248 4249 eoff = off + len; 4250 oldeoff = seg->shm_off + seg->shm_size; 4251 4252 /* 4253 * Policy set already? 4254 */ 4255 if (policy == seg->shm_policy.mem_policy) { 4256 /* 4257 * Nothing left to do if offset and length 4258 * fall within this segment 4259 */ 4260 if (eoff <= oldeoff) { 4261 retval = 1; 4262 break; 4263 } else { 4264 len = eoff - oldeoff; 4265 off = oldeoff; 4266 continue; 4267 } 4268 } 4269 4270 /* 4271 * Specified offset and length match existing segment exactly 4272 */ 4273 if (off == seg->shm_off && len == seg->shm_size) { 4274 /* 4275 * Set policy and update current length 4276 */ 4277 seg->shm_policy.mem_policy = policy; 4278 seg->shm_policy.mem_reserved = 0; 4279 len = 0; 4280 4281 /* 4282 * Try concatenating new segment with previous and next 4283 * segments, since they might have the same policy now. 4284 * Grab previous and next segments first because they 4285 * will change on concatenation. 
4286 */ 4287 prev = AVL_PREV(tree, seg); 4288 next = AVL_NEXT(tree, seg); 4289 (void) lgrp_shm_policy_concat(tree, seg, next); 4290 (void) lgrp_shm_policy_concat(tree, prev, seg); 4291 } else { 4292 /* 4293 * Specified offset and length only apply to part of 4294 * existing segment 4295 */ 4296 4297 /* 4298 * New segment starts in middle of old one, so split 4299 * new one off near beginning of old one 4300 */ 4301 newseg = NULL; 4302 if (off > seg->shm_off) { 4303 newseg = lgrp_shm_policy_split(tree, seg, off); 4304 4305 /* 4306 * New segment ends where old one did, so try 4307 * to concatenate with next segment 4308 */ 4309 if (eoff == oldeoff) { 4310 newseg->shm_policy.mem_policy = policy; 4311 newseg->shm_policy.mem_reserved = 0; 4312 (void) lgrp_shm_policy_concat(tree, 4313 newseg, AVL_NEXT(tree, newseg)); 4314 break; 4315 } 4316 } 4317 4318 /* 4319 * New segment ends before old one, so split off end of 4320 * old one 4321 */ 4322 if (eoff < oldeoff) { 4323 if (newseg) { 4324 (void) lgrp_shm_policy_split(tree, 4325 newseg, eoff); 4326 newseg->shm_policy.mem_policy = policy; 4327 newseg->shm_policy.mem_reserved = 0; 4328 } else { 4329 (void) lgrp_shm_policy_split(tree, seg, 4330 eoff); 4331 seg->shm_policy.mem_policy = policy; 4332 seg->shm_policy.mem_reserved = 0; 4333 } 4334 4335 if (off == seg->shm_off) 4336 (void) lgrp_shm_policy_concat(tree, 4337 AVL_PREV(tree, seg), seg); 4338 break; 4339 } 4340 4341 /* 4342 * Calculate remaining length and next offset 4343 */ 4344 len = eoff - oldeoff; 4345 off = oldeoff; 4346 } 4347 } 4348 4349 rw_exit(&shm_locality->loc_lock); 4350 return (retval); 4351 } 4352 4353 /* 4354 * Return the best memnode from which to allocate memory given 4355 * an lgroup. 4356 * 4357 * "c" is for cookie, which is good enough for me. 4358 * It references a cookie struct that should be zeroed to initialize. 4359 * The cookie should live on the caller's stack (an illustrative usage 4360 * sketch follows this routine). 4361 * The routine returns -1 when: 4362 * - the search scope is LGRP_SRCH_LOCAL, and all the memnodes in "lgrp" have been returned. 4363 * - the scope allows upward traversal, and all the memnodes in the system 4364 * have been returned. 4365 */ 4366 int 4367 lgrp_memnode_choose(lgrp_mnode_cookie_t *c) 4368 { 4369 lgrp_t *lp = c->lmc_lgrp; 4370 mnodeset_t nodes = c->lmc_nodes; 4371 int cnt = c->lmc_cnt; 4372 int offset, mnode; 4373 4374 extern int max_mem_nodes; 4375 4376 /* 4377 * If the set is empty, and the caller is willing, traverse 4378 * up the hierarchy until we find a non-empty set. 4379 */ 4380 while (nodes == (mnodeset_t)0 || cnt <= 0) { 4381 if (c->lmc_scope == LGRP_SRCH_LOCAL || 4382 ((lp = lp->lgrp_parent) == NULL)) 4383 return (-1); 4384 4385 nodes = lp->lgrp_mnodes & ~(c->lmc_tried); 4386 cnt = lp->lgrp_nmnodes - c->lmc_ntried; 4387 } 4388 4389 /* 4390 * Select a memnode by picking one at a "random" offset. 4391 * Because of DR, memnodes can come and go at any time. 4392 * This code must be able to cope with the possibility 4393 * that the nodes count "cnt" is inconsistent with respect 4394 * to the number of elements actually in "nodes", and 4395 * therefore that the offset chosen could be greater than 4396 * the number of elements in the set (some memnodes may 4397 * have disappeared just before cnt was read). 4398 * If this happens, the search simply wraps back to the 4399 * beginning of the set.
4400 */ 4401 ASSERT(nodes != (mnodeset_t)0 && cnt > 0); 4402 offset = c->lmc_rand % cnt; 4403 do { 4404 for (mnode = 0; mnode < max_mem_nodes; mnode++) 4405 if (nodes & ((mnodeset_t)1 << mnode)) 4406 if (!offset--) 4407 break; 4408 } while (mnode >= max_mem_nodes); 4409 4410 /* Found a node. Store state before returning. */ 4411 c->lmc_lgrp = lp; 4412 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); 4413 c->lmc_cnt = cnt - 1; 4414 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); 4415 c->lmc_ntried++; 4416 4417 return (mnode); 4418 } 4419
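/*
 * Editorial example (illustrative sketch, not part of the original file):
 * one way a caller might consume lgrp_memnode_choose() using the cookie
 * protocol described above.  The cookie lives on the caller's stack and is
 * assumed to be set up with the LGRP_MNODE_COOKIE_INIT() initializer and the
 * LGRP_SRCH_HIER search scope from <sys/lgrp.h>; with LGRP_SRCH_LOCAL the
 * walk would stop after the given lgroup's own memnodes instead of
 * traversing up the hierarchy.  The function name is hypothetical.
 */
static void
example_walk_memnodes(lgrp_t *lgrp)
{
	lgrp_mnode_cookie_t	cookie;
	int			mnode;

	/* Zero and initialize the stack-resident cookie for this lgroup */
	LGRP_MNODE_COOKIE_INIT(cookie, lgrp, LGRP_SRCH_HIER);

	/* -1 means every memnode within the search scope has been returned */
	while ((mnode = lgrp_memnode_choose(&cookie)) != -1) {
		/* try to allocate pages from memnode "mnode" here */
		continue;
	}
}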
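/*
 * Editorial example (illustrative sketch, not part of the original file):
 * a minimal caller of lgrp_shm_policy_set(), defined earlier in this file,
 * applying a policy to a page-aligned range of an anon_map-backed shared
 * object and interpreting the 0/1/-1 return convention.  The function and
 * variable names are hypothetical; "len" must be page aligned, as
 * lgrp_shm_policy_set() asserts.
 */
static int
example_set_shared_policy(struct anon_map *amp, ulong_t anon_index, size_t len)
{
	int	ret;

	/* Request random placement across lgroups for this range */
	ret = lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp, anon_index,
	    NULL, 0, len);

	if (ret == -1)
		return (EINVAL);	/* policy could not be set */

	/* ret == 1 means the range already carried this policy; 0 otherwise */
	return (0);
}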