1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Basic NUMA support in terms of locality groups 31 * 32 * Solaris needs to know which CPUs, memory, etc. are near each other to 33 * provide good performance on NUMA machines by optimizing for locality. 34 * In order to do this, a new abstraction called a "locality group (lgroup)" 35 * has been introduced to keep track of which CPU-like and memory-like hardware 36 * resources are close to each other. Currently, latency is the only measure 37 * used to determine how to group hardware resources into lgroups, but this 38 * does not limit the groupings to be based solely on latency. Other factors 39 * may be used to determine the groupings in the future. 40 * 41 * Lgroups are organized into a hierarchy or topology that represents the 42 * latency topology of the machine. There is always at least a root lgroup in 43 * the system. It represents all the hardware resources in the machine at a 44 * latency big enough that any hardware resource can at least access any other 45 * hardware resource within that latency. A Uniform Memory Access (UMA) 46 * machine is represented with one lgroup (the root). In contrast, a NUMA 47 * machine is represented at least by the root lgroup and some number of leaf 48 * lgroups where the leaf lgroups contain the hardware resources within the 49 * least latency of each other and the root lgroup still contains all the 50 * resources in the machine. Some number of intermediate lgroups may exist 51 * which represent more levels of locality than just the local latency of the 52 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups 53 * (eg. root and intermediate lgroups) contain the next nearest resources to 54 * their child lgroups. Thus, the lgroup hierarchy from a given leaf lgroup 55 * to the root lgroup shows the hardware resources from closest to farthest 56 * from the leaf lgroup such that each successive ancestor lgroup contains 57 * the next nearest resources at the next level of locality from the previous. 58 * 59 * The kernel uses the lgroup abstraction to know how to allocate resources 60 * near a given process/thread. At fork() and lwp/thread_create() time, a 61 * "home" lgroup is chosen for a thread. This is done by picking the lgroup 62 * with the lowest load average. Binding to a processor or processor set will 63 * change the home lgroup for a thread. The scheduler has been modified to try 64 * to dispatch a thread on a CPU in its home lgroup. 
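 *
 * (Illustrative sketch, not part of the code in this file: a kernel
 * consumer can look up the current thread's home lgroup through the
 * interfaces defined later in this file, e.g.
 *
 *	lgrp_id_t	home = lgrp_home_id(curthread);
 *	lgrp_t		*lgrp = lgrp_home_lgrp();
 *
 * disabling kernel preemption around the calls if the result must remain
 * valid afterwards, as described in the comments above those routines.)
 *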
Physical memory 65 * allocation is lgroup aware too, so memory will be allocated from the current 66 * thread's home lgroup if possible. If the desired resources are not 67 * available, the kernel traverses the lgroup hierarchy going to the parent 68 * lgroup to find resources at the next level of locality until it reaches the 69 * root lgroup. 70 */ 71 72 #include <sys/lgrp.h> 73 #include <sys/lgrp_user.h> 74 #include <sys/types.h> 75 #include <sys/mman.h> 76 #include <sys/param.h> 77 #include <sys/var.h> 78 #include <sys/thread.h> 79 #include <sys/cpuvar.h> 80 #include <sys/cpupart.h> 81 #include <sys/kmem.h> 82 #include <vm/seg.h> 83 #include <vm/seg_kmem.h> 84 #include <vm/seg_spt.h> 85 #include <vm/seg_vn.h> 86 #include <vm/as.h> 87 #include <sys/atomic.h> 88 #include <sys/systm.h> 89 #include <sys/errno.h> 90 #include <sys/cmn_err.h> 91 #include <sys/kstat.h> 92 #include <sys/sysmacros.h> 93 #include <sys/chip.h> 94 #include <sys/promif.h> 95 #include <sys/sdt.h> 96 97 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ 98 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ 99 /* indexed by lgrp_id */ 100 int nlgrps; /* number of lgroups in machine */ 101 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */ 102 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */ 103 104 /* 105 * Kstat data for lgroups. 106 * 107 * Actual kstat data is collected in lgrp_stats array. 108 * The lgrp_kstat_data array of named kstats is used to extract data from 109 * lgrp_stats and present it to kstat framework. It is protected from parallel 110 * modifications by lgrp_kstat_mutex. This may cause some contention when 111 * several kstat commands run in parallel but this is not the 112 * performance-critical path. 113 */ 114 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */ 115 116 /* 117 * Declare kstat names statically for enums as defined in the header file. 118 */ 119 LGRP_KSTAT_NAMES; 120 121 static void lgrp_kstat_init(void); 122 static int lgrp_kstat_extract(kstat_t *, int); 123 static void lgrp_kstat_reset(lgrp_id_t); 124 125 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS]; 126 static kmutex_t lgrp_kstat_mutex; 127 128 129 /* 130 * max number of lgroups supported by the platform 131 */ 132 int nlgrpsmax = 0; 133 134 /* 135 * The root lgroup. Represents the set of resources at the system wide 136 * level of locality. 137 */ 138 lgrp_t *lgrp_root = NULL; 139 140 /* 141 * During system bootstrap cp_default does not contain the list of lgrp load 142 * averages (cp_lgrploads). The list is allocated after the first CPU is brought 143 * on-line when cp_default is initialized by cpupart_initialize_default(). 144 * Configuring CPU0 may create a two-level topology with root and one leaf node 145 * containing CPU0. This topology is initially constructed in a special 146 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned 147 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used 148 * for all lpl operations until cp_default is fully constructed. 149 * 150 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other 151 * consumer that needs the default lpl should use lpl_bootstrap, which is a pointer to 152 * the first element of lpl_bootstrap_list. 153 * 154 * CPUs that are added to the system, but have not yet been assigned to an 155 * lgrp will use lpl_bootstrap as a default lpl. 
This is necessary because 156 * on some architectures (x86) it's possible for the slave CPU startup thread 157 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init(). 158 */ 159 #define LPL_BOOTSTRAP_SIZE 2 160 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 161 lpl_t *lpl_bootstrap; 162 163 /* 164 * If cp still references the bootstrap lpl, it has not yet been added to 165 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where 166 * a thread is trying to allocate memory close to a CPU that has no lgrp. 167 */ 168 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap) 169 170 static lgrp_t lroot; 171 172 173 /* 174 * Size, in bytes, beyond which random memory allocation policy is applied 175 * to non-shared memory. Default is the maximum size, so random memory 176 * allocation won't be used for non-shared memory by default. 177 */ 178 size_t lgrp_privm_random_thresh = (size_t)(-1); 179 180 /* 181 * Size, in bytes, beyond which random memory allocation policy is applied to 182 * shared memory. Default is 8MB (2 ISM pages). 183 */ 184 size_t lgrp_shm_random_thresh = 8*1024*1024; 185 186 /* 187 * Whether to do processor set aware memory allocation by default 188 */ 189 int lgrp_mem_pset_aware = 0; 190 191 /* 192 * Set the default memory allocation policy for root lgroup 193 */ 194 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 195 196 /* 197 * Set the default memory allocation policy. For most platforms, 198 * next touch is sufficient, but some platforms may wish to override 199 * this. 200 */ 201 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 202 203 204 /* 205 * lgroup CPU event handlers 206 */ 207 static void lgrp_cpu_init(struct cpu *); 208 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 209 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 210 211 static void lgrp_latency_change(u_longlong_t, u_longlong_t); 212 213 /* 214 * lgroup memory event handlers 215 */ 216 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 217 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 218 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 219 220 /* 221 * lgroup CPU partition event handlers 222 */ 223 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 224 static void lgrp_part_del_cpu(struct cpu *); 225 226 static void lgrp_root_init(void); 227 228 /* 229 * lpl topology 230 */ 231 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 232 static void lpl_clear(lpl_t *); 233 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 234 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 235 static void lpl_rset_add(lpl_t *, lpl_t *); 236 static void lpl_rset_del(lpl_t *, lpl_t *); 237 static int lpl_rset_contains(lpl_t *, lpl_t *); 238 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 239 static void lpl_child_update(lpl_t *, struct cpupart *); 240 static int lpl_pick(lpl_t *, lpl_t *); 241 static void lpl_verify_wrapper(struct cpupart *); 242 243 /* 244 * defines for lpl topology verifier return codes 245 */ 246 247 #define LPL_TOPO_CORRECT 0 248 #define LPL_TOPO_PART_HAS_NO_LPL -1 249 #define LPL_TOPO_CPUS_NOT_EMPTY -2 250 #define LPL_TOPO_LGRP_MISMATCH -3 251 #define LPL_TOPO_MISSING_PARENT -4 252 #define LPL_TOPO_PARENT_MISMATCH -5 253 #define LPL_TOPO_BAD_CPUCNT -6 254 #define LPL_TOPO_RSET_MISMATCH -7 255 #define LPL_TOPO_LPL_ORPHANED -8 256 #define LPL_TOPO_LPL_BAD_NCPU -9 257 #define LPL_TOPO_RSET_MSSNG_LF -10 258 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 259 #define 
LPL_TOPO_BOGUS_HINT -12 260 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 261 #define LPL_TOPO_LGRP_NOT_LEAF -14 262 #define LPL_TOPO_BAD_RSETCNT -15 263 264 /* 265 * Return whether lgroup optimizations should be enabled on this system 266 */ 267 int 268 lgrp_optimizations(void) 269 { 270 /* 271 * System must have more than 2 lgroups to enable lgroup optimizations 272 * 273 * XXX This assumes that a 2 lgroup system has an empty root lgroup 274 * with one child lgroup containing all the resources. A 2 lgroup 275 * system with a root lgroup directly containing CPUs or memory might 276 * need lgroup optimizations with its child lgroup, but there 277 * isn't such a machine for now.... 278 */ 279 if (nlgrps > 2) 280 return (1); 281 282 return (0); 283 } 284 285 /* 286 * Build full lgroup topology 287 */ 288 static void 289 lgrp_root_init(void) 290 { 291 lgrp_handle_t hand; 292 int i; 293 lgrp_id_t id; 294 295 /* 296 * Create the "root" lgroup 297 */ 298 ASSERT(nlgrps == 0); 299 id = nlgrps++; 300 301 lgrp_root = &lroot; 302 303 lgrp_root->lgrp_cpu = NULL; 304 lgrp_root->lgrp_mnodes = 0; 305 lgrp_root->lgrp_nmnodes = 0; 306 hand = lgrp_plat_root_hand(); 307 lgrp_root->lgrp_plathand = hand; 308 309 lgrp_root->lgrp_id = id; 310 lgrp_root->lgrp_cpucnt = 0; 311 lgrp_root->lgrp_childcnt = 0; 312 klgrpset_clear(lgrp_root->lgrp_children); 313 klgrpset_clear(lgrp_root->lgrp_leaves); 314 lgrp_root->lgrp_parent = NULL; 315 lgrp_root->lgrp_chips = NULL; 316 lgrp_root->lgrp_chipcnt = 0; 317 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 318 319 for (i = 0; i < LGRP_RSRC_COUNT; i++) 320 klgrpset_clear(lgrp_root->lgrp_set[i]); 321 322 lgrp_root->lgrp_kstat = NULL; 323 324 lgrp_table[id] = lgrp_root; 325 326 /* 327 * Setup initial lpl list for CPU0 and initial t0 home. 328 * The only lpl space we have so far is lpl_bootstrap. It is used for 329 * all topology operations until cp_default is initialized at which 330 * point t0.t_lpl will be updated. 331 */ 332 lpl_bootstrap = lpl_bootstrap_list; 333 t0.t_lpl = lpl_bootstrap; 334 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 335 lpl_bootstrap_list[1].lpl_lgrpid = 1; 336 cp_default.cp_lgrploads = lpl_bootstrap; 337 } 338 339 /* 340 * Initialize the lgroup framework and allow the platform to do the same 341 */ 342 void 343 lgrp_init(void) 344 { 345 /* 346 * Initialize the platform 347 */ 348 lgrp_plat_init(); 349 350 /* 351 * Set max number of lgroups supported on this platform which must be 352 * less than the max number of lgroups supported by the common lgroup 353 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 354 */ 355 nlgrpsmax = lgrp_plat_max_lgrps(); 356 ASSERT(nlgrpsmax <= NLGRPS_MAX); 357 } 358 359 /* 360 * Create the root and cpu0's lgroup, and set t0's home. 361 */ 362 void 363 lgrp_setup(void) 364 { 365 /* 366 * Setup the root lgroup 367 */ 368 lgrp_root_init(); 369 370 /* 371 * Add cpu0 to an lgroup 372 */ 373 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 374 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 375 } 376 377 /* 378 * Lgroup initialization is split in two parts. The first part 379 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 380 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 381 * when all CPUs are brought online and all distance information is available. 382 * 383 * When lgrp_main_init() is complete it sets lgrp_initialized. The 384 * lgrp_main_mp_init() sets lgrp_topo_initialized. 
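 *
 * Illustrative boot-time ordering (a sketch of the relative ordering
 * described above and in the comments on the individual routines; the
 * exact call sites are not shown here):
 *
 *	lgrp_init()		- platform init, sets nlgrpsmax
 *	lgrp_setup()		- create the root lgroup, add cpu0
 *	lgrp_main_init()	- sets lgrp_initialized
 *	start_other_cpus()
 *	lgrp_main_mp_init()	- sets lgrp_topo_initialized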
385 */ 386 387 /* 388 * true when lgrp initialization has been completed. 389 */ 390 int lgrp_initialized = 0; 391 392 /* 393 * True when lgrp topology is constructed. 394 */ 395 int lgrp_topo_initialized = 0; 396 397 /* 398 * Init routine called after startup(), /etc/system has been processed, 399 * and cpu0 has been added to an lgroup. 400 */ 401 void 402 lgrp_main_init(void) 403 { 404 cpu_t *cp = CPU; 405 lgrp_id_t lgrpid; 406 int i; 407 /* 408 * Enforce a valid lgrp_mem_default_policy 409 */ 410 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 411 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES)) 412 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 413 414 /* 415 * See if mpo should be disabled. 416 * This may happen in the case of null proc LPA on Starcat. 417 * The platform won't be able to detect null proc LPA until after 418 * cpu0 and memory have already been added to lgroups. 419 * When and if it is detected, the Starcat platform will return 420 * a different platform handle for cpu0 which is what we check for 421 * here. If mpo should be disabled move cpu0 to its rightful place 422 * (the root), and destroy the remaining lgroups. This effectively 423 * provides a UMA lgroup topology. 424 */ 425 lgrpid = cp->cpu_lpl->lpl_lgrpid; 426 if (lgrp_table[lgrpid]->lgrp_plathand != 427 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 428 lgrp_part_del_cpu(cp); 429 lgrp_cpu_fini(cp, lgrpid); 430 431 lgrp_cpu_init(cp); 432 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 433 434 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 435 436 for (i = 0; i <= lgrp_alloc_max; i++) { 437 if (LGRP_EXISTS(lgrp_table[i]) && 438 lgrp_table[i] != lgrp_root) 439 lgrp_destroy(lgrp_table[i]); 440 } 441 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 442 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 443 } 444 445 /* 446 * Initialize kstats framework. 447 */ 448 lgrp_kstat_init(); 449 /* 450 * cpu0 is finally where it should be, so create its lgroup's kstats 451 */ 452 mutex_enter(&cpu_lock); 453 lgrp_kstat_create(cp); 454 mutex_exit(&cpu_lock); 455 456 lgrp_plat_main_init(); 457 lgrp_initialized = 1; 458 } 459 460 /* 461 * Finish lgrp initialization after all CPUs are brought on-line. 462 * This routine is called after start_other_cpus(). 463 */ 464 void 465 lgrp_main_mp_init(void) 466 { 467 klgrpset_t changed; 468 469 /* 470 * Update lgroup topology (if necessary) 471 */ 472 klgrpset_clear(changed); 473 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 474 lgrp_topo_initialized = 1; 475 } 476 477 /* 478 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 479 */ 480 void 481 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 482 { 483 klgrpset_t changed; 484 cpu_t *cp; 485 lgrp_id_t id; 486 int rc; 487 488 switch (event) { 489 /* 490 * The following (re)configuration events are common code 491 * initiated. lgrp_plat_config() is called here to inform the 492 * platform of the reconfiguration event. 493 */ 494 case LGRP_CONFIG_CPU_ADD: 495 cp = (cpu_t *)resource; 496 497 /* 498 * Initialize the new CPU's lgrp related next/prev 499 * links, and give it a bootstrap lpl so that it can 500 * survive should it need to enter the dispatcher. 
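 *
 * (Sketch of how this pair of events is raised for a new CPU,
 * mirroring what lgrp_setup() does above for cpu0; the surrounding
 * CPU configuration code is assumed and not shown here:
 *
 *	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)cp, 0);
 *	...
 *	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)cp, 0);
 * )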
501 */ 502 cp->cpu_next_lpl = cp; 503 cp->cpu_prev_lpl = cp; 504 cp->cpu_next_lgrp = cp; 505 cp->cpu_prev_lgrp = cp; 506 cp->cpu_lpl = lpl_bootstrap; 507 508 lgrp_plat_config(event, resource); 509 atomic_add_32(&lgrp_gen, 1); 510 511 break; 512 case LGRP_CONFIG_CPU_DEL: 513 lgrp_plat_config(event, resource); 514 atomic_add_32(&lgrp_gen, 1); 515 516 break; 517 case LGRP_CONFIG_CPU_ONLINE: 518 cp = (cpu_t *)resource; 519 lgrp_cpu_init(cp); 520 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 521 rc = lpl_topo_verify(cp->cpu_part); 522 if (rc != LPL_TOPO_CORRECT) { 523 panic("lpl_topo_verify failed: %d", rc); 524 } 525 lgrp_plat_config(event, resource); 526 atomic_add_32(&lgrp_gen, 1); 527 528 break; 529 case LGRP_CONFIG_CPU_OFFLINE: 530 cp = (cpu_t *)resource; 531 id = cp->cpu_lpl->lpl_lgrpid; 532 lgrp_part_del_cpu(cp); 533 lgrp_cpu_fini(cp, id); 534 rc = lpl_topo_verify(cp->cpu_part); 535 if (rc != LPL_TOPO_CORRECT) { 536 panic("lpl_topo_verify failed: %d", rc); 537 } 538 lgrp_plat_config(event, resource); 539 atomic_add_32(&lgrp_gen, 1); 540 541 break; 542 case LGRP_CONFIG_CPUPART_ADD: 543 cp = (cpu_t *)resource; 544 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 545 rc = lpl_topo_verify(cp->cpu_part); 546 if (rc != LPL_TOPO_CORRECT) { 547 panic("lpl_topo_verify failed: %d", rc); 548 } 549 lgrp_plat_config(event, resource); 550 551 break; 552 case LGRP_CONFIG_CPUPART_DEL: 553 cp = (cpu_t *)resource; 554 lgrp_part_del_cpu((cpu_t *)resource); 555 rc = lpl_topo_verify(cp->cpu_part); 556 if (rc != LPL_TOPO_CORRECT) { 557 panic("lpl_topo_verify failed: %d", rc); 558 } 559 lgrp_plat_config(event, resource); 560 561 break; 562 /* 563 * The following events are initiated by the memnode 564 * subsystem. 565 */ 566 case LGRP_CONFIG_MEM_ADD: 567 lgrp_mem_init((int)resource, where, B_FALSE); 568 atomic_add_32(&lgrp_gen, 1); 569 570 break; 571 case LGRP_CONFIG_MEM_DEL: 572 lgrp_mem_fini((int)resource, where, B_FALSE); 573 atomic_add_32(&lgrp_gen, 1); 574 575 break; 576 case LGRP_CONFIG_MEM_RENAME: { 577 lgrp_config_mem_rename_t *ren_arg = 578 (lgrp_config_mem_rename_t *)where; 579 580 lgrp_mem_rename((int)resource, 581 ren_arg->lmem_rename_from, 582 ren_arg->lmem_rename_to); 583 atomic_add_32(&lgrp_gen, 1); 584 585 break; 586 } 587 case LGRP_CONFIG_GEN_UPDATE: 588 atomic_add_32(&lgrp_gen, 1); 589 590 break; 591 case LGRP_CONFIG_FLATTEN: 592 if (where == 0) 593 lgrp_topo_levels = (int)resource; 594 else 595 (void) lgrp_topo_flatten(resource, 596 lgrp_table, lgrp_alloc_max, &changed); 597 598 break; 599 /* 600 * Initiated by platform latency probing code 601 */ 602 case LGRP_CONFIG_LATENCY_CHANGE: 603 lgrp_latency_change((u_longlong_t)resource, 604 (u_longlong_t)where); 605 606 break; 607 case LGRP_CONFIG_NOP: 608 609 break; 610 default: 611 break; 612 } 613 614 } 615 616 /* 617 * Called to add lgrp info into cpu structure from cpu_add_unit; 618 * do not assume cpu is in cpu[] yet! 619 * 620 * CPUs are brought online with all other CPUs paused so we can't 621 * allocate memory or we could deadlock the system, so we rely on 622 * the platform to statically allocate as much space as we need 623 * for the lgrp structs and stats. 624 */ 625 static void 626 lgrp_cpu_init(struct cpu *cp) 627 { 628 klgrpset_t changed; 629 int count; 630 lgrp_handle_t hand; 631 int first_cpu; 632 lgrp_t *my_lgrp; 633 lgrp_id_t lgrpid; 634 struct cpu *cptr; 635 struct chip *chp; 636 637 /* 638 * This is the first time through if the resource set 639 * for the root lgroup is empty. 
After cpu0 has been 640 * initially added to an lgroup, the root's CPU resource 641 * set can never be empty, since the system's last CPU 642 * cannot be offlined. 643 */ 644 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 645 /* 646 * First time through. 647 */ 648 first_cpu = 1; 649 } else { 650 /* 651 * If cpu0 needs to move lgroups, we may come 652 * through here again, at which time cpu_lock won't 653 * be held, and lgrp_initialized will be false. 654 */ 655 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 656 ASSERT(cp->cpu_part != NULL); 657 first_cpu = 0; 658 } 659 660 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 661 my_lgrp = lgrp_hand_to_lgrp(hand); 662 663 if (my_lgrp == NULL) { 664 /* 665 * Create new lgrp and add it to lgroup topology 666 */ 667 my_lgrp = lgrp_create(); 668 my_lgrp->lgrp_plathand = hand; 669 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 670 lgrpid = my_lgrp->lgrp_id; 671 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 672 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 673 674 count = 0; 675 klgrpset_clear(changed); 676 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 677 &changed); 678 /* 679 * May have added new intermediate lgroups, so need to add 680 * resources other than CPUs which are added below 681 */ 682 (void) lgrp_mnode_update(changed, NULL); 683 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 684 > 0) { 685 /* 686 * Leaf lgroup was created, but latency wasn't available 687 * then. So, set latency for it and fill in rest of lgroup 688 * topology now that we know how far it is from other leaf 689 * lgroups. 690 */ 691 lgrpid = my_lgrp->lgrp_id; 692 klgrpset_clear(changed); 693 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 694 lgrpid)) 695 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 696 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 697 &changed); 698 699 /* 700 * May have added new intermediate lgroups, so need to add 701 * resources other than CPUs which are added below 702 */ 703 (void) lgrp_mnode_update(changed, NULL); 704 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 705 my_lgrp->lgrp_id)) { 706 int i; 707 708 /* 709 * Update existing lgroup and lgroups containing it with CPU 710 * resource 711 */ 712 lgrpid = my_lgrp->lgrp_id; 713 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 714 for (i = 0; i <= lgrp_alloc_max; i++) { 715 lgrp_t *lgrp; 716 717 lgrp = lgrp_table[i]; 718 if (!LGRP_EXISTS(lgrp) || 719 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 720 continue; 721 722 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 723 } 724 } 725 726 lgrpid = my_lgrp->lgrp_id; 727 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 728 729 /* 730 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 731 * end up in lpl for lgroup 0 whether it is supposed to be in there or 732 * not since none of lgroup IDs in the lpl's have been set yet. 
733 */ 734 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 735 cp->cpu_lpl->lpl_lgrpid = lgrpid; 736 737 /* 738 * link the CPU into the lgrp's CPU list 739 */ 740 if (my_lgrp->lgrp_cpucnt == 0) { 741 my_lgrp->lgrp_cpu = cp; 742 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 743 } else { 744 cptr = my_lgrp->lgrp_cpu; 745 cp->cpu_next_lgrp = cptr; 746 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 747 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 748 cptr->cpu_prev_lgrp = cp; 749 } 750 my_lgrp->lgrp_cpucnt++; 751 752 /* 753 * Add this cpu's chip to the per lgroup list 754 * if necessary 755 */ 756 if (cp->cpu_chip->chip_lgrp == NULL) { 757 struct chip *lcpr; 758 759 chp = cp->cpu_chip; 760 761 if (my_lgrp->lgrp_chipcnt == 0) { 762 my_lgrp->lgrp_chips = chp; 763 chp->chip_next_lgrp = 764 chp->chip_prev_lgrp = chp; 765 } else { 766 lcpr = my_lgrp->lgrp_chips; 767 chp->chip_next_lgrp = lcpr; 768 chp->chip_prev_lgrp = 769 lcpr->chip_prev_lgrp; 770 lcpr->chip_prev_lgrp->chip_next_lgrp = 771 chp; 772 lcpr->chip_prev_lgrp = chp; 773 } 774 chp->chip_lgrp = my_lgrp; 775 chp->chip_balance = chp->chip_next_lgrp; 776 my_lgrp->lgrp_chipcnt++; 777 } 778 } 779 780 lgrp_t * 781 lgrp_create(void) 782 { 783 lgrp_t *my_lgrp; 784 lgrp_id_t lgrpid; 785 int i; 786 787 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 788 789 /* 790 * Find an open slot in the lgroup table and recycle unused lgroup 791 * left there if any 792 */ 793 my_lgrp = NULL; 794 if (lgrp_alloc_hint == -1) 795 /* 796 * Allocate from end when hint not set yet because no lgroups 797 * have been deleted yet 798 */ 799 lgrpid = nlgrps++; 800 else { 801 /* 802 * Start looking for next open slot from hint and leave hint 803 * at slot allocated 804 */ 805 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 806 my_lgrp = lgrp_table[i]; 807 if (!LGRP_EXISTS(my_lgrp)) { 808 lgrpid = i; 809 nlgrps++; 810 break; 811 } 812 } 813 lgrp_alloc_hint = lgrpid; 814 } 815 816 /* 817 * Keep track of max lgroup ID allocated so far to cut down on searches 818 */ 819 if (lgrpid > lgrp_alloc_max) 820 lgrp_alloc_max = lgrpid; 821 822 /* 823 * Need to allocate new lgroup if next open slot didn't have one 824 * for recycling 825 */ 826 if (my_lgrp == NULL) 827 my_lgrp = lgrp_plat_alloc(lgrpid); 828 829 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 830 panic("Too many lgrps for platform (%d)", nlgrps); 831 832 my_lgrp->lgrp_id = lgrpid; 833 my_lgrp->lgrp_latency = 0; 834 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 835 my_lgrp->lgrp_parent = NULL; 836 my_lgrp->lgrp_childcnt = 0; 837 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 838 my_lgrp->lgrp_nmnodes = 0; 839 klgrpset_clear(my_lgrp->lgrp_children); 840 klgrpset_clear(my_lgrp->lgrp_leaves); 841 for (i = 0; i < LGRP_RSRC_COUNT; i++) 842 klgrpset_clear(my_lgrp->lgrp_set[i]); 843 844 my_lgrp->lgrp_cpu = NULL; 845 my_lgrp->lgrp_cpucnt = 0; 846 my_lgrp->lgrp_chips = NULL; 847 my_lgrp->lgrp_chipcnt = 0; 848 849 if (my_lgrp->lgrp_kstat != NULL) 850 lgrp_kstat_reset(lgrpid); 851 852 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 853 854 return (my_lgrp); 855 } 856 857 void 858 lgrp_destroy(lgrp_t *lgrp) 859 { 860 int i; 861 862 /* 863 * Unless this lgroup is being destroyed on behalf of 864 * the boot CPU, cpu_lock must be held 865 */ 866 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 867 868 if (nlgrps == 1) 869 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 870 871 if (!LGRP_EXISTS(lgrp)) 872 return; 873 874 /* 875 * Set hint to lgroup being deleted and try to keep lower numbered 876 * hints to facilitate finding empty slots 877 */ 
878 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 879 lgrp_alloc_hint = lgrp->lgrp_id; 880 881 /* 882 * Mark this lgroup to be recycled by setting its lgroup ID to 883 * LGRP_NONE and clear relevant fields 884 */ 885 lgrp->lgrp_id = LGRP_NONE; 886 lgrp->lgrp_latency = 0; 887 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 888 lgrp->lgrp_parent = NULL; 889 lgrp->lgrp_childcnt = 0; 890 891 klgrpset_clear(lgrp->lgrp_children); 892 klgrpset_clear(lgrp->lgrp_leaves); 893 for (i = 0; i < LGRP_RSRC_COUNT; i++) 894 klgrpset_clear(lgrp->lgrp_set[i]); 895 896 lgrp->lgrp_mnodes = (mnodeset_t)0; 897 lgrp->lgrp_nmnodes = 0; 898 899 lgrp->lgrp_cpu = NULL; 900 lgrp->lgrp_cpucnt = 0; 901 lgrp->lgrp_chipcnt = 0; 902 lgrp->lgrp_chips = NULL; 903 904 nlgrps--; 905 } 906 907 /* 908 * Initialize kstat data. Called from lgrp initialization code. 909 */ 910 static void 911 lgrp_kstat_init(void) 912 { 913 lgrp_stat_t stat; 914 915 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 916 917 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 918 kstat_named_init(&lgrp_kstat_data[stat], 919 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 920 } 921 922 /* 923 * initialize an lgrp's kstats if needed 924 * called with cpu_lock held but not with cpus paused. 925 * we don't tear these down now because we don't know about 926 * memory leaving the lgrp yet... 927 */ 928 929 void 930 lgrp_kstat_create(cpu_t *cp) 931 { 932 kstat_t *lgrp_kstat; 933 lgrp_id_t lgrpid; 934 lgrp_t *my_lgrp; 935 936 ASSERT(MUTEX_HELD(&cpu_lock)); 937 938 lgrpid = cp->cpu_lpl->lpl_lgrpid; 939 my_lgrp = lgrp_table[lgrpid]; 940 941 if (my_lgrp->lgrp_kstat != NULL) 942 return; /* already initialized */ 943 944 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 945 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 946 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 947 948 if (lgrp_kstat != NULL) { 949 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 950 lgrp_kstat->ks_private = my_lgrp; 951 lgrp_kstat->ks_data = &lgrp_kstat_data; 952 lgrp_kstat->ks_update = lgrp_kstat_extract; 953 my_lgrp->lgrp_kstat = lgrp_kstat; 954 kstat_install(lgrp_kstat); 955 } 956 } 957 958 /* 959 * this will do something when we manage to remove now unused lgrps 960 */ 961 962 /* ARGSUSED */ 963 void 964 lgrp_kstat_destroy(cpu_t *cp) 965 { 966 ASSERT(MUTEX_HELD(&cpu_lock)); 967 } 968 969 /* 970 * Called when a CPU is off-lined. 971 */ 972 static void 973 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 974 { 975 lgrp_t *my_lgrp; 976 struct cpu *prev; 977 struct cpu *next; 978 chip_t *chp; 979 980 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 981 982 prev = cp->cpu_prev_lgrp; 983 next = cp->cpu_next_lgrp; 984 985 prev->cpu_next_lgrp = next; 986 next->cpu_prev_lgrp = prev; 987 988 /* 989 * just because I'm paranoid doesn't mean... 990 */ 991 992 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 993 994 my_lgrp = lgrp_table[lgrpid]; 995 my_lgrp->lgrp_cpucnt--; 996 997 /* 998 * If the last CPU on its chip is being offlined 999 * then remove this chip from the per lgroup list. 1000 * 1001 * This is also done for the boot CPU when it needs 1002 * to move between lgroups as a consequence of 1003 * null proc lpa. 
1004 */ 1005 chp = cp->cpu_chip; 1006 if (chp->chip_ncpu == 0 || !lgrp_initialized) { 1007 1008 chip_t *chpp; 1009 1010 if (--my_lgrp->lgrp_chipcnt == 0) 1011 my_lgrp->lgrp_chips = NULL; 1012 else if (my_lgrp->lgrp_chips == chp) 1013 my_lgrp->lgrp_chips = chp->chip_next_lgrp; 1014 1015 /* 1016 * Walk this lgroup's chip list looking for chips that 1017 * may try to balance against the one that's leaving 1018 */ 1019 for (chpp = chp->chip_next_lgrp; chpp != chp; 1020 chpp = chpp->chip_next_lgrp) { 1021 if (chpp->chip_balance == chp) 1022 chpp->chip_balance = chp->chip_next_lgrp; 1023 } 1024 1025 chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp; 1026 chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp; 1027 1028 chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL; 1029 chp->chip_lgrp = NULL; 1030 chp->chip_balance = NULL; 1031 } 1032 1033 /* 1034 * Removing last CPU in lgroup, so update lgroup topology 1035 */ 1036 if (my_lgrp->lgrp_cpucnt == 0) { 1037 klgrpset_t changed; 1038 int count; 1039 int i; 1040 1041 my_lgrp->lgrp_cpu = NULL; 1042 1043 /* 1044 * Remove this lgroup from its lgroup CPU resources and remove 1045 * lgroup from lgroup topology if it doesn't have any more 1046 * resources in it now 1047 */ 1048 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1049 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1050 count = 0; 1051 klgrpset_clear(changed); 1052 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1053 lgrp_alloc_max + 1, &changed); 1054 return; 1055 } 1056 1057 /* 1058 * This lgroup isn't empty, so just remove it from CPU 1059 * resources of any lgroups that contain it as such 1060 */ 1061 for (i = 0; i <= lgrp_alloc_max; i++) { 1062 lgrp_t *lgrp; 1063 1064 lgrp = lgrp_table[i]; 1065 if (!LGRP_EXISTS(lgrp) || 1066 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1067 lgrpid)) 1068 continue; 1069 1070 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1071 } 1072 return; 1073 } 1074 1075 if (my_lgrp->lgrp_cpu == cp) 1076 my_lgrp->lgrp_cpu = next; 1077 1078 } 1079 1080 /* 1081 * Update memory nodes in target lgroups and return ones that get changed 1082 */ 1083 int 1084 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1085 { 1086 int count; 1087 int i; 1088 int j; 1089 lgrp_t *lgrp; 1090 lgrp_t *lgrp_rsrc; 1091 1092 count = 0; 1093 if (changed) 1094 klgrpset_clear(*changed); 1095 1096 if (klgrpset_isempty(target)) 1097 return (0); 1098 1099 /* 1100 * Find each lgroup in target lgroups 1101 */ 1102 for (i = 0; i <= lgrp_alloc_max; i++) { 1103 /* 1104 * Skip any lgroups that don't exist or aren't in target group 1105 */ 1106 lgrp = lgrp_table[i]; 1107 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1108 continue; 1109 } 1110 1111 /* 1112 * Initialize memnodes for intermediate lgroups to 0 1113 * and update them from scratch since they may have completely 1114 * changed 1115 */ 1116 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1117 lgrp->lgrp_mnodes = (mnodeset_t)0; 1118 lgrp->lgrp_nmnodes = 0; 1119 } 1120 1121 /* 1122 * Update memory nodes of of target lgroup with memory nodes 1123 * from each lgroup in its lgroup memory resource set 1124 */ 1125 for (j = 0; j <= lgrp_alloc_max; j++) { 1126 int k; 1127 1128 /* 1129 * Skip any lgroups that don't exist or aren't in 1130 * memory resources of target lgroup 1131 */ 1132 lgrp_rsrc = lgrp_table[j]; 1133 if (!LGRP_EXISTS(lgrp_rsrc) || 1134 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1135 j)) 1136 continue; 1137 1138 /* 1139 * Update target lgroup's memnodes to include memnodes 1140 * of this 
lgroup 1141 */ 1142 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1143 mnodeset_t mnode_mask; 1144 1145 mnode_mask = (mnodeset_t)1 << k; 1146 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1147 !(lgrp->lgrp_mnodes & mnode_mask)) { 1148 lgrp->lgrp_mnodes |= mnode_mask; 1149 lgrp->lgrp_nmnodes++; 1150 } 1151 } 1152 count++; 1153 if (changed) 1154 klgrpset_add(*changed, lgrp->lgrp_id); 1155 } 1156 } 1157 1158 return (count); 1159 } 1160 1161 /* 1162 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1163 * is moved from one board to another. The "from" and "to" arguments specify the 1164 * source and the destination of the move. 1165 * 1166 * See plat_lgrp_config() for a detailed description of the copy-rename 1167 * semantics. 1168 * 1169 * lgrp_mem_rename() is called by the platform copy-rename code to update 1170 * the lgroup topology which is changing as memory moves from one lgroup to 1171 * another. It removes the mnode from the source lgroup and re-inserts it in the 1172 * target lgroup. 1173 * 1174 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and 1175 * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR 1176 * copy-rename operation. 1177 * 1178 * There is one case which requires special handling. If the system contains 1179 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the 1180 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by 1181 * lgrp_mem_init(), but there is a window when the system has no memory in the 1182 * lgroup hierarchy. If another thread tries to allocate memory during this 1183 * window, the allocation will fail, although the system has physical memory. 1184 * This may cause a system panic or a deadlock (some sleeping memory allocations 1185 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting 1186 * the mnode back). 1187 * 1188 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the 1189 * lgrp with non-empty lgrp_mnodes. To deal with the special case above, 1190 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes, 1191 * but it updates the rest of the lgroup topology as if the mnode was actually 1192 * removed. The lgrp_mem_init() function recognizes that the mnode being 1193 * inserted represents such a special case and updates the topology 1194 * appropriately. 1195 */ 1196 void 1197 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to) 1198 { 1199 /* 1200 * Remove the memory from the source node and add it to the destination 1201 * node. 1202 */ 1203 lgrp_mem_fini(mnode, from, B_TRUE); 1204 lgrp_mem_init(mnode, to, B_TRUE); 1205 } 1206 1207 /* 1208 * Called to indicate that the lgrp with platform handle "hand" now 1209 * contains the memory identified by "mnode". 1210 * 1211 * LOCKING for this routine is a bit tricky. Usually it is called without 1212 * cpu_lock and it must grab cpu_lock here to prevent racing with other 1213 * callers. During DR of the board containing the caged memory it may be called 1214 * with cpu_lock already held and CPUs paused. 1215 * 1216 * If the insertion is part of the DR copy-rename and the inserted mnode (and 1217 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are 1218 * dealing with the special case of DR copy-rename described in 1219 * lgrp_mem_rename(). 
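 *
 * (Sketch of the conditional locking pattern the routine below uses:
 *
 *	if (!MUTEX_HELD(&cpu_lock)) {
 *		mutex_enter(&cpu_lock);
 *		drop_lock = B_TRUE;
 *	}
 *	... update the topology, pausing CPUs only if not already paused ...
 *	if (drop_lock)
 *		mutex_exit(&cpu_lock);
 * )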
1220 */ 1221 void 1222 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1223 { 1224 klgrpset_t changed; 1225 int count; 1226 int i; 1227 lgrp_t *my_lgrp; 1228 lgrp_id_t lgrpid; 1229 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1230 boolean_t drop_lock = B_FALSE; 1231 boolean_t need_synch = B_FALSE; 1232 1233 /* 1234 * Grab CPU lock (if we haven't already) 1235 */ 1236 if (!MUTEX_HELD(&cpu_lock)) { 1237 mutex_enter(&cpu_lock); 1238 drop_lock = B_TRUE; 1239 } 1240 1241 /* 1242 * This routine may be called from a context where we already 1243 * hold cpu_lock, and have already paused cpus. 1244 */ 1245 if (!cpus_paused()) 1246 need_synch = B_TRUE; 1247 1248 /* 1249 * Check if this mnode is already configured and return immediately if 1250 * it is. 1251 * 1252 * NOTE: in special case of copy-rename of the only remaining mnode, 1253 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1254 * recognize this case and continue as usual, but skip the update to 1255 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1256 * in topology, temporarily introduced by lgrp_mem_fini(). 1257 */ 1258 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1259 lgrp_root->lgrp_mnodes & mnodes_mask) { 1260 if (drop_lock) 1261 mutex_exit(&cpu_lock); 1262 return; 1263 } 1264 1265 /* 1266 * Update lgroup topology with new memory resources, keeping track of 1267 * which lgroups change 1268 */ 1269 count = 0; 1270 klgrpset_clear(changed); 1271 my_lgrp = lgrp_hand_to_lgrp(hand); 1272 if (my_lgrp == NULL) { 1273 /* new lgrp */ 1274 my_lgrp = lgrp_create(); 1275 lgrpid = my_lgrp->lgrp_id; 1276 my_lgrp->lgrp_plathand = hand; 1277 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1278 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1279 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1280 1281 if (need_synch) 1282 pause_cpus(NULL); 1283 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1284 &changed); 1285 if (need_synch) 1286 start_cpus(); 1287 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1288 > 0) { 1289 /* 1290 * Leaf lgroup was created, but latency wasn't available 1291 * then. So, set latency for it and fill in rest of lgroup 1292 * topology now that we know how far it is from other leaf 1293 * lgroups. 
1294 */ 1295 klgrpset_clear(changed); 1296 lgrpid = my_lgrp->lgrp_id; 1297 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1298 lgrpid)) 1299 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1300 if (need_synch) 1301 pause_cpus(NULL); 1302 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1303 &changed); 1304 if (need_synch) 1305 start_cpus(); 1306 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1307 my_lgrp->lgrp_id)) { 1308 /* 1309 * Add new lgroup memory resource to existing lgroup 1310 */ 1311 lgrpid = my_lgrp->lgrp_id; 1312 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1313 klgrpset_add(changed, lgrpid); 1314 count++; 1315 for (i = 0; i <= lgrp_alloc_max; i++) { 1316 lgrp_t *lgrp; 1317 1318 lgrp = lgrp_table[i]; 1319 if (!LGRP_EXISTS(lgrp) || 1320 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1321 continue; 1322 1323 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1324 klgrpset_add(changed, lgrp->lgrp_id); 1325 count++; 1326 } 1327 } 1328 1329 /* 1330 * Add memory node to lgroup and remove lgroup from ones that need 1331 * to be updated 1332 */ 1333 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1334 my_lgrp->lgrp_mnodes |= mnodes_mask; 1335 my_lgrp->lgrp_nmnodes++; 1336 } 1337 klgrpset_del(changed, lgrpid); 1338 1339 /* 1340 * Update memory node information for all lgroups that changed and 1341 * contain new memory node as a resource 1342 */ 1343 if (count) 1344 (void) lgrp_mnode_update(changed, NULL); 1345 1346 if (drop_lock) 1347 mutex_exit(&cpu_lock); 1348 } 1349 1350 /* 1351 * Called to indicate that the lgroup associated with the platform 1352 * handle "hand" no longer contains given memory node 1353 * 1354 * LOCKING for this routine is a bit tricky. Usually it is called without 1355 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1356 * callers. During DR of the board containing the caged memory it may be called 1357 * with cpu_lock already held and CPUs paused. 1358 * 1359 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1360 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1361 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1362 * the same mnode back into the topology. See lgrp_mem_rename() and 1363 * lgrp_mem_init() for additional details. 1364 */ 1365 void 1366 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1367 { 1368 klgrpset_t changed; 1369 int count; 1370 int i; 1371 lgrp_t *my_lgrp; 1372 lgrp_id_t lgrpid; 1373 mnodeset_t mnodes_mask; 1374 boolean_t drop_lock = B_FALSE; 1375 boolean_t need_synch = B_FALSE; 1376 1377 /* 1378 * Grab CPU lock (if we haven't already) 1379 */ 1380 if (!MUTEX_HELD(&cpu_lock)) { 1381 mutex_enter(&cpu_lock); 1382 drop_lock = B_TRUE; 1383 } 1384 1385 /* 1386 * This routine may be called from a context where we already 1387 * hold cpu_lock and have already paused cpus. 
1388 */ 1389 if (!cpus_paused()) 1390 need_synch = B_TRUE; 1391 1392 my_lgrp = lgrp_hand_to_lgrp(hand); 1393 1394 /* 1395 * The lgrp *must* be pre-existing 1396 */ 1397 ASSERT(my_lgrp != NULL); 1398 1399 /* 1400 * Delete memory node from lgroups which contain it 1401 */ 1402 mnodes_mask = ((mnodeset_t)1 << mnode); 1403 for (i = 0; i <= lgrp_alloc_max; i++) { 1404 lgrp_t *lgrp = lgrp_table[i]; 1405 /* 1406 * Skip any non-existent lgroups and any lgroups that don't 1407 * contain leaf lgroup of memory as a memory resource 1408 */ 1409 if (!LGRP_EXISTS(lgrp) || 1410 !(lgrp->lgrp_mnodes & mnodes_mask)) 1411 continue; 1412 1413 /* 1414 * Avoid removing the last mnode from the root in the DR 1415 * copy-rename case. See lgrp_mem_rename() for details. 1416 */ 1417 if (is_copy_rename && 1418 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1419 continue; 1420 1421 /* 1422 * Remove memory node from lgroup. 1423 */ 1424 lgrp->lgrp_mnodes &= ~mnodes_mask; 1425 lgrp->lgrp_nmnodes--; 1426 ASSERT(lgrp->lgrp_nmnodes >= 0); 1427 } 1428 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1429 1430 /* 1431 * Don't need to update lgroup topology if this lgroup still has memory. 1432 * 1433 * In the special case of DR copy-rename with the only mnode being 1434 * removed, the lgrp_mnodes for the root is always non-zero, but we 1435 * still need to update the lgroup topology. 1436 */ 1437 if ((my_lgrp->lgrp_nmnodes > 0) && 1438 !(is_copy_rename && 1439 (my_lgrp == lgrp_root) && 1440 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1441 if (drop_lock) 1442 mutex_exit(&cpu_lock); 1443 return; 1444 } 1445 1446 /* 1447 * This lgroup does not contain any memory now 1448 */ 1449 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1450 1451 /* 1452 * Remove this lgroup from lgroup topology if it does not contain any 1453 * resources now 1454 */ 1455 lgrpid = my_lgrp->lgrp_id; 1456 count = 0; 1457 klgrpset_clear(changed); 1458 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1459 /* 1460 * Delete lgroup when no more resources 1461 */ 1462 if (need_synch) 1463 pause_cpus(NULL); 1464 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1465 lgrp_alloc_max + 1, &changed); 1466 ASSERT(count > 0); 1467 if (need_synch) 1468 start_cpus(); 1469 } else { 1470 /* 1471 * Remove lgroup from memory resources of any lgroups that 1472 * contain it as such 1473 */ 1474 for (i = 0; i <= lgrp_alloc_max; i++) { 1475 lgrp_t *lgrp; 1476 1477 lgrp = lgrp_table[i]; 1478 if (!LGRP_EXISTS(lgrp) || 1479 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1480 lgrpid)) 1481 continue; 1482 1483 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1484 } 1485 } 1486 if (drop_lock) 1487 mutex_exit(&cpu_lock); 1488 } 1489 1490 /* 1491 * Return lgroup with given platform handle 1492 */ 1493 lgrp_t * 1494 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1495 { 1496 int i; 1497 lgrp_t *lgrp; 1498 1499 if (hand == LGRP_NULL_HANDLE) 1500 return (NULL); 1501 1502 for (i = 0; i <= lgrp_alloc_max; i++) { 1503 lgrp = lgrp_table[i]; 1504 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1505 return (lgrp); 1506 } 1507 return (NULL); 1508 } 1509 1510 /* 1511 * Return the home lgroup of the current thread. 1512 * We must do this with kernel preemption disabled, since we don't want our 1513 * thread to be re-homed while we're poking around with its lpl, and the lpl 1514 * should never be NULL. 1515 * 1516 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1517 * is enabled because of DR. 
Callers can disable kernel preemption 1518 * around this call to guarantee that the lgroup will be valid beyond this 1519 * routine, since kernel preemption can be recursive. 1520 */ 1521 lgrp_t * 1522 lgrp_home_lgrp(void) 1523 { 1524 lgrp_t *lgrp; 1525 lpl_t *lpl; 1526 1527 kpreempt_disable(); 1528 1529 lpl = curthread->t_lpl; 1530 ASSERT(lpl != NULL); 1531 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1532 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1533 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1534 1535 kpreempt_enable(); 1536 1537 return (lgrp); 1538 } 1539 1540 /* 1541 * Return ID of home lgroup for given thread 1542 * (See comments for lgrp_home_lgrp() for special care and handling 1543 * instructions) 1544 */ 1545 lgrp_id_t 1546 lgrp_home_id(kthread_t *t) 1547 { 1548 lgrp_id_t lgrp; 1549 lpl_t *lpl; 1550 1551 ASSERT(t != NULL); 1552 /* 1553 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1554 * cannot since the HAT layer can call into this routine to 1555 * determine the locality for its data structures in the context 1556 * of a page fault. 1557 */ 1558 1559 kpreempt_disable(); 1560 1561 lpl = t->t_lpl; 1562 ASSERT(lpl != NULL); 1563 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1564 lgrp = lpl->lpl_lgrpid; 1565 1566 kpreempt_enable(); 1567 1568 return (lgrp); 1569 } 1570 1571 /* 1572 * Return lgroup containing the physical memory for the given page frame number 1573 */ 1574 lgrp_t * 1575 lgrp_pfn_to_lgrp(pfn_t pfn) 1576 { 1577 lgrp_handle_t hand; 1578 int i; 1579 lgrp_t *lgrp; 1580 1581 hand = lgrp_plat_pfn_to_hand(pfn); 1582 if (hand != LGRP_NULL_HANDLE) 1583 for (i = 0; i <= lgrp_alloc_max; i++) { 1584 lgrp = lgrp_table[i]; 1585 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1586 return (lgrp); 1587 } 1588 return (NULL); 1589 } 1590 1591 /* 1592 * Return lgroup containing the physical memory for the given physical address 1593 */ 1594 lgrp_t * 1595 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1596 { 1597 lgrp_handle_t hand; 1598 int i; 1599 lgrp_t *lgrp; 1600 pfn_t pfn; 1601 1602 pfn = btop(physaddr); 1603 hand = lgrp_plat_pfn_to_hand(pfn); 1604 if (hand != LGRP_NULL_HANDLE) 1605 for (i = 0; i <= lgrp_alloc_max; i++) { 1606 lgrp = lgrp_table[i]; 1607 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1608 return (lgrp); 1609 } 1610 return (NULL); 1611 } 1612 1613 /* 1614 * Return the leaf lgroup containing the given CPU 1615 * 1616 * The caller needs to take precautions necessary to prevent 1617 * "cpu" from going away across a call to this function. 1618 * hint: kpreempt_disable()/kpreempt_enable() 1619 */ 1620 static lgrp_t * 1621 lgrp_cpu_to_lgrp(cpu_t *cpu) 1622 { 1623 return (cpu->cpu_chip->chip_lgrp); 1624 } 1625 1626 /* 1627 * Return the sum of the partition loads in an lgrp divided by 1628 * the number of CPUs in the lgrp. This is our best approximation 1629 * of an 'lgroup load average' for a useful per-lgroup kstat. 
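 *
 * (Sketch: the value exported through the "lgrp" kstat created in
 * lgrp_kstat_create() is
 *
 *	loadavg = (sum of lpl_loadavg over the lgroup's CPU list) / lgrp_cpucnt
 *
 * computed under cpu_lock by the routine below.)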
1630 */ 1631 static uint64_t 1632 lgrp_sum_loadavgs(lgrp_t *lgrp) 1633 { 1634 cpu_t *cpu; 1635 int ncpu; 1636 uint64_t loads = 0; 1637 1638 mutex_enter(&cpu_lock); 1639 1640 cpu = lgrp->lgrp_cpu; 1641 ncpu = lgrp->lgrp_cpucnt; 1642 1643 if (cpu == NULL || ncpu == 0) { 1644 mutex_exit(&cpu_lock); 1645 return (0ull); 1646 } 1647 1648 do { 1649 loads += cpu->cpu_lpl->lpl_loadavg; 1650 cpu = cpu->cpu_next_lgrp; 1651 } while (cpu != lgrp->lgrp_cpu); 1652 1653 mutex_exit(&cpu_lock); 1654 1655 return (loads / ncpu); 1656 } 1657 1658 void 1659 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1660 { 1661 struct lgrp_stats *pstats; 1662 1663 /* 1664 * Verify that the caller isn't trying to add to 1665 * a statistic for an lgroup that has gone away 1666 */ 1667 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1668 return; 1669 1670 pstats = &lgrp_stats[lgrpid]; 1671 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1672 } 1673 1674 int64_t 1675 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1676 { 1677 uint64_t val; 1678 struct lgrp_stats *pstats; 1679 1680 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1681 return ((int64_t)0); 1682 1683 pstats = &lgrp_stats[lgrpid]; 1684 LGRP_STAT_READ(pstats, stat, val); 1685 return (val); 1686 } 1687 1688 /* 1689 * Reset all kstats for lgrp specified by its lgrpid. 1690 */ 1691 static void 1692 lgrp_kstat_reset(lgrp_id_t lgrpid) 1693 { 1694 lgrp_stat_t stat; 1695 1696 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1697 return; 1698 1699 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1700 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1701 } 1702 } 1703 1704 /* 1705 * Collect all per-lgrp statistics for the lgrp associated with this 1706 * kstat, and store them in the ks_data array. 1707 * 1708 * The superuser can reset all the running counter statistics for an 1709 * lgrp by writing to any of the lgrp's stats. 1710 */ 1711 static int 1712 lgrp_kstat_extract(kstat_t *ksp, int rw) 1713 { 1714 lgrp_stat_t stat; 1715 struct kstat_named *ksd; 1716 lgrp_t *lgrp; 1717 lgrp_id_t lgrpid; 1718 1719 lgrp = (lgrp_t *)ksp->ks_private; 1720 1721 ksd = (struct kstat_named *)ksp->ks_data; 1722 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1723 1724 lgrpid = lgrp->lgrp_id; 1725 1726 if (lgrpid == LGRP_NONE) { 1727 /* 1728 * Return all zeroes as stats for freed lgrp. 
1729 */ 1730 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1731 ksd[stat].value.i64 = 0; 1732 } 1733 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1734 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1735 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1736 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1737 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1738 } else if (rw != KSTAT_WRITE) { 1739 /* 1740 * Handle counter stats 1741 */ 1742 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1743 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1744 } 1745 1746 /* 1747 * Handle kernel data snapshot stats 1748 */ 1749 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1750 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1751 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1752 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1753 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1754 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1755 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1756 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1757 } else { 1758 lgrp_kstat_reset(lgrpid); 1759 } 1760 1761 return (0); 1762 } 1763 1764 int 1765 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1766 { 1767 cpu_t *cp; 1768 1769 mutex_enter(&cpu_lock); 1770 1771 if ((cp = cpu_get(id)) == NULL) { 1772 mutex_exit(&cpu_lock); 1773 return (EINVAL); 1774 } 1775 1776 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1777 mutex_exit(&cpu_lock); 1778 return (EINVAL); 1779 } 1780 1781 ASSERT(cp->cpu_lpl != NULL); 1782 1783 *lp = cp->cpu_lpl->lpl_lgrpid; 1784 1785 mutex_exit(&cpu_lock); 1786 1787 return (0); 1788 } 1789 1790 int 1791 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1792 { 1793 cpu_t *cp; 1794 1795 mutex_enter(&cpu_lock); 1796 1797 if ((cp = cpu_get(id)) == NULL) { 1798 mutex_exit(&cpu_lock); 1799 return (EINVAL); 1800 } 1801 1802 ASSERT(cp->cpu_lpl != NULL); 1803 1804 *lp = cp->cpu_lpl->lpl_loadavg; 1805 1806 mutex_exit(&cpu_lock); 1807 1808 return (0); 1809 } 1810 1811 void 1812 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime) 1813 { 1814 lgrp_t *lgrp; 1815 int i; 1816 1817 for (i = 0; i <= lgrp_alloc_max; i++) { 1818 lgrp = lgrp_table[i]; 1819 1820 if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime)) 1821 lgrp->lgrp_latency = (int)newtime; 1822 } 1823 } 1824 1825 /* 1826 * Add a resource named by lpl_leaf to rset of lpl_target 1827 * 1828 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1829 * resource. It is adjusted here, as this is presently the only place that we 1830 * can be certain a resource addition has succeeded. 1831 * 1832 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1833 * list in order until it reaches a NULL. (This list is required to be NULL 1834 * terminated, too). This is done so that we can mark start pos + 1, so that 1835 * each lpl is traversed sequentially, but in a different order. We hope this 1836 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 
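 *
 * (Sketch of the kind of walk this layout is meant to support; the
 * dispatcher's actual loop lives elsewhere, and "hint" here stands for
 * an lpl's lpl_hint, used only for illustration:
 *
 *	for (i = hint; (leaf = lpl->lpl_rset[i]) != NULL; i++)
 *		... examine leaf's CPUs ...
 * )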
1837 */ 1838 1839 void 1840 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1841 { 1842 int i; 1843 int entry_slot = 0; 1844 1845 /* return if leaf is already present */ 1846 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1847 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1848 return; 1849 } 1850 1851 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1852 lpl_leaf->lpl_lgrpid) { 1853 break; 1854 } 1855 } 1856 1857 /* insert leaf, update counts */ 1858 entry_slot = i; 1859 i = lpl_target->lpl_nrset++; 1860 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1861 panic("More leaf lgrps in system than are supported!\n"); 1862 } 1863 1864 /* 1865 * Start at the end of the rset array and work backwards towards the 1866 * slot into which the new lpl will be inserted. This effectively 1867 * preserves the current ordering by scooting everybody over one entry, 1868 * and placing the new entry into the space created. 1869 */ 1870 1871 while (i-- > entry_slot) { 1872 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1873 } 1874 1875 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1876 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1877 } 1878 1879 /* 1880 * Update each of lpl_parent's children with a proper hint and 1881 * a reference to their parent. 1882 * The lgrp topology is used as the reference since it is fully 1883 * consistent and correct at this point. 1884 * 1885 * Each child's hint will reference an element in lpl_parent's 1886 * rset that designates where the child should start searching 1887 * for CPU resources. The hint selected is the highest order leaf present 1888 * in the child's lineage. 1889 * 1890 * This should be called after any potential change in lpl_parent's 1891 * rset. 1892 */ 1893 static void 1894 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1895 { 1896 klgrpset_t children, leaves; 1897 lpl_t *lpl; 1898 int hint; 1899 int i, j; 1900 1901 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1902 if (klgrpset_isempty(children)) 1903 return; /* nothing to do */ 1904 1905 for (i = 0; i <= lgrp_alloc_max; i++) { 1906 if (klgrpset_ismember(children, i)) { 1907 1908 /* 1909 * Given the set of leaves in this child's lineage, 1910 * find the highest order leaf present in the parent's 1911 * rset. Select this as the hint for the child. 1912 */ 1913 leaves = lgrp_table[i]->lgrp_leaves; 1914 hint = 0; 1915 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1916 lpl = lpl_parent->lpl_rset[j]; 1917 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1918 hint = j; 1919 } 1920 cp->cp_lgrploads[i].lpl_hint = hint; 1921 1922 /* 1923 * (Re)set the parent. It may be incorrect if 1924 * lpl_parent is new in the topology. 1925 */ 1926 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1927 } 1928 } 1929 } 1930 1931 /* 1932 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1933 * 1934 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1935 * resource. The values are adjusted here, as this is the only place that we can 1936 * be certain a resource was successfully deleted. 
1937 */ 1938 void 1939 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1940 { 1941 int i; 1942 1943 /* find leaf in intermediate node */ 1944 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1945 if (lpl_target->lpl_rset[i] == lpl_leaf) 1946 break; 1947 } 1948 1949 /* return if leaf not found */ 1950 if (lpl_target->lpl_rset[i] != lpl_leaf) 1951 return; 1952 1953 /* prune leaf, compress array */ 1954 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1955 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1956 lpl_target->lpl_ncpu--; 1957 do { 1958 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1959 } while (i++ < lpl_target->lpl_nrset); 1960 } 1961 1962 /* 1963 * Check to see if the resource set of the target lpl contains the 1964 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1965 */ 1966 1967 int 1968 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1969 { 1970 int i; 1971 1972 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1973 if (lpl_target->lpl_rset[i] == lpl_leaf) 1974 return (1); 1975 } 1976 1977 return (0); 1978 } 1979 1980 /* 1981 * Called when we change cpu lpl membership. This increments or decrements the 1982 * per-cpu counter in every lpl in which our leaf appears. 1983 */ 1984 void 1985 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1986 { 1987 cpupart_t *cpupart; 1988 lgrp_t *lgrp_leaf; 1989 lgrp_t *lgrp_cur; 1990 lpl_t *lpl_leaf; 1991 lpl_t *lpl_cur; 1992 int i; 1993 1994 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1995 1996 cpupart = cp->cpu_part; 1997 lpl_leaf = cp->cpu_lpl; 1998 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1999 2000 for (i = 0; i <= lgrp_alloc_max; i++) { 2001 lgrp_cur = lgrp_table[i]; 2002 2003 /* 2004 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 2005 * for the cpu in question, or if the current lgrp and leaf 2006 * don't share the same resources. 2007 */ 2008 2009 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 2010 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 2011 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 2012 continue; 2013 2014 2015 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2016 2017 if (lpl_cur->lpl_nrset > 0) { 2018 if (act == LPL_INCREMENT) { 2019 lpl_cur->lpl_ncpu++; 2020 } else if (act == LPL_DECREMENT) { 2021 lpl_cur->lpl_ncpu--; 2022 } 2023 } 2024 } 2025 } 2026 2027 /* 2028 * Initialize lpl with given resources and specified lgrp 2029 */ 2030 2031 void 2032 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2033 { 2034 lpl->lpl_lgrpid = lgrp->lgrp_id; 2035 lpl->lpl_loadavg = 0; 2036 if (lpl == lpl_leaf) 2037 lpl->lpl_ncpu = 1; 2038 else 2039 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2040 lpl->lpl_nrset = 1; 2041 lpl->lpl_rset[0] = lpl_leaf; 2042 lpl->lpl_lgrp = lgrp; 2043 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2044 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2045 } 2046 2047 /* 2048 * Clear an unused lpl 2049 */ 2050 2051 void 2052 lpl_clear(lpl_t *lpl) 2053 { 2054 lgrpid_t lid; 2055 2056 /* save lid for debugging purposes */ 2057 lid = lpl->lpl_lgrpid; 2058 bzero(lpl, sizeof (lpl_t)); 2059 lpl->lpl_lgrpid = lid; 2060 } 2061 2062 /* 2063 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2064 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2065 * make full use of all of the lgroup topology, but this checks to make sure 2066 * that for the parts that it does use, it has correctly understood the 2067 * relationships that exist. 
This function returns 2068 * 0 if the topology is correct, and a non-zero error code, for non-debug 2069 * kernels if incorrect. Asserts are spread throughout the code to aid in 2070 * debugging on a DEBUG kernel. 2071 */ 2072 int 2073 lpl_topo_verify(cpupart_t *cpupart) 2074 { 2075 lgrp_t *lgrp; 2076 lpl_t *lpl; 2077 klgrpset_t rset; 2078 klgrpset_t cset; 2079 cpu_t *cpu; 2080 cpu_t *cp_start; 2081 int i; 2082 int j; 2083 int sum; 2084 2085 /* topology can't be incorrect if it doesn't exist */ 2086 if (!lgrp_topo_initialized || !lgrp_initialized) 2087 return (LPL_TOPO_CORRECT); 2088 2089 ASSERT(cpupart != NULL); 2090 2091 for (i = 0; i <= lgrp_alloc_max; i++) { 2092 lgrp = lgrp_table[i]; 2093 lpl = NULL; 2094 /* make sure lpls are allocated */ 2095 ASSERT(cpupart->cp_lgrploads); 2096 if (!cpupart->cp_lgrploads) 2097 return (LPL_TOPO_PART_HAS_NO_LPL); 2098 2099 lpl = &cpupart->cp_lgrploads[i]; 2100 /* make sure our index is good */ 2101 ASSERT(i < cpupart->cp_nlgrploads); 2102 2103 /* if lgroup doesn't exist, make sure lpl is empty */ 2104 if (!LGRP_EXISTS(lgrp)) { 2105 ASSERT(lpl->lpl_ncpu == 0); 2106 if (lpl->lpl_ncpu > 0) { 2107 return (LPL_TOPO_CPUS_NOT_EMPTY); 2108 } else { 2109 continue; 2110 } 2111 } 2112 2113 /* verify that lgroup and lpl are identically numbered */ 2114 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2115 2116 /* if lgroup isn't in our partition, make sure lpl is empty */ 2117 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2118 cpupart->cp_lgrpset)) { 2119 ASSERT(lpl->lpl_ncpu == 0); 2120 if (lpl->lpl_ncpu > 0) { 2121 return (LPL_TOPO_CPUS_NOT_EMPTY); 2122 } 2123 /* 2124 * lpl is empty, and lgroup isn't in partition. verify 2125 * that lpl doesn't show up in anyone else's rsets (in 2126 * this partition, anyway) 2127 */ 2128 2129 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2130 lpl_t *i_lpl; /* lpl we're iterating over */ 2131 2132 i_lpl = &cpupart->cp_lgrploads[j]; 2133 2134 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2135 if (lpl_rset_contains(i_lpl, lpl)) { 2136 return (LPL_TOPO_LPL_ORPHANED); 2137 } 2138 } 2139 /* lgroup is empty, and everything is ok. continue */ 2140 continue; 2141 } 2142 2143 2144 /* lgroup is in this partition, now check it against lpl */ 2145 2146 /* do both have matching lgrps? */ 2147 ASSERT(lgrp == lpl->lpl_lgrp); 2148 if (lgrp != lpl->lpl_lgrp) { 2149 return (LPL_TOPO_LGRP_MISMATCH); 2150 } 2151 2152 /* do the parent lgroups exist and do they match? */ 2153 if (lgrp->lgrp_parent) { 2154 ASSERT(lpl->lpl_parent); 2155 ASSERT(lgrp->lgrp_parent->lgrp_id == 2156 lpl->lpl_parent->lpl_lgrpid); 2157 2158 if (!lpl->lpl_parent) { 2159 return (LPL_TOPO_MISSING_PARENT); 2160 } else if (lgrp->lgrp_parent->lgrp_id != 2161 lpl->lpl_parent->lpl_lgrpid) { 2162 return (LPL_TOPO_PARENT_MISMATCH); 2163 } 2164 } 2165 2166 /* only leaf lgroups keep a cpucnt, only check leaves */ 2167 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2168 2169 /* verify that lgrp is also a leaf */ 2170 ASSERT((lgrp->lgrp_childcnt == 0) && 2171 (klgrpset_ismember(lgrp->lgrp_leaves, 2172 lpl->lpl_lgrpid))); 2173 2174 if ((lgrp->lgrp_childcnt > 0) || 2175 (!klgrpset_ismember(lgrp->lgrp_leaves, 2176 lpl->lpl_lgrpid))) { 2177 return (LPL_TOPO_LGRP_NOT_LEAF); 2178 } 2179 2180 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2181 (lpl->lpl_ncpu > 0)); 2182 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2183 (lpl->lpl_ncpu <= 0)) { 2184 return (LPL_TOPO_BAD_CPUCNT); 2185 } 2186 2187 /* 2188 * Check that lpl_ncpu also matches the number of 2189 * cpus in the lpl's linked list. 
This only exists in 2190 * leaves, but they should always match. 2191 */ 2192 j = 0; 2193 cpu = cp_start = lpl->lpl_cpus; 2194 while (cpu != NULL) { 2195 j++; 2196 2197 /* check to make sure cpu's lpl is leaf lpl */ 2198 ASSERT(cpu->cpu_lpl == lpl); 2199 if (cpu->cpu_lpl != lpl) { 2200 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2201 } 2202 2203 /* check next cpu */ 2204 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2205 continue; 2206 } else { 2207 cpu = NULL; 2208 } 2209 } 2210 2211 ASSERT(j == lpl->lpl_ncpu); 2212 if (j != lpl->lpl_ncpu) { 2213 return (LPL_TOPO_LPL_BAD_NCPU); 2214 } 2215 2216 /* 2217 * Also, check that leaf lpl is contained in all 2218 * intermediate lpls that name the leaf as a descendant 2219 */ 2220 2221 for (j = 0; j <= lgrp_alloc_max; j++) { 2222 klgrpset_t intersect; 2223 lgrp_t *lgrp_cand; 2224 lpl_t *lpl_cand; 2225 2226 lgrp_cand = lgrp_table[j]; 2227 intersect = klgrpset_intersects( 2228 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2229 cpupart->cp_lgrpset); 2230 2231 if (!LGRP_EXISTS(lgrp_cand) || 2232 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2233 cpupart->cp_lgrpset) || 2234 (intersect == 0)) 2235 continue; 2236 2237 lpl_cand = 2238 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2239 2240 if (klgrpset_ismember(intersect, 2241 lgrp->lgrp_id)) { 2242 ASSERT(lpl_rset_contains(lpl_cand, 2243 lpl)); 2244 2245 if (!lpl_rset_contains(lpl_cand, lpl)) { 2246 return (LPL_TOPO_RSET_MSSNG_LF); 2247 } 2248 } 2249 } 2250 2251 } else { /* non-leaf specific checks */ 2252 2253 /* 2254 * Non-leaf lpls should have lpl_cpus == NULL 2255 * verify that this is so 2256 */ 2257 ASSERT(lpl->lpl_cpus == NULL); 2258 if (lpl->lpl_cpus != NULL) { 2259 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2260 } 2261 2262 /* 2263 * verify that the sum of the cpus in the leaf resources 2264 * is equal to the total ncpu in the intermediate 2265 */ 2266 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2267 sum += lpl->lpl_rset[j]->lpl_ncpu; 2268 } 2269 2270 ASSERT(sum == lpl->lpl_ncpu); 2271 if (sum != lpl->lpl_ncpu) { 2272 return (LPL_TOPO_LPL_BAD_NCPU); 2273 } 2274 } 2275 2276 /* 2277 * check on lpl_hint. Don't check root, since it has no parent. 2278 */ 2279 if (lpl->lpl_parent != NULL) { 2280 int hint; 2281 lpl_t *hint_lpl; 2282 2283 /* make sure hint is within limits of nrset */ 2284 hint = lpl->lpl_hint; 2285 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2286 if (lpl->lpl_parent->lpl_nrset < hint) { 2287 return (LPL_TOPO_BOGUS_HINT); 2288 } 2289 2290 /* make sure hint points to valid lpl */ 2291 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2292 ASSERT(hint_lpl->lpl_ncpu > 0); 2293 if (hint_lpl->lpl_ncpu <= 0) { 2294 return (LPL_TOPO_BOGUS_HINT); 2295 } 2296 } 2297 2298 /* 2299 * Check the rset of the lpl in question. Make sure that each 2300 * rset contains a subset of the resources in 2301 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2302 * sure that each rset doesn't include resources that are 2303 * outside of that set. (Which would be resources somehow not 2304 * accounted for). 
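         *
         * The subset checks below are done by set difference: with, say,
         * an rset naming lgrps { 1, 2 } and lgrp_set[LGRP_RSRC_CPU] =
         * { 1, 2, 3 }, klgrpset_diff() leaves rset empty, which is the
         * pass condition; a stray lgrp in the rset that is not in the
         * lgroup's CPU resources or not in cp_lgrpset would survive the
         * diff and produce LPL_TOPO_RSET_MISMATCH.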
2305 */ 2306 2307 klgrpset_clear(rset); 2308 for (j = 0; j < lpl->lpl_nrset; j++) { 2309 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2310 } 2311 klgrpset_copy(cset, rset); 2312 /* make sure lpl rset matches lgrp rset */ 2313 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2314 /* make sure rset is contained with in partition, too */ 2315 klgrpset_diff(cset, cpupart->cp_lgrpset); 2316 2317 ASSERT(klgrpset_isempty(rset) && 2318 klgrpset_isempty(cset)); 2319 if (!klgrpset_isempty(rset) || 2320 !klgrpset_isempty(cset)) { 2321 return (LPL_TOPO_RSET_MISMATCH); 2322 } 2323 2324 /* 2325 * check to make sure lpl_nrset matches the number of rsets 2326 * contained in the lpl 2327 */ 2328 2329 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2330 j++); 2331 2332 ASSERT(j == lpl->lpl_nrset); 2333 if (j != lpl->lpl_nrset) { 2334 return (LPL_TOPO_BAD_RSETCNT); 2335 } 2336 2337 } 2338 return (LPL_TOPO_CORRECT); 2339 } 2340 2341 /* 2342 * Flatten lpl topology to given number of levels. This is presently only 2343 * implemented for a flatten to 2 levels, which will prune out the intermediates 2344 * and home the leaf lpls to the root lpl. 2345 */ 2346 int 2347 lpl_topo_flatten(int levels) 2348 { 2349 int i; 2350 uint_t sum; 2351 lgrp_t *lgrp_cur; 2352 lpl_t *lpl_cur; 2353 lpl_t *lpl_root; 2354 cpupart_t *cp; 2355 2356 if (levels != 2) 2357 return (0); 2358 2359 /* called w/ cpus paused - grab no locks! */ 2360 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2361 !lgrp_initialized); 2362 2363 cp = cp_list_head; 2364 do { 2365 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2366 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2367 2368 for (i = 0; i <= lgrp_alloc_max; i++) { 2369 lgrp_cur = lgrp_table[i]; 2370 lpl_cur = &cp->cp_lgrploads[i]; 2371 2372 if ((lgrp_cur == lgrp_root) || 2373 (!LGRP_EXISTS(lgrp_cur) && 2374 (lpl_cur->lpl_ncpu == 0))) 2375 continue; 2376 2377 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2378 /* 2379 * this should be a deleted intermediate, so 2380 * clear it 2381 */ 2382 lpl_clear(lpl_cur); 2383 } else if ((lpl_cur->lpl_nrset == 1) && 2384 (lpl_cur->lpl_rset[0] == lpl_cur) && 2385 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2386 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2387 /* 2388 * this is a leaf whose parent was deleted, or 2389 * whose parent had their lgrp deleted. (And 2390 * whose parent will soon be deleted). Point 2391 * this guy back to the root lpl. 2392 */ 2393 lpl_cur->lpl_parent = lpl_root; 2394 lpl_rset_add(lpl_root, lpl_cur); 2395 } 2396 2397 } 2398 2399 /* 2400 * Now that we're done, make sure the count on the root lpl is 2401 * correct, and update the hints of the children for the sake of 2402 * thoroughness 2403 */ 2404 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2405 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2406 } 2407 lpl_root->lpl_ncpu = sum; 2408 lpl_child_update(lpl_root, cp); 2409 2410 cp = cp->cp_next; 2411 } while (cp != cp_list_head); 2412 2413 return (levels); 2414 } 2415 2416 /* 2417 * Insert a lpl into the resource hierarchy and create any additional lpls that 2418 * are necessary to represent the varying states of locality for the cpu 2419 * resoruces newly added to the partition. 2420 * 2421 * This routine is clever enough that it can correctly add resources from the 2422 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2423 * those for which the lpl is a leaf as opposed to simply a named equally local 2424 * resource). 
The one special case that needs additional processing is when a 2425 * new intermediate lpl is introduced. Since the main loop only traverses 2426 * looking to add the leaf resource where it does not yet exist, additional work 2427 * is necessary to add other leaf resources that may need to exist in the newly 2428 * created intermediate. This is performed by the second inner loop, and is 2429 * only done when the check for more than one overlapping resource succeeds. 2430 */ 2431 2432 void 2433 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2434 { 2435 int i; 2436 int j; 2437 int hint; 2438 int rset_num_intersect; 2439 lgrp_t *lgrp_cur; 2440 lpl_t *lpl_cur; 2441 lpl_t *lpl_parent; 2442 lgrpid_t parent_id; 2443 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2444 2445 for (i = 0; i <= lgrp_alloc_max; i++) { 2446 lgrp_cur = lgrp_table[i]; 2447 2448 /* 2449 * Don't insert if the lgrp isn't there, if the leaf isn't 2450 * contained within the current lgrp, or if the current lgrp has 2451 * no leaves in this partition 2452 */ 2453 2454 if (!LGRP_EXISTS(lgrp_cur) || 2455 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2456 lpl_leaf->lpl_lgrpid) || 2457 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2458 cpupart->cp_lgrpset)) 2459 continue; 2460 2461 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2462 if (lgrp_cur->lgrp_parent != NULL) { 2463 /* if lgrp has a parent, assign it properly */ 2464 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2465 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2466 } else { 2467 /* if not, make sure parent ptr gets set to null */ 2468 lpl_parent = NULL; 2469 } 2470 2471 if (lpl_cur == lpl_leaf) { 2472 /* 2473 * Almost all leaf state was initialized elsewhere. The 2474 * only thing left to do is to set the parent. 2475 */ 2476 lpl_cur->lpl_parent = lpl_parent; 2477 continue; 2478 } 2479 2480 /* 2481 * Initialize intermediate lpl 2482 * Save this lpl's hint though. Since we're changing this 2483 * lpl's resources, we need to update the hint in this lpl's 2484 * children, but the hint in this lpl is unaffected and 2485 * should be preserved. 2486 */ 2487 hint = lpl_cur->lpl_hint; 2488 2489 lpl_clear(lpl_cur); 2490 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2491 2492 lpl_cur->lpl_hint = hint; 2493 lpl_cur->lpl_parent = lpl_parent; 2494 2495 /* does new lpl need to be populated with other resources? */ 2496 rset_intersect = 2497 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2498 cpupart->cp_lgrpset); 2499 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2500 2501 if (rset_num_intersect > 1) { 2502 /* 2503 * If so, figure out what lpls have resources that 2504 * intersect this one, and add them. 2505 */ 2506 for (j = 0; j <= lgrp_alloc_max; j++) { 2507 lgrp_t *lgrp_cand; /* candidate lgrp */ 2508 lpl_t *lpl_cand; /* candidate lpl */ 2509 2510 lgrp_cand = lgrp_table[j]; 2511 if (!LGRP_EXISTS(lgrp_cand) || 2512 !klgrpset_ismember(rset_intersect, 2513 lgrp_cand->lgrp_id)) 2514 continue; 2515 lpl_cand = 2516 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2517 lpl_rset_add(lpl_cur, lpl_cand); 2518 } 2519 } 2520 /* 2521 * This lpl's rset has changed. Update the hint in it's 2522 * children. 2523 */ 2524 lpl_child_update(lpl_cur, cpupart); 2525 } 2526 } 2527 2528 /* 2529 * remove a lpl from the hierarchy of resources, clearing its state when 2530 * finished. If the lpls at the intermediate levels of the hierarchy have no 2531 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2532 * delete them as well. 
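 *
 * For instance, if the leaf being removed is the only remaining entry in
 * an intermediate lpl's rset, or the intermediate's lgroup no longer has
 * any leaves in this partition, the intermediate is cleared outright
 * below; otherwise it survives with a smaller rset and only has its
 * children's hints refreshed via lpl_child_update().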
 */

void
lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
    int i;
    lgrp_t *lgrp_cur;
    lpl_t *lpl_cur;
    klgrpset_t leaf_intersect;	/* intersection of leaves */

    for (i = 0; i <= lgrp_alloc_max; i++) {
        lgrp_cur = lgrp_table[i];

        /*
         * Don't attempt to remove from lgrps that aren't there, that
         * don't contain our leaf, or from the leaf itself. (We do that
         * later)
         */

        if (!LGRP_EXISTS(lgrp_cur))
            continue;

        lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

        if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
            lpl_leaf->lpl_lgrpid) ||
            (lpl_cur == lpl_leaf)) {
            continue;
        }

        /*
         * This is a slightly sleazy simplification in that we have
         * already marked the cp_lgrpset as no longer containing the
         * leaf we've deleted. Any lpls that pass the above checks
         * based upon lgrp membership but not necessarily cpu-part
         * membership also get cleared by the checks below. Currently
         * this is harmless, as the lpls should be empty anyway.
         *
         * In particular, we want to preserve lpls that have additional
         * leaf resources, even though we don't yet have a processor
         * architecture that represents resources this way.
         */

        leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
            cpupart->cp_lgrpset);

        lpl_rset_del(lpl_cur, lpl_leaf);
        if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
            lpl_clear(lpl_cur);
        } else {
            /*
             * Update this lpl's children
             */
            lpl_child_update(lpl_cur, cpupart);
        }
    }
    lpl_clear(lpl_leaf);
}

/*
 * Add a cpu to a partition in terms of lgrp load average bookkeeping.
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, i.e. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two general cases for cpu addition:
 *
 * 1. A lpl structure that contains resources already in the hierarchy tree.
 * In this case, all of the associated lpl relationships have been defined, and
 * all that is necessary is that we link the new cpu into the per-lpl list of
 * cpus, and increment the ncpu count of all places where this cpu resource will
 * be accounted for. lpl_cpu_adjcnt() updates the cpu count, and the cpu pointer
 * linking is accomplished by this routine.
 *
 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
 * not exist yet. In this case, it is necessary to build the leaf lpl, and
 * construct the hierarchy of state necessary to name its more distant
 * resources, if they should exist. The leaf structure is initialized by this
 * routine, as is the cpu-partition state for the lgrp membership. This routine
 * also calls lpl_leaf_insert(), which inserts the named lpl into the hierarchy
 * and builds all of the ancestral state necessary to identify resources at
 * differing levels of locality.
 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
    cpupart_t *cpupart;
    lgrp_t *lgrp_leaf;
    lpl_t *lpl_leaf;

    /* called sometimes w/ cpus paused - grab no locks */
    ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

    cpupart = cp->cpu_part;
    lgrp_leaf = lgrp_table[lgrpid];

    /* don't add non-existent lgrp */
    ASSERT(LGRP_EXISTS(lgrp_leaf));
    lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
    cp->cpu_lpl = lpl_leaf;

    /* only leaf lpls contain cpus */

    if (lpl_leaf->lpl_ncpu++ == 0) {
        lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
        klgrpset_add(cpupart->cp_lgrpset, lgrpid);
        lpl_leaf_insert(lpl_leaf, cpupart);
    } else {
        /*
         * the lpl should already exist in the parent, so just update
         * the count of available CPUs
         */
        lpl_cpu_adjcnt(LPL_INCREMENT, cp);
    }

    /* link cpu into list of cpus in lpl */

    if (lpl_leaf->lpl_cpus) {
        cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
        cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
        lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
        lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
    } else {
        /*
         * We increment ncpu immediately after we create a new leaf
         * lpl, so assert that ncpu == 1 for the case where we don't
         * have any cpu pointers yet.
         */
        ASSERT(lpl_leaf->lpl_ncpu == 1);
        lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
    }

}


/*
 * Remove a cpu from a partition in terms of lgrp load average bookkeeping.
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, i.e. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two removal cases in question:
 *
 * 1. Removal of the resource in the leaf leaves other resources remaining in
 * that leaf. (Another cpu still exists at this level of locality). In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
 * from the per-lpl list of cpus.
 *
 * 2. Removal of the resource results in the lpl containing no resources. (It's
 * empty.) In this case, all of what has occurred for the first step must take
 * place; however, additionally we must remove the lpl structure itself, prune
 * out any stranded lpls that do not directly name a leaf resource, and mark the
 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted. Cpu-partition changes are handled by this
 * routine, but the lpl_leaf_remove() function deals with the details of pruning
 * out the empty lpl and any of its orphaned direct ancestors.
 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
    lpl_t *lpl;
    lpl_t *leaf_lpl;
    lgrp_t *lgrp_leaf;

    /* called sometimes w/ cpus paused - grab no locks */

    ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

    lpl = leaf_lpl = cp->cpu_lpl;
    lgrp_leaf = leaf_lpl->lpl_lgrp;

    /* don't delete a leaf that isn't there */
    ASSERT(LGRP_EXISTS(lgrp_leaf));

    /* no double-deletes */
    ASSERT(lpl->lpl_ncpu);
    if (--lpl->lpl_ncpu == 0) {
        /*
         * This was the last cpu in this lgroup for this partition,
         * clear its bit in the partition's lgroup bitmask
         */
        klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);

        /* eliminate remaining lpl link pointers in cpu, lpl */
        lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;

        lpl_leaf_remove(leaf_lpl, cp->cpu_part);
    } else {

        /* unlink cpu from list of cpus in lpl */
        cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
        cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
        if (lpl->lpl_cpus == cp) {
            lpl->lpl_cpus = cp->cpu_next_lpl;
        }

        /*
         * Update the cpu count in the lpls associated with parent
         * lgroups.
         */
        lpl_cpu_adjcnt(LPL_DECREMENT, cp);

    }
    /* clear cpu's lpl ptr when we're all done */
    cp->cpu_lpl = NULL;
}

/*
 * Recompute load average for the specified partition/lgrp fragment.
 *
 * We rely on the fact that this routine is called from the clock thread
 * at a point before the clock thread can block (i.e. before its first
 * lock request). Since the clock thread cannot be preempted (since it
 * runs at highest priority), we know that cpu partitions cannot change
 * (since doing so would require either the repartition requester or the
 * cpu_pause thread to run on this cpu), so we can update the cpu's load
 * without grabbing cpu_lock.
 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
    uint_t ncpu;
    int64_t old, new, f;

    /*
     * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
     */
    static short expval[] = {
        0, 3196, 1618, 1083,
        814, 652, 543, 466,
        408, 363, 326, 297,
        272, 251, 233, 218,
        204, 192, 181, 172,
        163, 155, 148, 142,
        136, 130, 125, 121,
        116, 112, 109, 105
    };

    /* ASSERT (called from clock level) */

    if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
        ((ncpu = lpl->lpl_ncpu) == 0)) {
        return;
    }

    for (;;) {

        if (ncpu >= sizeof (expval) / sizeof (expval[0]))
            f = expval[1]/ncpu; /* good approx. for large ncpu */
        else
            f = expval[ncpu];

        /*
         * Modify the load average atomically to avoid losing
         * anticipatory load updates (see lgrp_move_thread()).
         */
        if (ageflag) {
            /*
             * We're supposed to both update and age the load.
             * This happens 10 times/sec. per cpu. We do a
             * little hoop-jumping to avoid integer overflow.
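             *
             * Reading the fixed-point arithmetic below: old is split
             * into hi = old >> 16 and lo = old & 0xffff, so that
             * q = hi << 7 and r = lo << 7, and (ignoring truncation)
             * the update amounts to
             *
             *   new = old + (nrcpus * f) / 2^7 - (old * f) / 2^16
             *       = old * (1 - f / 2^16) + nrcpus * (f / 2^7)
             *
             * Since f / 2^16 approximates 1 - exp(-1/(20 * ncpu)) per
             * the expval[] table above, this is an exponential decay of
             * the load toward nrcpus * 2^9, computed without ever
             * forming the full old * f product in a single step.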
             */
            int64_t q, r;

            do {
                old = new = lpl->lpl_loadavg;
                q = (old >> 16) << 7;
                r = (old & 0xffff) << 7;
                new += ((long long)(nrcpus - q) * f -
                    ((r * f) >> 16)) >> 7;

                /*
                 * Check for overflow
                 */
                if (new > LGRP_LOADAVG_MAX)
                    new = LGRP_LOADAVG_MAX;
                else if (new < 0)
                    new = 0;
            } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
                new) != old);
        } else {
            /*
             * We're supposed to update the load, but not age it.
             * This option is used to update the load (which either
             * has already been aged in this 1/10 sec. interval or
             * soon will be) to account for a remotely executing
             * thread.
             */
            do {
                old = new = lpl->lpl_loadavg;
                new += f;
                /*
                 * Check for overflow
                 * Underflow not possible here
                 */
                if (new < old)
                    new = LGRP_LOADAVG_MAX;
            } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
                new) != old);
        }

        /*
         * Do the same for this lpl's parent
         */
        if ((lpl = lpl->lpl_parent) == NULL)
            break;
        ncpu = lpl->lpl_ncpu;
    }
}

/*
 * Initialize lpl topology in the target based on topology currently present in
 * lpl_bootstrap.
 *
 * lpl_topo_bootstrap() is only called once from cpupart_initialize_default() to
 * initialize the cp_default list of lpls. Up to this point all topology
 * operations were performed using lpl_bootstrap. Now cp_default has its own
 * list of lpls and all subsequent lpl operations should use it instead of
 * lpl_bootstrap. The `target' points to the list of lpls in cp_default and
 * `size' is the size of this list.
 *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
 *
 * 1) Copies all fields from lpl_bootstrap to the target.
 *
 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
 *
 * 3) Updates lpl_parent pointers to point to the lpls in the target list
 * instead of lpl_bootstrap.
 *
 * 4) Updates pointers in the resource list of the target to point to the lpls
 * in the target list instead of lpl_bootstrap.
 *
 * After lpl_topo_bootstrap() completes, target contains the same information
 * that would be present there if it were used during boot instead of
 * lpl_bootstrap. The information in lpl_bootstrap is not needed after this
 * point, so it is bzeroed.
 */
void
lpl_topo_bootstrap(lpl_t *target, int size)
{
    lpl_t *lpl = lpl_bootstrap;
    lpl_t *target_lpl = target;
    int howmany;
    int id;
    int i;

    /*
     * The only target that should be passed here is cp_default lpl list.
     */
    ASSERT(target == cp_default.cp_lgrploads);
    ASSERT(size == cp_default.cp_nlgrploads);
    ASSERT(!lgrp_topo_initialized);
    ASSERT(ncpus == 1);

    howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
    for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
        /*
         * Copy all fields from lpl.
         */

        *target_lpl = *lpl;

        /*
         * Substitute CPU0 lpl pointer with one relative to target.
         */
        if (lpl->lpl_cpus == CPU) {
            ASSERT(CPU->cpu_lpl == lpl);
            CPU->cpu_lpl = target_lpl;
        }

        /*
         * Substitute parent information with parent relative to target.
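         * The rebasing below is plain pointer arithmetic: the byte
         * offset of the parent within lpl_bootstrap_list is preserved
         * and reapplied on top of target, so a parent that was
         * &lpl_bootstrap[1] becomes &target[1]; nothing but the base
         * address changes.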
2907 */ 2908 if (lpl->lpl_parent != NULL) 2909 target_lpl->lpl_parent = (lpl_t *) 2910 (((uintptr_t)lpl->lpl_parent - 2911 (uintptr_t)lpl_bootstrap) + 2912 (uintptr_t)target); 2913 2914 /* 2915 * Walk over resource set substituting pointers relative to 2916 * lpl_bootstrap to pointers relative to target. 2917 */ 2918 ASSERT(lpl->lpl_nrset <= 1); 2919 2920 for (id = 0; id < lpl->lpl_nrset; id++) { 2921 if (lpl->lpl_rset[id] != NULL) { 2922 target_lpl->lpl_rset[id] = 2923 (lpl_t *) 2924 (((uintptr_t)lpl->lpl_rset[id] - 2925 (uintptr_t)lpl_bootstrap) + 2926 (uintptr_t)target); 2927 } 2928 } 2929 } 2930 2931 /* 2932 * Topology information in lpl_bootstrap is no longer needed. 2933 */ 2934 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2935 } 2936 2937 /* the maximum effect that a single thread can have on it's lgroup's load */ 2938 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 2939 ((lgrp_loadavg_max_effect) / (ncpu)) 2940 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 2941 2942 /* 2943 * If the lowest load among the lgroups a process' threads are currently 2944 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2945 * expanding the process to a new lgroup. 2946 */ 2947 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2948 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2949 2950 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2951 ((lgrp_expand_proc_thresh) / (ncpu)) 2952 2953 /* 2954 * A process will be expanded to a new lgroup only if the difference between 2955 * the lowest load on the lgroups the process' thread's are currently spread 2956 * across and the lowest load on the other lgroups in the process' partition 2957 * is greater than lgrp_expand_proc_diff. 2958 */ 2959 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2960 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2961 2962 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2963 ((lgrp_expand_proc_diff) / (ncpu)) 2964 2965 /* 2966 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2967 * be present due to impreciseness of the load average decay algorithm. 2968 * 2969 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2970 * tolerance is scaled by the number of cpus in the lgroup just like 2971 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2972 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2973 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2974 */ 2975 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2976 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2977 ((lgrp_loadavg_tolerance) / ncpu) 2978 2979 /* 2980 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2981 * average is above this threshold 2982 */ 2983 uint32_t lgrp_load_thresh = UINT32_MAX; 2984 2985 /* 2986 * lgrp_choose() will try to skip any lgroups with less memory 2987 * than this free when choosing a home lgroup 2988 */ 2989 pgcnt_t lgrp_mem_free_thresh = 0; 2990 2991 /* 2992 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2993 * one based on one of the following policies: 2994 * - Random selection 2995 * - Pseudo round robin placement 2996 * - Longest time since a thread was last placed 2997 */ 2998 #define LGRP_CHOOSE_RANDOM 1 2999 #define LGRP_CHOOSE_RR 2 3000 #define LGRP_CHOOSE_TIME 3 3001 3002 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 3003 3004 /* 3005 * Choose a suitable leaf lgroup for a kthread. 
The kthread is assumed not to 3006 * be bound to a CPU or processor set. 3007 * 3008 * Arguments: 3009 * t The thread 3010 * cpupart The partition the thread belongs to. 3011 * 3012 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3013 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3014 * partitions changing out from under us and assumes that given thread is 3015 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3016 * disabled, so don't grab any locks because we should never block under 3017 * those conditions. 3018 */ 3019 lpl_t * 3020 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 3021 { 3022 lgrp_load_t bestload, bestrload; 3023 int lgrpid_offset, lgrp_count; 3024 lgrp_id_t lgrpid, lgrpid_start; 3025 lpl_t *lpl, *bestlpl, *bestrlpl; 3026 klgrpset_t lgrpset; 3027 proc_t *p; 3028 3029 ASSERT(t != NULL); 3030 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3031 THREAD_LOCK_HELD(t)); 3032 ASSERT(cpupart != NULL); 3033 3034 p = t->t_procp; 3035 3036 /* A process should always be in an active partition */ 3037 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3038 3039 bestlpl = bestrlpl = NULL; 3040 bestload = bestrload = LGRP_LOADAVG_MAX; 3041 lgrpset = cpupart->cp_lgrpset; 3042 3043 switch (lgrp_choose_policy) { 3044 case LGRP_CHOOSE_RR: 3045 lgrpid = cpupart->cp_lgrp_hint; 3046 do { 3047 if (++lgrpid > lgrp_alloc_max) 3048 lgrpid = 0; 3049 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3050 3051 break; 3052 default: 3053 case LGRP_CHOOSE_TIME: 3054 case LGRP_CHOOSE_RANDOM: 3055 klgrpset_nlgrps(lgrpset, lgrp_count); 3056 lgrpid_offset = 3057 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3058 for (lgrpid = 0; ; lgrpid++) { 3059 if (klgrpset_ismember(lgrpset, lgrpid)) { 3060 if (--lgrpid_offset == 0) 3061 break; 3062 } 3063 } 3064 break; 3065 } 3066 3067 lgrpid_start = lgrpid; 3068 3069 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3070 lgrp_id_t, cpupart->cp_lgrp_hint); 3071 3072 /* 3073 * Use lgroup affinities (if any) to choose best lgroup 3074 * 3075 * NOTE: Assumes that thread is protected from going away and its 3076 * lgroup affinities won't change (ie. p_lock, or 3077 * thread_lock() being held and/or CPUs paused) 3078 */ 3079 if (t->t_lgrp_affinity) { 3080 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 3081 if (lpl != NULL) 3082 return (lpl); 3083 } 3084 3085 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3086 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3087 3088 do { 3089 pgcnt_t npgs; 3090 3091 /* 3092 * Skip any lgroups outside of thread's pset 3093 */ 3094 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3095 if (++lgrpid > lgrp_alloc_max) 3096 lgrpid = 0; /* wrap the search */ 3097 continue; 3098 } 3099 3100 /* 3101 * Skip any non-leaf lgroups 3102 */ 3103 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3104 continue; 3105 3106 /* 3107 * Skip any lgroups without enough free memory 3108 * (when threshold set to nonzero positive value) 3109 */ 3110 if (lgrp_mem_free_thresh > 0) { 3111 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3112 if (npgs < lgrp_mem_free_thresh) { 3113 if (++lgrpid > lgrp_alloc_max) 3114 lgrpid = 0; /* wrap the search */ 3115 continue; 3116 } 3117 } 3118 3119 lpl = &cpupart->cp_lgrploads[lgrpid]; 3120 if (klgrpset_isempty(p->p_lgrpset) || 3121 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3122 /* 3123 * Either this is a new process or the process already 3124 * has threads on this lgrp, so this is a preferred 3125 * lgroup for the thread. 
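             *
             * For example, a process whose threads are currently homed
             * to lgroups 1 and 3 has just those bits set in p_lgrpset,
             * so lgroups 1 and 3 compete through bestlpl here, while an
             * equally idle lgroup 2 is only remembered as bestrlpl and
             * must additionally pass the LGRP_EXPAND_PROC_THRESH and
             * LGRP_EXPAND_PROC_DIFF comparisons at the end of this
             * function before the process is spread onto it.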
3126 */ 3127 if (lpl_pick(lpl, bestlpl)) { 3128 bestload = lpl->lpl_loadavg; 3129 bestlpl = lpl; 3130 } 3131 } else { 3132 /* 3133 * The process doesn't have any threads on this lgrp, 3134 * but we're willing to consider this lgrp if the load 3135 * difference is big enough to justify splitting up 3136 * the process' threads. 3137 */ 3138 if (lpl_pick(lpl, bestrlpl)) { 3139 bestrload = lpl->lpl_loadavg; 3140 bestrlpl = lpl; 3141 } 3142 } 3143 if (++lgrpid > lgrp_alloc_max) 3144 lgrpid = 0; /* wrap the search */ 3145 } while (lgrpid != lgrpid_start); 3146 3147 /* 3148 * Return root lgroup if threshold isn't set to maximum value and 3149 * lowest lgroup load average more than a certain threshold 3150 */ 3151 if (lgrp_load_thresh != UINT32_MAX && 3152 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3153 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3154 3155 /* 3156 * If all the lgroups over which the thread's process is spread are 3157 * heavily loaded, we'll consider placing the thread on one of the 3158 * other leaf lgroups in the thread's partition. 3159 */ 3160 if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3161 (bestrload < bestload) && /* paranoid about wraparound */ 3162 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3163 bestload)) { 3164 bestlpl = bestrlpl; 3165 } 3166 3167 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3168 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3169 3170 ASSERT(bestlpl->lpl_ncpu > 0); 3171 return (bestlpl); 3172 } 3173 3174 /* 3175 * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing. 3176 */ 3177 static int 3178 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3179 { 3180 lgrp_load_t l1, l2; 3181 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3182 3183 3184 if (lpl2 == NULL) 3185 return (1); 3186 3187 l1 = lpl1->lpl_loadavg; 3188 l2 = lpl2->lpl_loadavg; 3189 3190 if ((l1 + tolerance < l2) && (l1 < l2)) { 3191 /* lpl1 is significantly less loaded than lpl2 */ 3192 return (1); 3193 } 3194 3195 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3196 l1 + tolerance >= l2 && l1 < l2 && 3197 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3198 /* 3199 * lpl1's load is within the tolerance of lpl2. We're 3200 * willing to consider it be to better however if 3201 * it has been longer since we last homed a thread there 3202 */ 3203 return (1); 3204 } 3205 3206 return (0); 3207 } 3208 3209 /* 3210 * An LWP is expected to be assigned to an lgroup for at least this long 3211 * for its anticipatory load to be justified. NOTE that this value should 3212 * not be set extremely huge (say, larger than 100 years), to avoid problems 3213 * with overflow in the calculation that uses it. 3214 */ 3215 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3216 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3217 3218 /* 3219 * Routine to change a thread's lgroup affiliation. This routine updates 3220 * the thread's kthread_t struct and its process' proc_t struct to note the 3221 * thread's new lgroup affiliation, and its lgroup affinities. 3222 * 3223 * Note that this is the only routine that modifies a thread's t_lpl field, 3224 * and that adds in or removes anticipatory load. 3225 * 3226 * If the thread is exiting, newlpl is NULL. 
3227 * 3228 * Locking: 3229 * The following lock must be held on entry: 3230 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3231 * doesn't get removed from t's partition 3232 * 3233 * This routine is not allowed to grab any locks, since it may be called 3234 * with cpus paused (such as from cpu_offline). 3235 */ 3236 void 3237 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3238 { 3239 proc_t *p; 3240 lpl_t *lpl, *oldlpl; 3241 lgrp_id_t oldid; 3242 kthread_t *tp; 3243 uint_t ncpu; 3244 lgrp_load_t old, new; 3245 3246 ASSERT(t); 3247 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3248 THREAD_LOCK_HELD(t)); 3249 3250 /* 3251 * If not changing lpls, just return 3252 */ 3253 if ((oldlpl = t->t_lpl) == newlpl) 3254 return; 3255 3256 /* 3257 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3258 * associated with process 0 rather than with its original process). 3259 */ 3260 if (t->t_proc_flag & TP_LWPEXIT) { 3261 if (newlpl != NULL) { 3262 t->t_lpl = newlpl; 3263 } 3264 return; 3265 } 3266 3267 p = ttoproc(t); 3268 3269 /* 3270 * If the thread had a previous lgroup, update its process' p_lgrpset 3271 * to account for it being moved from its old lgroup. 3272 */ 3273 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3274 (p->p_tlist != NULL)) { 3275 oldid = oldlpl->lpl_lgrpid; 3276 3277 if (newlpl != NULL) 3278 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3279 3280 if ((do_lgrpset_delete) && 3281 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3282 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3283 /* 3284 * Check if a thread other than the thread 3285 * that's moving is assigned to the same 3286 * lgroup as the thread that's moving. Note 3287 * that we have to compare lgroup IDs, rather 3288 * than simply comparing t_lpl's, since the 3289 * threads may belong to different partitions 3290 * but be assigned to the same lgroup. 3291 */ 3292 ASSERT(tp->t_lpl != NULL); 3293 3294 if ((tp != t) && 3295 (tp->t_lpl->lpl_lgrpid == oldid)) { 3296 /* 3297 * Another thread is assigned to the 3298 * same lgroup as the thread that's 3299 * moving, p_lgrpset doesn't change. 3300 */ 3301 break; 3302 } else if (tp == p->p_tlist) { 3303 /* 3304 * No other thread is assigned to the 3305 * same lgroup as the exiting thread, 3306 * clear the lgroup's bit in p_lgrpset. 3307 */ 3308 klgrpset_del(p->p_lgrpset, oldid); 3309 break; 3310 } 3311 } 3312 } 3313 3314 /* 3315 * If this thread was assigned to its old lgroup for such a 3316 * short amount of time that the anticipatory load that was 3317 * added on its behalf has aged very little, remove that 3318 * anticipatory load. 3319 */ 3320 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3321 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3322 lpl = oldlpl; 3323 for (;;) { 3324 do { 3325 old = new = lpl->lpl_loadavg; 3326 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3327 if (new > old) { 3328 /* 3329 * this can happen if the load 3330 * average was aged since we 3331 * added in the anticipatory 3332 * load 3333 */ 3334 new = 0; 3335 } 3336 } while (cas32( 3337 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3338 new) != old); 3339 3340 lpl = lpl->lpl_parent; 3341 if (lpl == NULL) 3342 break; 3343 3344 ncpu = lpl->lpl_ncpu; 3345 ASSERT(ncpu > 0); 3346 } 3347 } 3348 } 3349 /* 3350 * If the thread has a new lgroup (i.e. it's not exiting), update its 3351 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3352 * to its new lgroup to account for its move to its new lgroup. 
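     *
     * The anticipatory load applied below is LGRP_LOADAVG_MAX_EFFECT(ncpu),
     * i.e. lgrp_loadavg_max_effect divided by the CPU count of each lpl,
     * and it is added at every level from the new leaf up to the root.
     * A 4-CPU leaf therefore absorbs one quarter of the per-thread
     * maximum, its 8-CPU parent one eighth, and so on; the removal code
     * earlier in this routine subtracts the same per-level amounts if the
     * thread moves away again within lgrp_min_nsec.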
3353 */ 3354 if (newlpl != NULL) { 3355 /* 3356 * This thread is moving to a new lgroup 3357 */ 3358 t->t_lpl = newlpl; 3359 3360 /* 3361 * Reflect move in load average of new lgroup 3362 * unless it is root lgroup 3363 */ 3364 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3365 return; 3366 3367 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3368 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3369 } 3370 3371 /* 3372 * It'll take some time for the load on the new lgroup 3373 * to reflect this thread's placement on it. We'd 3374 * like not, however, to have all threads between now 3375 * and then also piling on to this lgroup. To avoid 3376 * this pileup, we anticipate the load this thread 3377 * will generate on its new lgroup. The goal is to 3378 * make the lgroup's load appear as though the thread 3379 * had been there all along. We're very conservative 3380 * in calculating this anticipatory load, we assume 3381 * the worst case case (100% CPU-bound thread). This 3382 * may be modified in the future to be more accurate. 3383 */ 3384 lpl = newlpl; 3385 for (;;) { 3386 ncpu = lpl->lpl_ncpu; 3387 ASSERT(ncpu > 0); 3388 do { 3389 old = new = lpl->lpl_loadavg; 3390 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3391 /* 3392 * Check for overflow 3393 * Underflow not possible here 3394 */ 3395 if (new < old) 3396 new = UINT32_MAX; 3397 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3398 new) != old); 3399 3400 lpl = lpl->lpl_parent; 3401 if (lpl == NULL) 3402 break; 3403 } 3404 t->t_anttime = gethrtime(); 3405 } 3406 } 3407 3408 /* 3409 * Return lgroup memory allocation policy given advice from madvise(3C) 3410 */ 3411 lgrp_mem_policy_t 3412 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3413 { 3414 switch (advice) { 3415 case MADV_ACCESS_LWP: 3416 return (LGRP_MEM_POLICY_NEXT); 3417 case MADV_ACCESS_MANY: 3418 return (LGRP_MEM_POLICY_RANDOM); 3419 default: 3420 return (lgrp_mem_policy_default(size, type)); 3421 } 3422 } 3423 3424 /* 3425 * Figure out default policy 3426 */ 3427 lgrp_mem_policy_t 3428 lgrp_mem_policy_default(size_t size, int type) 3429 { 3430 cpupart_t *cp; 3431 lgrp_mem_policy_t policy; 3432 size_t pset_mem_size; 3433 3434 /* 3435 * Randomly allocate memory across lgroups for shared memory 3436 * beyond a certain threshold 3437 */ 3438 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3439 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3440 /* 3441 * Get total memory size of current thread's pset 3442 */ 3443 kpreempt_disable(); 3444 cp = curthread->t_cpupart; 3445 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3446 kpreempt_enable(); 3447 3448 /* 3449 * Choose policy to randomly allocate memory across 3450 * lgroups in pset if it will fit and is not default 3451 * partition. Otherwise, allocate memory randomly 3452 * across machine. 3453 */ 3454 if (lgrp_mem_pset_aware && size < pset_mem_size) 3455 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3456 else 3457 policy = LGRP_MEM_POLICY_RANDOM; 3458 } else 3459 /* 3460 * Apply default policy for private memory and 3461 * shared memory under the respective random 3462 * threshold. 
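         *
         * In other words (the sizes involved are tunable): a private
         * mapping at or below lgrp_privm_random_thresh and a MAP_SHARED
         * segment at or below lgrp_shm_random_thresh both fall through
         * to lgrp_mem_default_policy here, while larger segments are
         * spread randomly, across the thread's pset when
         * lgrp_mem_pset_aware is set and the segment fits in the pset's
         * memory, or across the whole machine otherwise.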
3463 */ 3464 policy = lgrp_mem_default_policy; 3465 3466 return (policy); 3467 } 3468 3469 /* 3470 * Get memory allocation policy for this segment 3471 */ 3472 lgrp_mem_policy_info_t * 3473 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3474 { 3475 lgrp_mem_policy_info_t *policy_info; 3476 extern struct seg_ops segspt_ops; 3477 extern struct seg_ops segspt_shmops; 3478 3479 /* 3480 * This is for binary compatibility to protect against third party 3481 * segment drivers which haven't recompiled to allow for 3482 * SEGOP_GETPOLICY() 3483 */ 3484 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3485 seg->s_ops != &segspt_shmops) 3486 return (NULL); 3487 3488 policy_info = NULL; 3489 if (seg->s_ops->getpolicy != NULL) 3490 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3491 3492 return (policy_info); 3493 } 3494 3495 /* 3496 * Set policy for allocating private memory given desired policy, policy info, 3497 * size in bytes of memory that policy is being applied. 3498 * Return 0 if policy wasn't set already and 1 if policy was set already 3499 */ 3500 int 3501 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3502 lgrp_mem_policy_info_t *policy_info, size_t size) 3503 { 3504 3505 ASSERT(policy_info != NULL); 3506 3507 if (policy == LGRP_MEM_POLICY_DEFAULT) 3508 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3509 3510 /* 3511 * Policy set already? 3512 */ 3513 if (policy == policy_info->mem_policy) 3514 return (1); 3515 3516 /* 3517 * Set policy 3518 */ 3519 policy_info->mem_policy = policy; 3520 policy_info->mem_reserved = 0; 3521 3522 return (0); 3523 } 3524 3525 3526 /* 3527 * Get shared memory allocation policy with given tree and offset 3528 */ 3529 lgrp_mem_policy_info_t * 3530 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3531 u_offset_t vn_off) 3532 { 3533 u_offset_t off; 3534 lgrp_mem_policy_info_t *policy_info; 3535 lgrp_shm_policy_seg_t *policy_seg; 3536 lgrp_shm_locality_t *shm_locality; 3537 avl_tree_t *tree; 3538 avl_index_t where; 3539 3540 /* 3541 * Get policy segment tree from anon_map or vnode and use specified 3542 * anon index or vnode offset as offset 3543 * 3544 * Assume that no lock needs to be held on anon_map or vnode, since 3545 * they should be protected by their reference count which must be 3546 * nonzero for an existing segment 3547 */ 3548 if (amp) { 3549 ASSERT(amp->refcnt != 0); 3550 shm_locality = amp->locality; 3551 if (shm_locality == NULL) 3552 return (NULL); 3553 tree = shm_locality->loc_tree; 3554 off = ptob(anon_index); 3555 } else if (vp) { 3556 shm_locality = vp->v_locality; 3557 if (shm_locality == NULL) 3558 return (NULL); 3559 ASSERT(shm_locality->loc_count != 0); 3560 tree = shm_locality->loc_tree; 3561 off = vn_off; 3562 } 3563 3564 if (tree == NULL) 3565 return (NULL); 3566 3567 /* 3568 * Lookup policy segment for offset into shared object and return 3569 * policy info 3570 */ 3571 rw_enter(&shm_locality->loc_lock, RW_READER); 3572 policy_info = NULL; 3573 policy_seg = avl_find(tree, &off, &where); 3574 if (policy_seg) 3575 policy_info = &policy_seg->shm_policy; 3576 rw_exit(&shm_locality->loc_lock); 3577 3578 return (policy_info); 3579 } 3580 3581 /* 3582 * Return lgroup to use for allocating memory 3583 * given the segment and address 3584 * 3585 * There isn't any mutual exclusion that exists between calls 3586 * to this routine and DR, so this routine and whomever calls it 3587 * should be mindful of the possibility that the lgrp returned 3588 * may be deleted. 
If this happens, dereferences of the lgrp 3589 * pointer will still be safe, but the resources in the lgrp will 3590 * be gone, and LGRP_EXISTS() will no longer be true. 3591 */ 3592 lgrp_t * 3593 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3594 { 3595 int i; 3596 lgrp_t *lgrp; 3597 klgrpset_t lgrpset; 3598 int lgrps_spanned; 3599 unsigned long off; 3600 lgrp_mem_policy_t policy; 3601 lgrp_mem_policy_info_t *policy_info; 3602 ushort_t random; 3603 int stat = 0; 3604 3605 /* 3606 * Just return null if the lgrp framework hasn't finished 3607 * initializing or if this is a UMA machine. 3608 */ 3609 if (nlgrps == 1 || !lgrp_initialized) 3610 return (lgrp_root); 3611 3612 /* 3613 * Get memory allocation policy for this segment 3614 */ 3615 policy = lgrp_mem_default_policy; 3616 if (seg != NULL) { 3617 if (seg->s_as == &kas) { 3618 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3619 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3620 policy = LGRP_MEM_POLICY_RANDOM; 3621 } else { 3622 policy_info = lgrp_mem_policy_get(seg, vaddr); 3623 if (policy_info != NULL) 3624 policy = policy_info->mem_policy; 3625 } 3626 } 3627 lgrpset = 0; 3628 3629 /* 3630 * Initialize lgroup to home by default 3631 */ 3632 lgrp = lgrp_home_lgrp(); 3633 3634 /* 3635 * When homing threads on root lgrp, override default memory 3636 * allocation policies with root lgroup memory allocation policy 3637 */ 3638 if (lgrp == lgrp_root) 3639 policy = lgrp_mem_policy_root; 3640 3641 /* 3642 * Implement policy 3643 */ 3644 switch (policy) { 3645 case LGRP_MEM_POLICY_NEXT_CPU: 3646 3647 /* 3648 * Return lgroup of current CPU which faulted on memory 3649 * If the CPU isn't currently in an lgrp, then opt to 3650 * allocate from the root. 3651 * 3652 * Kernel preemption needs to be disabled here to prevent 3653 * the current CPU from going away before lgrp is found. 3654 */ 3655 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3656 lgrp = lgrp_root; 3657 } else { 3658 kpreempt_disable(); 3659 lgrp = lgrp_cpu_to_lgrp(CPU); 3660 kpreempt_enable(); 3661 } 3662 break; 3663 3664 case LGRP_MEM_POLICY_NEXT: 3665 case LGRP_MEM_POLICY_DEFAULT: 3666 default: 3667 3668 /* 3669 * Just return current thread's home lgroup 3670 * for default policy (next touch) 3671 * If the thread is homed to the root, 3672 * then the default policy is random across lgroups. 3673 * Fallthrough to the random case. 3674 */ 3675 if (lgrp != lgrp_root) { 3676 if (policy == LGRP_MEM_POLICY_NEXT) 3677 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3678 else 3679 lgrp_stat_add(lgrp->lgrp_id, 3680 LGRP_NUM_DEFAULT, 1); 3681 break; 3682 } 3683 /* LINTED fallthrough on case statement */ 3684 case LGRP_MEM_POLICY_RANDOM: 3685 3686 /* 3687 * Return a random leaf lgroup with memory 3688 */ 3689 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3690 /* 3691 * Count how many lgroups are spanned 3692 */ 3693 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3694 3695 /* 3696 * There may be no memnodes in the root lgroup during DR copy 3697 * rename on a system with only two boards (memnodes) 3698 * configured. In this case just return the root lgrp. 
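         *
         * When the set is non-empty, the loop below simply returns the
         * off-th lgroup that has memory: with memory in lgroups 0, 2
         * and 5 (a made-up layout) and off == 1, lgroup 0 consumes the
         * offset and lgroup 2 is chosen.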
3699 */ 3700 if (lgrps_spanned == 0) { 3701 lgrp = lgrp_root; 3702 break; 3703 } 3704 3705 /* 3706 * Pick a random offset within lgroups spanned 3707 * and return lgroup at that offset 3708 */ 3709 random = (ushort_t)gethrtime() >> 4; 3710 off = random % lgrps_spanned; 3711 ASSERT(off <= lgrp_alloc_max); 3712 3713 for (i = 0; i <= lgrp_alloc_max; i++) { 3714 if (!klgrpset_ismember(lgrpset, i)) 3715 continue; 3716 if (off) 3717 off--; 3718 else { 3719 lgrp = lgrp_table[i]; 3720 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3721 1); 3722 break; 3723 } 3724 } 3725 break; 3726 3727 case LGRP_MEM_POLICY_RANDOM_PROC: 3728 3729 /* 3730 * Grab copy of bitmask of lgroups spanned by 3731 * this process 3732 */ 3733 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3734 stat = LGRP_NUM_RANDOM_PROC; 3735 3736 /* LINTED fallthrough on case statement */ 3737 case LGRP_MEM_POLICY_RANDOM_PSET: 3738 3739 if (!stat) 3740 stat = LGRP_NUM_RANDOM_PSET; 3741 3742 if (klgrpset_isempty(lgrpset)) { 3743 /* 3744 * Grab copy of bitmask of lgroups spanned by 3745 * this processor set 3746 */ 3747 kpreempt_disable(); 3748 klgrpset_copy(lgrpset, 3749 curthread->t_cpupart->cp_lgrpset); 3750 kpreempt_enable(); 3751 } 3752 3753 /* 3754 * Count how many lgroups are spanned 3755 */ 3756 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3757 ASSERT(lgrps_spanned <= nlgrps); 3758 3759 /* 3760 * Probably lgrps_spanned should be always non-zero, but to be 3761 * on the safe side we return lgrp_root if it is empty. 3762 */ 3763 if (lgrps_spanned == 0) { 3764 lgrp = lgrp_root; 3765 break; 3766 } 3767 3768 /* 3769 * Pick a random offset within lgroups spanned 3770 * and return lgroup at that offset 3771 */ 3772 random = (ushort_t)gethrtime() >> 4; 3773 off = random % lgrps_spanned; 3774 ASSERT(off <= lgrp_alloc_max); 3775 3776 for (i = 0; i <= lgrp_alloc_max; i++) { 3777 if (!klgrpset_ismember(lgrpset, i)) 3778 continue; 3779 if (off) 3780 off--; 3781 else { 3782 lgrp = lgrp_table[i]; 3783 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3784 1); 3785 break; 3786 } 3787 } 3788 break; 3789 3790 case LGRP_MEM_POLICY_ROUNDROBIN: 3791 3792 /* 3793 * Use offset within segment to determine 3794 * offset from home lgroup to choose for 3795 * next lgroup to allocate memory from 3796 */ 3797 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3798 (lgrp_alloc_max + 1); 3799 3800 kpreempt_disable(); 3801 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3802 i = lgrp->lgrp_id; 3803 kpreempt_enable(); 3804 3805 while (off > 0) { 3806 i = (i + 1) % (lgrp_alloc_max + 1); 3807 lgrp = lgrp_table[i]; 3808 if (klgrpset_ismember(lgrpset, i)) 3809 off--; 3810 } 3811 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3812 3813 break; 3814 } 3815 3816 ASSERT(lgrp != NULL); 3817 return (lgrp); 3818 } 3819 3820 /* 3821 * Return the number of pages in an lgroup 3822 * 3823 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3824 * could cause tests that rely on the numat driver to fail.... 
3825 */ 3826 pgcnt_t 3827 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3828 { 3829 lgrp_t *lgrp; 3830 3831 lgrp = lgrp_table[lgrpid]; 3832 if (!LGRP_EXISTS(lgrp) || 3833 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3834 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3835 return (0); 3836 3837 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3838 } 3839 3840 /* 3841 * Initialize lgroup shared memory allocation policy support 3842 */ 3843 void 3844 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3845 { 3846 lgrp_shm_locality_t *shm_locality; 3847 3848 /* 3849 * Initialize locality field in anon_map 3850 * Don't need any locks because this is called when anon_map is 3851 * allocated, but not used anywhere yet. 3852 */ 3853 if (amp) { 3854 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3855 if (amp->locality == NULL) { 3856 /* 3857 * Allocate and initialize shared memory locality info 3858 * and set anon_map locality pointer to it 3859 * Drop lock across kmem_alloc(KM_SLEEP) 3860 */ 3861 ANON_LOCK_EXIT(&->a_rwlock); 3862 shm_locality = kmem_alloc(sizeof (*shm_locality), 3863 KM_SLEEP); 3864 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3865 NULL); 3866 shm_locality->loc_count = 1; /* not used for amp */ 3867 shm_locality->loc_tree = NULL; 3868 3869 /* 3870 * Reacquire lock and check to see whether anyone beat 3871 * us to initializing the locality info 3872 */ 3873 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3874 if (amp->locality != NULL) { 3875 rw_destroy(&shm_locality->loc_lock); 3876 kmem_free(shm_locality, 3877 sizeof (*shm_locality)); 3878 } else 3879 amp->locality = shm_locality; 3880 } 3881 ANON_LOCK_EXIT(&->a_rwlock); 3882 return; 3883 } 3884 3885 /* 3886 * Allocate shared vnode policy info if vnode is not locality aware yet 3887 */ 3888 mutex_enter(&vp->v_lock); 3889 if ((vp->v_flag & V_LOCALITY) == 0) { 3890 /* 3891 * Allocate and initialize shared memory locality info 3892 */ 3893 mutex_exit(&vp->v_lock); 3894 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3895 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3896 shm_locality->loc_count = 1; 3897 shm_locality->loc_tree = NULL; 3898 3899 /* 3900 * Point vnode locality field at shared vnode policy info 3901 * and set locality aware flag in vnode 3902 */ 3903 mutex_enter(&vp->v_lock); 3904 if ((vp->v_flag & V_LOCALITY) == 0) { 3905 vp->v_locality = shm_locality; 3906 vp->v_flag |= V_LOCALITY; 3907 } else { 3908 /* 3909 * Lost race so free locality info and increment count. 
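             *
             * This is the usual drop-the-lock-to-allocate dance: v_lock
             * is released across the KM_SLEEP allocation above, so two
             * threads can both get here with their own shm_locality.
             * Whichever one sets V_LOCALITY first wins; the loser takes
             * this path, throws its own allocation away and simply adds
             * a reference to the winner's structure.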
/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
    lgrp_shm_locality_t *shm_locality;

    /*
     * Initialize locality field in anon_map
     * Don't need any locks because this is called when anon_map is
     * allocated, but not used anywhere yet.
     */
    if (amp) {
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
        if (amp->locality == NULL) {
            /*
             * Allocate and initialize shared memory locality info
             * and set anon_map locality pointer to it
             * Drop lock across kmem_alloc(KM_SLEEP)
             */
            ANON_LOCK_EXIT(&amp->a_rwlock);
            shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
            rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
            shm_locality->loc_count = 1;    /* not used for amp */
            shm_locality->loc_tree = NULL;

            /*
             * Reacquire lock and check to see whether anyone beat
             * us to initializing the locality info
             */
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
            if (amp->locality != NULL) {
                rw_destroy(&shm_locality->loc_lock);
                kmem_free(shm_locality, sizeof (*shm_locality));
            } else
                amp->locality = shm_locality;
        }
        ANON_LOCK_EXIT(&amp->a_rwlock);
        return;
    }

    /*
     * Allocate shared vnode policy info if vnode is not locality aware yet
     */
    mutex_enter(&vp->v_lock);
    if ((vp->v_flag & V_LOCALITY) == 0) {
        /*
         * Allocate and initialize shared memory locality info
         */
        mutex_exit(&vp->v_lock);
        shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
        rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
        shm_locality->loc_count = 1;
        shm_locality->loc_tree = NULL;

        /*
         * Point vnode locality field at shared vnode policy info
         * and set locality aware flag in vnode
         */
        mutex_enter(&vp->v_lock);
        if ((vp->v_flag & V_LOCALITY) == 0) {
            vp->v_locality = shm_locality;
            vp->v_flag |= V_LOCALITY;
        } else {
            /*
             * Lost the race, so free our locality info and take a
             * reference on the winner's instead.
             */
            rw_destroy(&shm_locality->loc_lock);
            kmem_free(shm_locality, sizeof (*shm_locality));
            shm_locality = vp->v_locality;
            shm_locality->loc_count++;
        }
        mutex_exit(&vp->v_lock);

        return;
    }

    /*
     * Increment reference count of number of segments mapping this vnode
     * shared
     */
    shm_locality = vp->v_locality;
    shm_locality->loc_count++;
    mutex_exit(&vp->v_lock);
}

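/*
 * Illustrative sketch, not part of the original code and kept out of the
 * build with #if 0: a hypothetical consumer pairs lgrp_shm_policy_init()
 * with lgrp_shm_policy_fini() (defined below) around the lifetime of a
 * shared mapping.  For a vnode the locality info is reference counted, one
 * count per init call; for an anon_map it is simply attached here and torn
 * down in lgrp_shm_policy_fini().
 */
#if 0
static void
shared_vnode_mapping_example(vnode_t *vp)
{
    /* a segment starts mapping vp MAP_SHARED */
    lgrp_shm_policy_init(NULL, vp);

    /* ... lgrp_shm_policy_set() may be applied to ranges of vp ... */

    /* the segment is being unmapped */
    lgrp_shm_policy_fini(NULL, vp);
}
#endif
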
/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
    lgrp_shm_policy_seg_t *cur;
    lgrp_shm_policy_seg_t *next;

    if (tree == NULL)
        return;

    cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
    while (cur != NULL) {
        next = AVL_NEXT(tree, cur);
        avl_remove(tree, cur);
        kmem_free(cur, sizeof (*cur));
        cur = next;
    }
    kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
    lgrp_shm_locality_t *shm_locality;

    /*
     * For anon_map, deallocate shared memory policy tree and
     * zero locality field
     * Don't need any locks because anon_map is being freed
     */
    if (amp) {
        if (amp->locality == NULL)
            return;
        shm_locality = amp->locality;
        shm_locality->loc_count = 0;    /* not really used for amp */
        rw_destroy(&shm_locality->loc_lock);
        lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
        kmem_free(shm_locality, sizeof (*shm_locality));
        amp->locality = 0;
        return;
    }

    /*
     * For vnode, decrement reference count of segments mapping this vnode
     * shared and delete locality info if reference count drops to 0
     */
    mutex_enter(&vp->v_lock);
    shm_locality = vp->v_locality;
    shm_locality->loc_count--;

    if (shm_locality->loc_count == 0) {
        rw_destroy(&shm_locality->loc_lock);
        lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
        kmem_free(shm_locality, sizeof (*shm_locality));
        vp->v_locality = 0;
        vp->v_flag &= ~V_LOCALITY;
    }
    mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
    lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
    lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

    if (a->shm_off < b->shm_off)
        return (-1);
    if (a->shm_off >= b->shm_off + b->shm_size)
        return (1);
    return (0);
}

/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
    if (!seg1 || !seg2 ||
        seg1->shm_off + seg1->shm_size != seg2->shm_off ||
        seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
        return (-1);

    seg1->shm_size += seg2->shm_size;
    avl_remove(tree, seg2);
    kmem_free(seg2, sizeof (*seg2));
    return (0);
}

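/*
 * Illustrative note, not part of the original code: lgrp_shm_policy_compar()
 * implements interval semantics rather than strict key equality: it only
 * reads shm_off from the search key and reports "equal" whenever that offset
 * falls within [shm_off, shm_off + shm_size) of the tree segment it is being
 * compared against.  With hypothetical segments A = [0, 8K) and B = [8K, 16K):
 *
 *	offset 4K:  compar(&key, &A) == 0	(A contains 4K)
 *	offset 8K:  compar(&key, &A) == 1	(8K is past the end of A)
 *	            compar(&key, &B) == 0	(B contains 8K)
 *
 * This is what lets lgrp_shm_policy_split() and lgrp_shm_policy_set() below
 * call avl_find(tree, &off, &where) with a bare u_offset_t as the search key,
 * which works as long as shm_off is the first field of the segment.
 */
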
/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
    lgrp_shm_policy_seg_t *newseg;
    avl_index_t where;

    ASSERT(seg != NULL);
    ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

    if (!seg || off < seg->shm_off || off > seg->shm_off + seg->shm_size)
        return (NULL);

    if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
        return (seg);

    /*
     * Adjust size of left segment and allocate new (right) segment
     */
    newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
    newseg->shm_policy = seg->shm_policy;
    newseg->shm_off = off;
    newseg->shm_size = seg->shm_size - (off - seg->shm_off);
    seg->shm_size = off - seg->shm_off;

    /*
     * Find where to insert new segment in AVL tree and insert it
     */
    (void) avl_find(tree, &off, &where);
    avl_insert(tree, newseg, where);

    return (newseg);
}

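/*
 * Illustrative sketch, not part of the original code and kept out of the
 * build with #if 0: a split followed by a concatenation of the two halves
 * restores the original segment, since both halves carry the same policy.
 * Splitting exactly at an existing boundary is a no-op that returns the
 * segment unchanged.  The hypothetical helper assumes "seg" covers [0, 16K).
 */
#if 0
static void
split_concat_example(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg)
{
    lgrp_shm_policy_seg_t *right;

    right = lgrp_shm_policy_split(tree, seg, 0x1000);
    /* seg now covers [0, 4K) and right covers [4K, 16K), same policy */

    (void) lgrp_shm_policy_concat(tree, seg, right);
    /* right has been freed; seg covers [0, 16K) once more */
}
#endif
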
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if the policy wasn't set already, 1 if it was set already, and
 * -1 if the policy can't be set.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
    u_offset_t eoff;
    lgrp_shm_policy_seg_t *next;
    lgrp_shm_policy_seg_t *newseg;
    u_offset_t off;
    u_offset_t oldeoff;
    lgrp_shm_policy_seg_t *prev;
    int retval;
    lgrp_shm_policy_seg_t *seg;
    lgrp_shm_locality_t *shm_locality;
    avl_tree_t *tree;
    avl_index_t where;

    ASSERT(amp || vp);
    ASSERT((len & PAGEOFFSET) == 0);

    if (len == 0)
        return (-1);

    retval = 0;

    /*
     * Get locality info and starting offset into shared object
     * Try anon map first and then vnode
     * Assume that no locks need to be held on anon_map or vnode, since
     * it should be protected by its reference count which must be nonzero
     * for an existing segment.
     */
    if (amp) {
        /*
         * Get policy info from anon_map
         */
        ASSERT(amp->refcnt != 0);
        if (amp->locality == NULL)
            lgrp_shm_policy_init(amp, NULL);
        shm_locality = amp->locality;
        off = ptob(anon_index);
    } else if (vp) {
        /*
         * Get policy info from vnode
         */
        if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
            lgrp_shm_policy_init(NULL, vp);
        shm_locality = vp->v_locality;
        ASSERT(shm_locality->loc_count != 0);
        off = vn_off;
    } else
        return (-1);

    ASSERT((off & PAGEOFFSET) == 0);

    /*
     * Figure out default policy
     */
    if (policy == LGRP_MEM_POLICY_DEFAULT)
        policy = lgrp_mem_policy_default(len, MAP_SHARED);

    /*
     * Create AVL tree if there isn't one yet
     * and set locality field to point at it
     */
    rw_enter(&shm_locality->loc_lock, RW_WRITER);
    tree = shm_locality->loc_tree;
    if (!tree) {
        rw_exit(&shm_locality->loc_lock);

        tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

        rw_enter(&shm_locality->loc_lock, RW_WRITER);
        if (shm_locality->loc_tree == NULL) {
            avl_create(tree, lgrp_shm_policy_compar,
                sizeof (lgrp_shm_policy_seg_t),
                offsetof(lgrp_shm_policy_seg_t, shm_tree));
            shm_locality->loc_tree = tree;
        } else {
            /*
             * Another thread managed to set up the tree
             * before we could. Free the tree we allocated
             * and use the one that's already there.
             */
            kmem_free(tree, sizeof (*tree));
            tree = shm_locality->loc_tree;
        }
    }

    /*
     * Set policy
     *
     * Need to maintain hold on writer's lock to keep tree from
     * changing out from under us
     */
    while (len != 0) {
        /*
         * Find policy segment for specified offset into shared object
         */
        seg = avl_find(tree, &off, &where);

        /*
         * Didn't find any existing segment that contains specified
         * offset, so allocate new segment, insert it, and concatenate
         * with adjacent segments if possible
         */
        if (seg == NULL) {
            newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
            newseg->shm_policy.mem_policy = policy;
            newseg->shm_policy.mem_reserved = 0;
            newseg->shm_off = off;
            avl_insert(tree, newseg, where);

            /*
             * Check to see whether new segment overlaps with next
             * one, set length of new segment accordingly, and
             * calculate remaining length and next offset
             */
            seg = AVL_NEXT(tree, newseg);
            if (seg == NULL || off + len <= seg->shm_off) {
                newseg->shm_size = len;
                len = 0;
            } else {
                newseg->shm_size = seg->shm_off - off;
                off = seg->shm_off;
                len -= newseg->shm_size;
            }

            /*
             * Try to concatenate new segment with next and
             * previous ones, since they might have the same policy
             * now.  Grab previous and next segments first because
             * they will change on concatenation.
             */
            prev = AVL_PREV(tree, newseg);
            next = AVL_NEXT(tree, newseg);
            (void) lgrp_shm_policy_concat(tree, newseg, next);
            (void) lgrp_shm_policy_concat(tree, prev, newseg);

            continue;
        }

        eoff = off + len;
        oldeoff = seg->shm_off + seg->shm_size;

        /*
         * Policy set already?
         */
        if (policy == seg->shm_policy.mem_policy) {
            /*
             * Nothing left to do if offset and length
             * fall within this segment
             */
            if (eoff <= oldeoff) {
                retval = 1;
                break;
            } else {
                len = eoff - oldeoff;
                off = oldeoff;
                continue;
            }
        }

        /*
         * Specified offset and length match existing segment exactly
         */
        if (off == seg->shm_off && len == seg->shm_size) {
            /*
             * Set policy and update current length
             */
            seg->shm_policy.mem_policy = policy;
            seg->shm_policy.mem_reserved = 0;
            len = 0;

            /*
             * Try concatenating new segment with previous and next
             * segments, since they might have the same policy now.
             * Grab previous and next segments first because they
             * will change on concatenation.
             */
            prev = AVL_PREV(tree, seg);
            next = AVL_NEXT(tree, seg);
            (void) lgrp_shm_policy_concat(tree, seg, next);
            (void) lgrp_shm_policy_concat(tree, prev, seg);
        } else {
            /*
             * Specified offset and length only apply to part of
             * existing segment
             */

            /*
             * New segment starts in middle of old one, so split
             * new one off near beginning of old one
             */
            newseg = NULL;
            if (off > seg->shm_off) {
                newseg = lgrp_shm_policy_split(tree, seg, off);

                /*
                 * New segment ends where old one did, so try
                 * to concatenate with next segment
                 */
                if (eoff == oldeoff) {
                    newseg->shm_policy.mem_policy = policy;
                    newseg->shm_policy.mem_reserved = 0;
                    (void) lgrp_shm_policy_concat(tree,
                        newseg, AVL_NEXT(tree, newseg));
                    break;
                }
            }

            /*
             * New segment ends before old one, so split off end of
             * old one
             */
            if (eoff < oldeoff) {
                if (newseg) {
                    (void) lgrp_shm_policy_split(tree,
                        newseg, eoff);
                    newseg->shm_policy.mem_policy = policy;
                    newseg->shm_policy.mem_reserved = 0;
                } else {
                    (void) lgrp_shm_policy_split(tree, seg,
                        eoff);
                    seg->shm_policy.mem_policy = policy;
                    seg->shm_policy.mem_reserved = 0;
                }

                if (off == seg->shm_off)
                    (void) lgrp_shm_policy_concat(tree,
                        AVL_PREV(tree, seg), seg);
                break;
            }

            /*
             * Calculate remaining length and next offset
             */
            len = eoff - oldeoff;
            off = oldeoff;
        }
    }

    rw_exit(&shm_locality->loc_lock);
    return (retval);
}

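/*
 * Illustrative sketch, not part of the original code and kept out of the
 * build with #if 0: a hypothetical caller applies a policy to one
 * page-aligned range of a shared anon_map (vnode callers pass vp/vn_off
 * instead of amp/anon_index).  LGRP_MEM_POLICY_RANDOM is used here only as
 * an example value.
 */
#if 0
static void
set_random_policy_example(struct anon_map *amp, ulong_t anon_index, size_t len)
{
    int already;

    ASSERT((len & PAGEOFFSET) == 0);
    already = lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp, anon_index,
        NULL, 0, len);
    if (already == -1)
        cmn_err(CE_WARN, "example: policy could not be set");
}
#endif
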
/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
    lgrp_t *lp = c->lmc_lgrp;
    mnodeset_t nodes = c->lmc_nodes;
    int cnt = c->lmc_cnt;
    int offset, mnode;

    extern int max_mem_nodes;

    /*
     * If the set is empty, and the caller is willing, traverse
     * up the hierarchy until we find a non-empty set.
     */
    while (nodes == (mnodeset_t)0 || cnt <= 0) {
        if (c->lmc_scope == LGRP_SRCH_LOCAL ||
            ((lp = lp->lgrp_parent) == NULL))
            return (-1);

        nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
        cnt = lp->lgrp_nmnodes - c->lmc_ntried;
    }

    /*
     * Select a memnode by picking one at a "random" offset.
     * Because of DR, memnodes can come and go at any time.
     * This code must be able to cope with the possibility
     * that the nodes count "cnt" is inconsistent with respect
     * to the number of elements actually in "nodes", and
     * therefore that the offset chosen could be greater than
     * the number of elements in the set (some memnodes may
     * have disappeared just before cnt was read).
     * If this happens, the search simply wraps back to the
     * beginning of the set.
     */
    ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
    offset = c->lmc_rand % cnt;
    do {
        for (mnode = 0; mnode < max_mem_nodes; mnode++)
            if (nodes & ((mnodeset_t)1 << mnode))
                if (!offset--)
                    break;
    } while (mnode >= max_mem_nodes);

    /* Found a node. Store state before returning. */
    c->lmc_lgrp = lp;
    c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
    c->lmc_cnt = cnt - 1;
    c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
    c->lmc_ntried++;

    return (mnode);
}

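/*
 * Illustrative sketch, not part of the original code and kept out of the
 * build with #if 0: a hypothetical caller walks every memnode reachable from
 * an lgroup by re-calling lgrp_memnode_choose() with the same stack-resident
 * cookie until it returns -1.  This assumes the LGRP_MNODE_COOKIE_INIT()
 * initializer and the LGRP_SRCH_HIER scope from <sys/lgrp.h>; if those are
 * unavailable, zero the cookie and fill in the fields read above instead.
 * With LGRP_SRCH_LOCAL the walk stops after the lgroup's own memnodes rather
 * than climbing the hierarchy.
 */
#if 0
static void
visit_memnodes_example(lgrp_t *lgrp)
{
    lgrp_mnode_cookie_t c;
    int mnode;

    LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
    while ((mnode = lgrp_memnode_choose(&c)) != -1) {
        /* allocate from, or otherwise inspect, memnode "mnode" */
    }
}
#endif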