/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */
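
/*
 * Illustrative sketch (not compiled; the helper name is hypothetical):
 * a consumer of this interface could walk from a thread's home lgroup up
 * toward the root via lgrp_parent, which is the same closest-to-farthest
 * order the kernel uses when it falls back to other lgroups for resources.
 */
#if 0
static void
example_walk_home_to_root(void)
{
	lgrp_t	*lgrp;

	/* Start at the current thread's home (leaf) lgroup. */
	lgrp = lgrp_home_lgrp();

	/* Each parent is the next (farther) level of locality. */
	while (lgrp != NULL) {
		cmn_err(CE_CONT, "lgrp %d latency %d\n",
		    (int)lgrp->lgrp_id, lgrp->lgrp_latency);
		lgrp = lgrp->lgrp_parent;
	}
}
#endif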

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/chip.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
153 */ 154 #define LPL_BOOTSTRAP_SIZE 2 155 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 156 lpl_t *lpl_bootstrap; 157 158 static lgrp_t lroot; 159 160 161 /* 162 * Size, in bytes, beyond which random memory allocation policy is applied 163 * to non-shared memory. Default is the maximum size, so random memory 164 * allocation won't be used for non-shared memory by default. 165 */ 166 size_t lgrp_privm_random_thresh = (size_t)(-1); 167 168 /* 169 * Size, in bytes, beyond which random memory allocation policy is applied to 170 * shared memory. Default is 8MB (2 ISM pages). 171 */ 172 size_t lgrp_shm_random_thresh = 8*1024*1024; 173 174 /* 175 * Whether to do processor set aware memory allocation by default 176 */ 177 int lgrp_mem_pset_aware = 0; 178 179 /* 180 * Set the default memory allocation policy for root lgroup 181 */ 182 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 183 184 /* 185 * Set the default memory allocation policy. For most platforms, 186 * next touch is sufficient, but some platforms may wish to override 187 * this. 188 */ 189 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 190 191 192 /* 193 * lgroup CPU event handlers 194 */ 195 static void lgrp_cpu_init(struct cpu *); 196 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 197 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 198 199 static void lgrp_latency_change(u_longlong_t, u_longlong_t); 200 201 /* 202 * lgroup memory event handlers 203 */ 204 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 205 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 206 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 207 208 /* 209 * lgroup CPU partition event handlers 210 */ 211 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 212 static void lgrp_part_del_cpu(struct cpu *); 213 214 static void lgrp_root_init(void); 215 216 /* 217 * lpl topology 218 */ 219 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 220 static void lpl_clear(lpl_t *); 221 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 222 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 223 static void lpl_rset_add(lpl_t *, lpl_t *); 224 static void lpl_rset_del(lpl_t *, lpl_t *); 225 static int lpl_rset_contains(lpl_t *, lpl_t *); 226 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 227 static void lpl_child_update(lpl_t *, struct cpupart *); 228 static int lpl_pick(lpl_t *, lpl_t *); 229 static void lpl_verify_wrapper(struct cpupart *); 230 231 /* 232 * defines for lpl topology verifier return codes 233 */ 234 235 #define LPL_TOPO_CORRECT 0 236 #define LPL_TOPO_PART_HAS_NO_LPL -1 237 #define LPL_TOPO_CPUS_NOT_EMPTY -2 238 #define LPL_TOPO_LGRP_MISMATCH -3 239 #define LPL_TOPO_MISSING_PARENT -4 240 #define LPL_TOPO_PARENT_MISMATCH -5 241 #define LPL_TOPO_BAD_CPUCNT -6 242 #define LPL_TOPO_RSET_MISMATCH -7 243 #define LPL_TOPO_LPL_ORPHANED -8 244 #define LPL_TOPO_LPL_BAD_NCPU -9 245 #define LPL_TOPO_RSET_MSSNG_LF -10 246 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 247 #define LPL_TOPO_BOGUS_HINT -12 248 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 249 #define LPL_TOPO_LGRP_NOT_LEAF -14 250 #define LPL_TOPO_BAD_RSETCNT -15 251 252 /* 253 * Return whether lgroup optimizations should be enabled on this system 254 */ 255 int 256 lgrp_optimizations(void) 257 { 258 /* 259 * System must have more than 2 lgroups to enable lgroup optimizations 260 * 261 * XXX This assumes that a 2 lgroup system has an empty root lgroup 262 * with one child lgroup containing all the resources. 
A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	if (nlgrps > 2)
		return (1);

	return (0);
}

/*
 * Build full lgroup topology
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_chips = NULL;
	lgrp_root->lgrp_chipcnt = 0;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized, at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}
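
/*
 * Illustrative sketch (not compiled): just as lgrp_setup() above reports
 * cpu0 with LGRP_CONFIG_CPU_ADD/ONLINE, a platform DR path that adds a
 * memory node would inform the lgroup framework through lgrp_config().
 * The mnode and handle arguments below are hypothetical placeholders.
 */
#if 0
static void
example_report_memory_add(int mnode, lgrp_handle_t hand)
{
	/* "resource" is the mnode, "where" is the owning platform handle */
	lgrp_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)mnode, (uintptr_t)hand);
}
#endif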

/*
 * Lgroup initialization is split in two parts. The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized. The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * true when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUs are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
481 */ 482 case LGRP_CONFIG_CPU_ADD: 483 lgrp_plat_config(event, resource); 484 atomic_add_32(&lgrp_gen, 1); 485 486 break; 487 case LGRP_CONFIG_CPU_DEL: 488 lgrp_plat_config(event, resource); 489 atomic_add_32(&lgrp_gen, 1); 490 491 break; 492 case LGRP_CONFIG_CPU_ONLINE: 493 cp = (cpu_t *)resource; 494 lgrp_cpu_init(cp); 495 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 496 rc = lpl_topo_verify(cp->cpu_part); 497 if (rc != LPL_TOPO_CORRECT) { 498 panic("lpl_topo_verify failed: %d", rc); 499 } 500 lgrp_plat_config(event, resource); 501 atomic_add_32(&lgrp_gen, 1); 502 503 break; 504 case LGRP_CONFIG_CPU_OFFLINE: 505 cp = (cpu_t *)resource; 506 id = cp->cpu_lpl->lpl_lgrpid; 507 lgrp_part_del_cpu(cp); 508 lgrp_cpu_fini(cp, id); 509 rc = lpl_topo_verify(cp->cpu_part); 510 if (rc != LPL_TOPO_CORRECT) { 511 panic("lpl_topo_verify failed: %d", rc); 512 } 513 lgrp_plat_config(event, resource); 514 atomic_add_32(&lgrp_gen, 1); 515 516 break; 517 case LGRP_CONFIG_CPUPART_ADD: 518 cp = (cpu_t *)resource; 519 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 520 rc = lpl_topo_verify(cp->cpu_part); 521 if (rc != LPL_TOPO_CORRECT) { 522 panic("lpl_topo_verify failed: %d", rc); 523 } 524 lgrp_plat_config(event, resource); 525 526 break; 527 case LGRP_CONFIG_CPUPART_DEL: 528 cp = (cpu_t *)resource; 529 lgrp_part_del_cpu((cpu_t *)resource); 530 rc = lpl_topo_verify(cp->cpu_part); 531 if (rc != LPL_TOPO_CORRECT) { 532 panic("lpl_topo_verify failed: %d", rc); 533 } 534 lgrp_plat_config(event, resource); 535 536 break; 537 /* 538 * The following events are initiated by the memnode 539 * subsystem. 540 */ 541 case LGRP_CONFIG_MEM_ADD: 542 lgrp_mem_init((int)resource, where, B_FALSE); 543 atomic_add_32(&lgrp_gen, 1); 544 545 break; 546 case LGRP_CONFIG_MEM_DEL: 547 lgrp_mem_fini((int)resource, where, B_FALSE); 548 atomic_add_32(&lgrp_gen, 1); 549 550 break; 551 case LGRP_CONFIG_MEM_RENAME: { 552 lgrp_config_mem_rename_t *ren_arg = 553 (lgrp_config_mem_rename_t *)where; 554 555 lgrp_mem_rename((int)resource, 556 ren_arg->lmem_rename_from, 557 ren_arg->lmem_rename_to); 558 atomic_add_32(&lgrp_gen, 1); 559 560 break; 561 } 562 case LGRP_CONFIG_GEN_UPDATE: 563 atomic_add_32(&lgrp_gen, 1); 564 565 break; 566 case LGRP_CONFIG_FLATTEN: 567 if (where == 0) 568 lgrp_topo_levels = (int)resource; 569 else 570 (void) lgrp_topo_flatten(resource, 571 lgrp_table, lgrp_alloc_max, &changed); 572 573 break; 574 /* 575 * Initiated by platform latency probing code 576 */ 577 case LGRP_CONFIG_LATENCY_CHANGE: 578 lgrp_latency_change((u_longlong_t)resource, 579 (u_longlong_t)where); 580 581 break; 582 case LGRP_CONFIG_NOP: 583 584 break; 585 default: 586 break; 587 } 588 589 } 590 591 /* 592 * Called to add lgrp info into cpu structure from cpu_add_unit; 593 * do not assume cpu is in cpu[] yet! 594 * 595 * CPUs are brought online with all other CPUs paused so we can't 596 * allocate memory or we could deadlock the system, so we rely on 597 * the platform to statically allocate as much space as we need 598 * for the lgrp structs and stats. 599 */ 600 static void 601 lgrp_cpu_init(struct cpu *cp) 602 { 603 klgrpset_t changed; 604 int count; 605 lgrp_handle_t hand; 606 int first_cpu; 607 lgrp_t *my_lgrp; 608 lgrp_id_t lgrpid; 609 struct cpu *cptr; 610 struct chip *chp; 611 612 /* 613 * This is the first time through if the resource set 614 * for the root lgroup is empty. 
After cpu0 has been 615 * initially added to an lgroup, the root's CPU resource 616 * set can never be empty, since the system's last CPU 617 * cannot be offlined. 618 */ 619 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 620 /* 621 * First time through. 622 */ 623 first_cpu = 1; 624 } else { 625 /* 626 * If cpu0 needs to move lgroups, we may come 627 * through here again, at which time cpu_lock won't 628 * be held, and lgrp_initialized will be false. 629 */ 630 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 631 ASSERT(cp->cpu_part != NULL); 632 first_cpu = 0; 633 } 634 635 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 636 my_lgrp = lgrp_hand_to_lgrp(hand); 637 638 if (my_lgrp == NULL) { 639 /* 640 * Create new lgrp and add it to lgroup topology 641 */ 642 my_lgrp = lgrp_create(); 643 my_lgrp->lgrp_plathand = hand; 644 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 645 lgrpid = my_lgrp->lgrp_id; 646 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 647 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 648 649 count = 0; 650 klgrpset_clear(changed); 651 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 652 &changed); 653 /* 654 * May have added new intermediate lgroups, so need to add 655 * resources other than CPUs which are added below 656 */ 657 (void) lgrp_mnode_update(changed, NULL); 658 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 659 > 0) { 660 /* 661 * Leaf lgroup was created, but latency wasn't available 662 * then. So, set latency for it and fill in rest of lgroup 663 * topology now that we know how far it is from other leaf 664 * lgroups. 665 */ 666 lgrpid = my_lgrp->lgrp_id; 667 klgrpset_clear(changed); 668 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 669 lgrpid)) 670 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 671 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 672 &changed); 673 674 /* 675 * May have added new intermediate lgroups, so need to add 676 * resources other than CPUs which are added below 677 */ 678 (void) lgrp_mnode_update(changed, NULL); 679 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 680 my_lgrp->lgrp_id)) { 681 int i; 682 683 /* 684 * Update existing lgroup and lgroups containing it with CPU 685 * resource 686 */ 687 lgrpid = my_lgrp->lgrp_id; 688 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 689 for (i = 0; i <= lgrp_alloc_max; i++) { 690 lgrp_t *lgrp; 691 692 lgrp = lgrp_table[i]; 693 if (!LGRP_EXISTS(lgrp) || 694 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 695 continue; 696 697 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 698 } 699 } 700 701 lgrpid = my_lgrp->lgrp_id; 702 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 703 704 /* 705 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 706 * end up in lpl for lgroup 0 whether it is supposed to be in there or 707 * not since none of lgroup IDs in the lpl's have been set yet. 
708 */ 709 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 710 cp->cpu_lpl->lpl_lgrpid = lgrpid; 711 712 /* 713 * link the CPU into the lgrp's CPU list 714 */ 715 if (my_lgrp->lgrp_cpucnt == 0) { 716 my_lgrp->lgrp_cpu = cp; 717 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 718 } else { 719 cptr = my_lgrp->lgrp_cpu; 720 cp->cpu_next_lgrp = cptr; 721 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 722 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 723 cptr->cpu_prev_lgrp = cp; 724 } 725 my_lgrp->lgrp_cpucnt++; 726 727 /* 728 * Add this cpu's chip to the per lgroup list 729 * if necessary 730 */ 731 if (cp->cpu_chip->chip_lgrp == NULL) { 732 struct chip *lcpr; 733 734 chp = cp->cpu_chip; 735 736 if (my_lgrp->lgrp_chipcnt == 0) { 737 my_lgrp->lgrp_chips = chp; 738 chp->chip_next_lgrp = 739 chp->chip_prev_lgrp = chp; 740 } else { 741 lcpr = my_lgrp->lgrp_chips; 742 chp->chip_next_lgrp = lcpr; 743 chp->chip_prev_lgrp = 744 lcpr->chip_prev_lgrp; 745 lcpr->chip_prev_lgrp->chip_next_lgrp = 746 chp; 747 lcpr->chip_prev_lgrp = chp; 748 } 749 chp->chip_lgrp = my_lgrp; 750 chp->chip_balance = chp->chip_next_lgrp; 751 my_lgrp->lgrp_chipcnt++; 752 } 753 } 754 755 lgrp_t * 756 lgrp_create(void) 757 { 758 lgrp_t *my_lgrp; 759 lgrp_id_t lgrpid; 760 int i; 761 762 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 763 764 /* 765 * Find an open slot in the lgroup table and recycle unused lgroup 766 * left there if any 767 */ 768 my_lgrp = NULL; 769 if (lgrp_alloc_hint == -1) 770 /* 771 * Allocate from end when hint not set yet because no lgroups 772 * have been deleted yet 773 */ 774 lgrpid = nlgrps++; 775 else { 776 /* 777 * Start looking for next open slot from hint and leave hint 778 * at slot allocated 779 */ 780 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 781 my_lgrp = lgrp_table[i]; 782 if (!LGRP_EXISTS(my_lgrp)) { 783 lgrpid = i; 784 nlgrps++; 785 break; 786 } 787 } 788 lgrp_alloc_hint = lgrpid; 789 } 790 791 /* 792 * Keep track of max lgroup ID allocated so far to cut down on searches 793 */ 794 if (lgrpid > lgrp_alloc_max) 795 lgrp_alloc_max = lgrpid; 796 797 /* 798 * Need to allocate new lgroup if next open slot didn't have one 799 * for recycling 800 */ 801 if (my_lgrp == NULL) 802 my_lgrp = lgrp_plat_alloc(lgrpid); 803 804 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 805 panic("Too many lgrps for platform (%d)", nlgrps); 806 807 my_lgrp->lgrp_id = lgrpid; 808 my_lgrp->lgrp_latency = 0; 809 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 810 my_lgrp->lgrp_parent = NULL; 811 my_lgrp->lgrp_childcnt = 0; 812 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 813 my_lgrp->lgrp_nmnodes = 0; 814 klgrpset_clear(my_lgrp->lgrp_children); 815 klgrpset_clear(my_lgrp->lgrp_leaves); 816 for (i = 0; i < LGRP_RSRC_COUNT; i++) 817 klgrpset_clear(my_lgrp->lgrp_set[i]); 818 819 my_lgrp->lgrp_cpu = NULL; 820 my_lgrp->lgrp_cpucnt = 0; 821 my_lgrp->lgrp_chips = NULL; 822 my_lgrp->lgrp_chipcnt = 0; 823 824 if (my_lgrp->lgrp_kstat != NULL) 825 lgrp_kstat_reset(lgrpid); 826 827 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 828 829 return (my_lgrp); 830 } 831 832 void 833 lgrp_destroy(lgrp_t *lgrp) 834 { 835 int i; 836 837 /* 838 * Unless this lgroup is being destroyed on behalf of 839 * the boot CPU, cpu_lock must be held 840 */ 841 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 842 843 if (nlgrps == 1) 844 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 845 846 if (!LGRP_EXISTS(lgrp)) 847 return; 848 849 /* 850 * Set hint to lgroup being deleted and try to keep lower numbered 851 * hints to facilitate finding empty slots 852 */ 
	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
		lgrp_alloc_hint = lgrp->lgrp_id;

	/*
	 * Mark this lgroup to be recycled by setting its lgroup ID to
	 * LGRP_NONE and clear relevant fields
	 */
	lgrp->lgrp_id = LGRP_NONE;
	lgrp->lgrp_latency = 0;
	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	lgrp->lgrp_parent = NULL;
	lgrp->lgrp_childcnt = 0;

	klgrpset_clear(lgrp->lgrp_children);
	klgrpset_clear(lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp->lgrp_set[i]);

	lgrp->lgrp_mnodes = (mnodeset_t)0;
	lgrp->lgrp_nmnodes = 0;

	lgrp->lgrp_cpu = NULL;
	lgrp->lgrp_cpucnt = 0;
	lgrp->lgrp_chipcnt = 0;
	lgrp->lgrp_chips = NULL;

	nlgrps--;
}

/*
 * Initialize kstat data. Called from lgrp initialization code.
 */
static void
lgrp_kstat_init(void)
{
	lgrp_stat_t	stat;

	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);

	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
		kstat_named_init(&lgrp_kstat_data[stat],
		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
}

/*
 * initialize an lgrp's kstats if needed
 * called with cpu_lock held but not with cpus paused.
 * we don't tear these down now because we don't know about
 * memory leaving the lgrp yet...
 */

void
lgrp_kstat_create(cpu_t *cp)
{
	kstat_t *lgrp_kstat;
	lgrp_id_t lgrpid;
	lgrp_t *my_lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	my_lgrp = lgrp_table[lgrpid];

	if (my_lgrp->lgrp_kstat != NULL)
		return;		/* already initialized */

	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

	if (lgrp_kstat != NULL) {
		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
		lgrp_kstat->ks_private = my_lgrp;
		lgrp_kstat->ks_data = &lgrp_kstat_data;
		lgrp_kstat->ks_update = lgrp_kstat_extract;
		my_lgrp->lgrp_kstat = lgrp_kstat;
		kstat_install(lgrp_kstat);
	}
}

/*
 * this will do something when we manage to remove now unused lgrps
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}

/*
 * Called when a CPU is off-lined.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t *my_lgrp;
	struct cpu *prev;
	struct cpu *next;
	chip_t  *chp;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * If the last CPU on its chip is being offlined
	 * then remove this chip from the per lgroup list.
	 *
	 * This is also done for the boot CPU when it needs
	 * to move between lgroups as a consequence of
	 * null proc lpa.
	 */
	chp = cp->cpu_chip;
	if (chp->chip_ncpu == 0 || !lgrp_initialized) {

		chip_t	*chpp;

		if (--my_lgrp->lgrp_chipcnt == 0)
			my_lgrp->lgrp_chips = NULL;
		else if (my_lgrp->lgrp_chips == chp)
			my_lgrp->lgrp_chips = chp->chip_next_lgrp;

		/*
		 * Walk this lgroup's chip list looking for chips that
		 * may try to balance against the one that's leaving
		 */
		for (chpp = chp->chip_next_lgrp; chpp != chp;
		    chpp = chpp->chip_next_lgrp) {
			if (chpp->chip_balance == chp)
				chpp->chip_balance = chp->chip_next_lgrp;
		}

		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;

		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
		chp->chip_lgrp = NULL;
		chp->chip_balance = NULL;
	}

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t	changed;
		int		count;
		int		i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}
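
/*
 * Illustrative sketch (not compiled): the per-lgroup CPU list maintained
 * by lgrp_cpu_init()/lgrp_cpu_fini() above is circular, so walkers follow
 * cpu_next_lgrp until they return to lgrp_cpu (see lgrp_sum_loadavgs()
 * below for a real example).  Assumes the caller holds cpu_lock.
 */
#if 0
static int
example_count_lgrp_cpus(lgrp_t *lgrp)
{
	cpu_t	*cp;
	int	count = 0;

	if ((cp = lgrp->lgrp_cpu) == NULL)
		return (0);

	do {
		count++;
		cp = cp->cpu_next_lgrp;
	} while (cp != lgrp->lgrp_cpu);

	return (count);		/* should equal lgrp->lgrp_cpucnt */
}
#endif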

/*
 * Update memory nodes in target lgroups and return ones that get changed
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int	count;
	int	i;
	int	j;
	lgrp_t	*lgrp;
	lgrp_t	*lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int	k;

			/*
			 * Skip any lgroups that don't exist or aren't in
			 * memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}
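
/*
 * Illustrative sketch (not compiled): lgrp_mnodes is a bitmask with one
 * bit per memory node and lgrp_nmnodes is its population count, so the
 * mnodes owned by an lgroup can be enumerated with the same mask walk
 * lgrp_mnode_update() uses above.
 */
#if 0
static void
example_visit_lgrp_mnodes(lgrp_t *lgrp)
{
	int	k;

	for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
		if (lgrp->lgrp_mnodes & ((mnodeset_t)1 << k)) {
			/* mnode k belongs to this lgroup */
			cmn_err(CE_CONT, "lgrp %d: mnode %d\n",
			    (int)lgrp->lgrp_id, k);
		}
	}
}
#endif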

/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}
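
/*
 * Illustrative sketch (not compiled): how a platform copy-rename driver
 * might report the move through lgrp_config(), which unpacks the argument
 * and calls lgrp_mem_rename() above.  Variable names here are hypothetical.
 */
#if 0
static void
example_report_copy_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_config_mem_rename_t ren;

	ren.lmem_rename_from = from;
	ren.lmem_rename_to = to;
	lgrp_config(LGRP_CONFIG_MEM_RENAME, (uintptr_t)mnode, (uintptr_t)&ren);
}
#endif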

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
	 *
	 * NOTE: in special case of copy-rename of the only remaining mnode,
	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
	 * recognize this case and continue as usual, but skip the update to
	 * the lgrp_mnodes and the lgrp_nmnodes. This resolves the inconsistency
	 * in topology temporarily introduced by lgrp_mem_fini().
	 */
	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
	    lgrp_root->lgrp_mnodes & mnodes_mask) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Update lgroup topology with new memory resources, keeping track of
	 * which lgroups change
	 */
	count = 0;
	klgrpset_clear(changed);
	my_lgrp = lgrp_hand_to_lgrp(hand);
	if (my_lgrp == NULL) {
		/* new lgrp */
		my_lgrp = lgrp_create();
		lgrpid = my_lgrp->lgrp_id;
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then. So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		klgrpset_clear(changed);
		lgrpid = my_lgrp->lgrp_id;
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		if (need_synch)
			pause_cpus(NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
	    my_lgrp->lgrp_id)) {
		/*
		 * Add new lgroup memory resource to existing lgroup
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		klgrpset_add(changed, lgrpid);
		count++;
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
			klgrpset_add(changed, lgrp->lgrp_id);
			count++;
		}
	}

	/*
	 * Add memory node to lgroup and remove lgroup from ones that need
	 * to be updated
	 */
	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
		my_lgrp->lgrp_mnodes |= mnodes_mask;
		my_lgrp->lgrp_nmnodes++;
	}
	klgrpset_del(changed, lgrpid);

	/*
	 * Update memory node information for all lgroups that changed and
	 * contain new memory node as a resource
	 */
	if (count)
		(void) lgrp_mnode_update(changed, NULL);

	if (drop_lock)
		mutex_exit(&cpu_lock);
}
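
/*
 * Illustrative sketch (not compiled): once mnodes have been added, the
 * per-lgroup memory footprint can be summarized with lgrp_mem_size(),
 * the same interface lgrp_kstat_extract() uses below.
 */
#if 0
static void
example_dump_lgrp_memory(void)
{
	int	i;

	for (i = 0; i <= lgrp_alloc_max; i++) {
		if (!LGRP_EXISTS(lgrp_table[i]))
			continue;
		cmn_err(CE_CONT, "lgrp %d: %lld installed, %lld free pages\n",
		    i,
		    (long long)lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL),
		    (long long)lgrp_mem_size(i, LGRP_MEM_SIZE_FREE));
	}
}
#endif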

/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
1363 */ 1364 if (!cpus_paused()) 1365 need_synch = B_TRUE; 1366 1367 my_lgrp = lgrp_hand_to_lgrp(hand); 1368 1369 /* 1370 * The lgrp *must* be pre-existing 1371 */ 1372 ASSERT(my_lgrp != NULL); 1373 1374 /* 1375 * Delete memory node from lgroups which contain it 1376 */ 1377 mnodes_mask = ((mnodeset_t)1 << mnode); 1378 for (i = 0; i <= lgrp_alloc_max; i++) { 1379 lgrp_t *lgrp = lgrp_table[i]; 1380 /* 1381 * Skip any non-existent lgroups and any lgroups that don't 1382 * contain leaf lgroup of memory as a memory resource 1383 */ 1384 if (!LGRP_EXISTS(lgrp) || 1385 !(lgrp->lgrp_mnodes & mnodes_mask)) 1386 continue; 1387 1388 /* 1389 * Avoid removing the last mnode from the root in the DR 1390 * copy-rename case. See lgrp_mem_rename() for details. 1391 */ 1392 if (is_copy_rename && 1393 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1394 continue; 1395 1396 /* 1397 * Remove memory node from lgroup. 1398 */ 1399 lgrp->lgrp_mnodes &= ~mnodes_mask; 1400 lgrp->lgrp_nmnodes--; 1401 ASSERT(lgrp->lgrp_nmnodes >= 0); 1402 } 1403 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1404 1405 /* 1406 * Don't need to update lgroup topology if this lgroup still has memory. 1407 * 1408 * In the special case of DR copy-rename with the only mnode being 1409 * removed, the lgrp_mnodes for the root is always non-zero, but we 1410 * still need to update the lgroup topology. 1411 */ 1412 if ((my_lgrp->lgrp_nmnodes > 0) && 1413 !(is_copy_rename && 1414 (my_lgrp == lgrp_root) && 1415 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1416 if (drop_lock) 1417 mutex_exit(&cpu_lock); 1418 return; 1419 } 1420 1421 /* 1422 * This lgroup does not contain any memory now 1423 */ 1424 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1425 1426 /* 1427 * Remove this lgroup from lgroup topology if it does not contain any 1428 * resources now 1429 */ 1430 lgrpid = my_lgrp->lgrp_id; 1431 count = 0; 1432 klgrpset_clear(changed); 1433 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1434 /* 1435 * Delete lgroup when no more resources 1436 */ 1437 if (need_synch) 1438 pause_cpus(NULL); 1439 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1440 lgrp_alloc_max + 1, &changed); 1441 ASSERT(count > 0); 1442 if (need_synch) 1443 start_cpus(); 1444 } else { 1445 /* 1446 * Remove lgroup from memory resources of any lgroups that 1447 * contain it as such 1448 */ 1449 for (i = 0; i <= lgrp_alloc_max; i++) { 1450 lgrp_t *lgrp; 1451 1452 lgrp = lgrp_table[i]; 1453 if (!LGRP_EXISTS(lgrp) || 1454 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1455 lgrpid)) 1456 continue; 1457 1458 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1459 } 1460 } 1461 if (drop_lock) 1462 mutex_exit(&cpu_lock); 1463 } 1464 1465 /* 1466 * Return lgroup with given platform handle 1467 */ 1468 lgrp_t * 1469 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1470 { 1471 int i; 1472 lgrp_t *lgrp; 1473 1474 if (hand == LGRP_NULL_HANDLE) 1475 return (NULL); 1476 1477 for (i = 0; i <= lgrp_alloc_max; i++) { 1478 lgrp = lgrp_table[i]; 1479 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1480 return (lgrp); 1481 } 1482 return (NULL); 1483 } 1484 1485 /* 1486 * Return the home lgroup of the current thread. 1487 * We must do this with kernel preemption disabled, since we don't want our 1488 * thread to be re-homed while we're poking around with its lpl, and the lpl 1489 * should never be NULL. 1490 * 1491 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1492 * is enabled because of DR. 
Callers can disable kernel preemption
 * around this call to guarantee that the lgroup will be valid beyond this
 * routine, since kernel preemption can be recursive.
 */
lgrp_t *
lgrp_home_lgrp(void)
{
	lgrp_t	*lgrp;
	lpl_t	*lpl;

	kpreempt_disable();

	lpl = curthread->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
	lgrp = lgrp_table[lpl->lpl_lgrpid];

	kpreempt_enable();

	return (lgrp);
}

/*
 * Return ID of home lgroup for given thread
 * (See comments for lgrp_home_lgrp() for special care and handling
 * instructions)
 */
lgrp_id_t
lgrp_home_id(kthread_t *t)
{
	lgrp_id_t	lgrp;
	lpl_t		*lpl;

	ASSERT(t != NULL);
	/*
	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
	 * cannot since the HAT layer can call into this routine to
	 * determine the locality for its data structures in the context
	 * of a page fault.
	 */

	kpreempt_disable();

	lpl = t->t_lpl;
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
	lgrp = lpl->lpl_lgrpid;

	kpreempt_enable();

	return (lgrp);
}

/*
 * Return lgroup containing the physical memory for the given page frame number
 */
lgrp_t *
lgrp_pfn_to_lgrp(pfn_t pfn)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;

	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return lgroup containing the physical memory for the given physical address
 */
lgrp_t *
lgrp_phys_to_lgrp(u_longlong_t physaddr)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_t		*lgrp;
	pfn_t		pfn;

	pfn = btop(physaddr);
	hand = lgrp_plat_pfn_to_hand(pfn);
	if (hand != LGRP_NULL_HANDLE)
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp = lgrp_table[i];
			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
				return (lgrp);
		}
	return (NULL);
}

/*
 * Return the leaf lgroup containing the given CPU
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
	return (cpu->cpu_chip->chip_lgrp);
}

/*
 * Return the sum of the partition loads in an lgrp divided by
 * the number of CPUs in the lgrp.  This is our best approximation
 * of an 'lgroup load average' for a useful per-lgroup kstat.
1601 */ 1602 static uint64_t 1603 lgrp_sum_loadavgs(lgrp_t *lgrp) 1604 { 1605 cpu_t *cpu; 1606 int ncpu; 1607 uint64_t loads = 0; 1608 1609 mutex_enter(&cpu_lock); 1610 1611 cpu = lgrp->lgrp_cpu; 1612 ncpu = lgrp->lgrp_cpucnt; 1613 1614 if (cpu == NULL || ncpu == 0) { 1615 mutex_exit(&cpu_lock); 1616 return (0ull); 1617 } 1618 1619 do { 1620 loads += cpu->cpu_lpl->lpl_loadavg; 1621 cpu = cpu->cpu_next_lgrp; 1622 } while (cpu != lgrp->lgrp_cpu); 1623 1624 mutex_exit(&cpu_lock); 1625 1626 return (loads / ncpu); 1627 } 1628 1629 void 1630 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1631 { 1632 struct lgrp_stats *pstats; 1633 1634 /* 1635 * Verify that the caller isn't trying to add to 1636 * a statistic for an lgroup that has gone away 1637 */ 1638 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1639 return; 1640 1641 pstats = &lgrp_stats[lgrpid]; 1642 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1643 } 1644 1645 int64_t 1646 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1647 { 1648 uint64_t val; 1649 struct lgrp_stats *pstats; 1650 1651 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1652 return ((int64_t)0); 1653 1654 pstats = &lgrp_stats[lgrpid]; 1655 LGRP_STAT_READ(pstats, stat, val); 1656 return (val); 1657 } 1658 1659 /* 1660 * Reset all kstats for lgrp specified by its lgrpid. 1661 */ 1662 static void 1663 lgrp_kstat_reset(lgrp_id_t lgrpid) 1664 { 1665 lgrp_stat_t stat; 1666 1667 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1668 return; 1669 1670 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1671 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1672 } 1673 } 1674 1675 /* 1676 * Collect all per-lgrp statistics for the lgrp associated with this 1677 * kstat, and store them in the ks_data array. 1678 * 1679 * The superuser can reset all the running counter statistics for an 1680 * lgrp by writing to any of the lgrp's stats. 1681 */ 1682 static int 1683 lgrp_kstat_extract(kstat_t *ksp, int rw) 1684 { 1685 lgrp_stat_t stat; 1686 struct kstat_named *ksd; 1687 lgrp_t *lgrp; 1688 lgrp_id_t lgrpid; 1689 1690 lgrp = (lgrp_t *)ksp->ks_private; 1691 1692 ksd = (struct kstat_named *)ksp->ks_data; 1693 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1694 1695 lgrpid = lgrp->lgrp_id; 1696 1697 if (lgrpid == LGRP_NONE) { 1698 /* 1699 * Return all zeroes as stats for freed lgrp. 
1700 */ 1701 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1702 ksd[stat].value.i64 = 0; 1703 } 1704 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1705 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1706 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1707 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1708 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1709 } else if (rw != KSTAT_WRITE) { 1710 /* 1711 * Handle counter stats 1712 */ 1713 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1714 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1715 } 1716 1717 /* 1718 * Handle kernel data snapshot stats 1719 */ 1720 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1721 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1722 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1723 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1724 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1725 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1726 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1727 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1728 } else { 1729 lgrp_kstat_reset(lgrpid); 1730 } 1731 1732 return (0); 1733 } 1734 1735 int 1736 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1737 { 1738 cpu_t *cp; 1739 1740 mutex_enter(&cpu_lock); 1741 1742 if ((cp = cpu_get(id)) == NULL) { 1743 mutex_exit(&cpu_lock); 1744 return (EINVAL); 1745 } 1746 1747 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1748 mutex_exit(&cpu_lock); 1749 return (EINVAL); 1750 } 1751 1752 ASSERT(cp->cpu_lpl != NULL); 1753 1754 *lp = cp->cpu_lpl->lpl_lgrpid; 1755 1756 mutex_exit(&cpu_lock); 1757 1758 return (0); 1759 } 1760 1761 int 1762 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1763 { 1764 cpu_t *cp; 1765 1766 mutex_enter(&cpu_lock); 1767 1768 if ((cp = cpu_get(id)) == NULL) { 1769 mutex_exit(&cpu_lock); 1770 return (EINVAL); 1771 } 1772 1773 ASSERT(cp->cpu_lpl != NULL); 1774 1775 *lp = cp->cpu_lpl->lpl_loadavg; 1776 1777 mutex_exit(&cpu_lock); 1778 1779 return (0); 1780 } 1781 1782 void 1783 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime) 1784 { 1785 lgrp_t *lgrp; 1786 int i; 1787 1788 for (i = 0; i <= lgrp_alloc_max; i++) { 1789 lgrp = lgrp_table[i]; 1790 1791 if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime)) 1792 lgrp->lgrp_latency = (int)newtime; 1793 } 1794 } 1795 1796 /* 1797 * Add a resource named by lpl_leaf to rset of lpl_target 1798 * 1799 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1800 * resource. It is adjusted here, as this is presently the only place that we 1801 * can be certain a resource addition has succeeded. 1802 * 1803 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1804 * list in order until it reaches a NULL. (This list is required to be NULL 1805 * terminated, too). This is done so that we can mark start pos + 1, so that 1806 * each lpl is traversed sequentially, but in a different order. We hope this 1807 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 
1808 */ 1809 1810 void 1811 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1812 { 1813 int i; 1814 int entry_slot = 0; 1815 1816 /* return if leaf is already present */ 1817 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1818 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1819 return; 1820 } 1821 1822 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1823 lpl_leaf->lpl_lgrpid) { 1824 break; 1825 } 1826 } 1827 1828 /* insert leaf, update counts */ 1829 entry_slot = i; 1830 i = lpl_target->lpl_nrset++; 1831 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1832 panic("More leaf lgrps in system than are supported!\n"); 1833 } 1834 1835 /* 1836 * Start at the end of the rset array and work backwards towards the 1837 * slot into which the new lpl will be inserted. This effectively 1838 * preserves the current ordering by scooting everybody over one entry, 1839 * and placing the new entry into the space created. 1840 */ 1841 1842 while (i-- > entry_slot) { 1843 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1844 } 1845 1846 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1847 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1848 } 1849 1850 /* 1851 * Update each of lpl_parent's children with a proper hint and 1852 * a reference to their parent. 1853 * The lgrp topology is used as the reference since it is fully 1854 * consistent and correct at this point. 1855 * 1856 * Each child's hint will reference an element in lpl_parent's 1857 * rset that designates where the child should start searching 1858 * for CPU resources. The hint selected is the highest order leaf present 1859 * in the child's lineage. 1860 * 1861 * This should be called after any potential change in lpl_parent's 1862 * rset. 1863 */ 1864 static void 1865 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1866 { 1867 klgrpset_t children, leaves; 1868 lpl_t *lpl; 1869 int hint; 1870 int i, j; 1871 1872 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1873 if (klgrpset_isempty(children)) 1874 return; /* nothing to do */ 1875 1876 for (i = 0; i <= lgrp_alloc_max; i++) { 1877 if (klgrpset_ismember(children, i)) { 1878 1879 /* 1880 * Given the set of leaves in this child's lineage, 1881 * find the highest order leaf present in the parent's 1882 * rset. Select this as the hint for the child. 1883 */ 1884 leaves = lgrp_table[i]->lgrp_leaves; 1885 hint = 0; 1886 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1887 lpl = lpl_parent->lpl_rset[j]; 1888 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1889 hint = j; 1890 } 1891 cp->cp_lgrploads[i].lpl_hint = hint; 1892 1893 /* 1894 * (Re)set the parent. It may be incorrect if 1895 * lpl_parent is new in the topology. 1896 */ 1897 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1898 } 1899 } 1900 } 1901 1902 /* 1903 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1904 * 1905 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1906 * resource. The values are adjusted here, as this is the only place that we can 1907 * be certain a resource was successfully deleted. 
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/* return if leaf not found */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/* prune leaf, compress array */
	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
	} while (i++ < lpl_target->lpl_nrset);
}

/*
 * Check to see if the resource set of the target lpl contains the
 * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
 */

int
lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;

	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			return (1);
	}

	return (0);
}

/*
 * Called when we change cpu lpl membership.  This increments or decrements the
 * per-cpu counter in every lpl in which our leaf appears.
 */
void
lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_leaf;
	lpl_t		*lpl_cur;
	int		i;

	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);

	cpupart = cp->cpu_part;
	lpl_leaf = cp->cpu_lpl;
	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
		 * for the cpu in question, or if the current lgrp and leaf
		 * don't share the same resources.
		 */

		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
			continue;


		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		if (lpl_cur->lpl_nrset > 0) {
			if (act == LPL_INCREMENT) {
				lpl_cur->lpl_ncpu++;
			} else if (act == LPL_DECREMENT) {
				lpl_cur->lpl_ncpu--;
			}
		}
	}
}

/*
 * Initialize lpl with given resources and specified lgrp
 */

void
lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
{
	lpl->lpl_lgrpid = lgrp->lgrp_id;
	lpl->lpl_loadavg = 0;
	if (lpl == lpl_leaf)
		lpl->lpl_ncpu = 1;
	else
		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
	lpl->lpl_nrset = 1;
	lpl->lpl_rset[0] = lpl_leaf;
	lpl->lpl_lgrp = lgrp;
	lpl->lpl_parent = NULL;		/* set by lpl_leaf_insert() */
	lpl->lpl_cpus = NULL;		/* set by lgrp_part_add_cpu() */
}

/*
 * Clear an unused lpl
 */

void
lpl_clear(lpl_t *lpl)
{
	lgrpid_t	lid;

	/* save lid for debugging purposes */
	lid = lpl->lpl_lgrpid;
	bzero(lpl, sizeof (lpl_t));
	lpl->lpl_lgrpid = lid;
}

/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system.  The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist.
This function returns 2039 * 0 if the topology is correct, and a non-zero error code, for non-debug 2040 * kernels if incorrect. Asserts are spread throughout the code to aid in 2041 * debugging on a DEBUG kernel. 2042 */ 2043 int 2044 lpl_topo_verify(cpupart_t *cpupart) 2045 { 2046 lgrp_t *lgrp; 2047 lpl_t *lpl; 2048 klgrpset_t rset; 2049 klgrpset_t cset; 2050 cpu_t *cpu; 2051 cpu_t *cp_start; 2052 int i; 2053 int j; 2054 int sum; 2055 2056 /* topology can't be incorrect if it doesn't exist */ 2057 if (!lgrp_topo_initialized || !lgrp_initialized) 2058 return (LPL_TOPO_CORRECT); 2059 2060 ASSERT(cpupart != NULL); 2061 2062 for (i = 0; i <= lgrp_alloc_max; i++) { 2063 lgrp = lgrp_table[i]; 2064 lpl = NULL; 2065 /* make sure lpls are allocated */ 2066 ASSERT(cpupart->cp_lgrploads); 2067 if (!cpupart->cp_lgrploads) 2068 return (LPL_TOPO_PART_HAS_NO_LPL); 2069 2070 lpl = &cpupart->cp_lgrploads[i]; 2071 /* make sure our index is good */ 2072 ASSERT(i < cpupart->cp_nlgrploads); 2073 2074 /* if lgroup doesn't exist, make sure lpl is empty */ 2075 if (!LGRP_EXISTS(lgrp)) { 2076 ASSERT(lpl->lpl_ncpu == 0); 2077 if (lpl->lpl_ncpu > 0) { 2078 return (LPL_TOPO_CPUS_NOT_EMPTY); 2079 } else { 2080 continue; 2081 } 2082 } 2083 2084 /* verify that lgroup and lpl are identically numbered */ 2085 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2086 2087 /* if lgroup isn't in our partition, make sure lpl is empty */ 2088 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2089 cpupart->cp_lgrpset)) { 2090 ASSERT(lpl->lpl_ncpu == 0); 2091 if (lpl->lpl_ncpu > 0) { 2092 return (LPL_TOPO_CPUS_NOT_EMPTY); 2093 } 2094 /* 2095 * lpl is empty, and lgroup isn't in partition. verify 2096 * that lpl doesn't show up in anyone else's rsets (in 2097 * this partition, anyway) 2098 */ 2099 2100 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2101 lpl_t *i_lpl; /* lpl we're iterating over */ 2102 2103 i_lpl = &cpupart->cp_lgrploads[j]; 2104 2105 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2106 if (lpl_rset_contains(i_lpl, lpl)) { 2107 return (LPL_TOPO_LPL_ORPHANED); 2108 } 2109 } 2110 /* lgroup is empty, and everything is ok. continue */ 2111 continue; 2112 } 2113 2114 2115 /* lgroup is in this partition, now check it against lpl */ 2116 2117 /* do both have matching lgrps? */ 2118 ASSERT(lgrp == lpl->lpl_lgrp); 2119 if (lgrp != lpl->lpl_lgrp) { 2120 return (LPL_TOPO_LGRP_MISMATCH); 2121 } 2122 2123 /* do the parent lgroups exist and do they match? */ 2124 if (lgrp->lgrp_parent) { 2125 ASSERT(lpl->lpl_parent); 2126 ASSERT(lgrp->lgrp_parent->lgrp_id == 2127 lpl->lpl_parent->lpl_lgrpid); 2128 2129 if (!lpl->lpl_parent) { 2130 return (LPL_TOPO_MISSING_PARENT); 2131 } else if (lgrp->lgrp_parent->lgrp_id != 2132 lpl->lpl_parent->lpl_lgrpid) { 2133 return (LPL_TOPO_PARENT_MISMATCH); 2134 } 2135 } 2136 2137 /* only leaf lgroups keep a cpucnt, only check leaves */ 2138 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2139 2140 /* verify that lgrp is also a leaf */ 2141 ASSERT((lgrp->lgrp_childcnt == 0) && 2142 (klgrpset_ismember(lgrp->lgrp_leaves, 2143 lpl->lpl_lgrpid))); 2144 2145 if ((lgrp->lgrp_childcnt > 0) || 2146 (!klgrpset_ismember(lgrp->lgrp_leaves, 2147 lpl->lpl_lgrpid))) { 2148 return (LPL_TOPO_LGRP_NOT_LEAF); 2149 } 2150 2151 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2152 (lpl->lpl_ncpu > 0)); 2153 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2154 (lpl->lpl_ncpu <= 0)) { 2155 return (LPL_TOPO_BAD_CPUCNT); 2156 } 2157 2158 /* 2159 * Check that lpl_ncpu also matches the number of 2160 * cpus in the lpl's linked list. 
This only exists in 2161 * leaves, but they should always match. 2162 */ 2163 j = 0; 2164 cpu = cp_start = lpl->lpl_cpus; 2165 while (cpu != NULL) { 2166 j++; 2167 2168 /* check to make sure cpu's lpl is leaf lpl */ 2169 ASSERT(cpu->cpu_lpl == lpl); 2170 if (cpu->cpu_lpl != lpl) { 2171 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2172 } 2173 2174 /* check next cpu */ 2175 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2176 continue; 2177 } else { 2178 cpu = NULL; 2179 } 2180 } 2181 2182 ASSERT(j == lpl->lpl_ncpu); 2183 if (j != lpl->lpl_ncpu) { 2184 return (LPL_TOPO_LPL_BAD_NCPU); 2185 } 2186 2187 /* 2188 * Also, check that leaf lpl is contained in all 2189 * intermediate lpls that name the leaf as a descendant 2190 */ 2191 2192 for (j = 0; j <= lgrp_alloc_max; j++) { 2193 klgrpset_t intersect; 2194 lgrp_t *lgrp_cand; 2195 lpl_t *lpl_cand; 2196 2197 lgrp_cand = lgrp_table[j]; 2198 intersect = klgrpset_intersects( 2199 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2200 cpupart->cp_lgrpset); 2201 2202 if (!LGRP_EXISTS(lgrp_cand) || 2203 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2204 cpupart->cp_lgrpset) || 2205 (intersect == 0)) 2206 continue; 2207 2208 lpl_cand = 2209 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2210 2211 if (klgrpset_ismember(intersect, 2212 lgrp->lgrp_id)) { 2213 ASSERT(lpl_rset_contains(lpl_cand, 2214 lpl)); 2215 2216 if (!lpl_rset_contains(lpl_cand, lpl)) { 2217 return (LPL_TOPO_RSET_MSSNG_LF); 2218 } 2219 } 2220 } 2221 2222 } else { /* non-leaf specific checks */ 2223 2224 /* 2225 * Non-leaf lpls should have lpl_cpus == NULL 2226 * verify that this is so 2227 */ 2228 ASSERT(lpl->lpl_cpus == NULL); 2229 if (lpl->lpl_cpus != NULL) { 2230 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2231 } 2232 2233 /* 2234 * verify that the sum of the cpus in the leaf resources 2235 * is equal to the total ncpu in the intermediate 2236 */ 2237 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2238 sum += lpl->lpl_rset[j]->lpl_ncpu; 2239 } 2240 2241 ASSERT(sum == lpl->lpl_ncpu); 2242 if (sum != lpl->lpl_ncpu) { 2243 return (LPL_TOPO_LPL_BAD_NCPU); 2244 } 2245 } 2246 2247 /* 2248 * check on lpl_hint. Don't check root, since it has no parent. 2249 */ 2250 if (lpl->lpl_parent != NULL) { 2251 int hint; 2252 lpl_t *hint_lpl; 2253 2254 /* make sure hint is within limits of nrset */ 2255 hint = lpl->lpl_hint; 2256 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2257 if (lpl->lpl_parent->lpl_nrset < hint) { 2258 return (LPL_TOPO_BOGUS_HINT); 2259 } 2260 2261 /* make sure hint points to valid lpl */ 2262 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2263 ASSERT(hint_lpl->lpl_ncpu > 0); 2264 if (hint_lpl->lpl_ncpu <= 0) { 2265 return (LPL_TOPO_BOGUS_HINT); 2266 } 2267 } 2268 2269 /* 2270 * Check the rset of the lpl in question. Make sure that each 2271 * rset contains a subset of the resources in 2272 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2273 * sure that each rset doesn't include resources that are 2274 * outside of that set. (Which would be resources somehow not 2275 * accounted for). 
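 * In set terms, both rset - lgrp_set[LGRP_RSRC_CPU] and rset - cp_lgrpset must come out empty; the klgrpset_diff() calls below compute exactly these differences before the emptiness checks.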
2276 */ 2277 2278 klgrpset_clear(rset); 2279 for (j = 0; j < lpl->lpl_nrset; j++) { 2280 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2281 } 2282 klgrpset_copy(cset, rset); 2283 /* make sure lpl rset matches lgrp rset */ 2284 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2285 /* make sure rset is contained with in partition, too */ 2286 klgrpset_diff(cset, cpupart->cp_lgrpset); 2287 2288 ASSERT(klgrpset_isempty(rset) && 2289 klgrpset_isempty(cset)); 2290 if (!klgrpset_isempty(rset) || 2291 !klgrpset_isempty(cset)) { 2292 return (LPL_TOPO_RSET_MISMATCH); 2293 } 2294 2295 /* 2296 * check to make sure lpl_nrset matches the number of rsets 2297 * contained in the lpl 2298 */ 2299 2300 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2301 j++); 2302 2303 ASSERT(j == lpl->lpl_nrset); 2304 if (j != lpl->lpl_nrset) { 2305 return (LPL_TOPO_BAD_RSETCNT); 2306 } 2307 2308 } 2309 return (LPL_TOPO_CORRECT); 2310 } 2311 2312 /* 2313 * Flatten lpl topology to given number of levels. This is presently only 2314 * implemented for a flatten to 2 levels, which will prune out the intermediates 2315 * and home the leaf lpls to the root lpl. 2316 */ 2317 int 2318 lpl_topo_flatten(int levels) 2319 { 2320 int i; 2321 uint_t sum; 2322 lgrp_t *lgrp_cur; 2323 lpl_t *lpl_cur; 2324 lpl_t *lpl_root; 2325 cpupart_t *cp; 2326 2327 if (levels != 2) 2328 return (0); 2329 2330 /* called w/ cpus paused - grab no locks! */ 2331 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2332 !lgrp_initialized); 2333 2334 cp = cp_list_head; 2335 do { 2336 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2337 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2338 2339 for (i = 0; i <= lgrp_alloc_max; i++) { 2340 lgrp_cur = lgrp_table[i]; 2341 lpl_cur = &cp->cp_lgrploads[i]; 2342 2343 if ((lgrp_cur == lgrp_root) || 2344 (!LGRP_EXISTS(lgrp_cur) && 2345 (lpl_cur->lpl_ncpu == 0))) 2346 continue; 2347 2348 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2349 /* 2350 * this should be a deleted intermediate, so 2351 * clear it 2352 */ 2353 lpl_clear(lpl_cur); 2354 } else if ((lpl_cur->lpl_nrset == 1) && 2355 (lpl_cur->lpl_rset[0] == lpl_cur) && 2356 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2357 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2358 /* 2359 * this is a leaf whose parent was deleted, or 2360 * whose parent had their lgrp deleted. (And 2361 * whose parent will soon be deleted). Point 2362 * this guy back to the root lpl. 2363 */ 2364 lpl_cur->lpl_parent = lpl_root; 2365 lpl_rset_add(lpl_root, lpl_cur); 2366 } 2367 2368 } 2369 2370 /* 2371 * Now that we're done, make sure the count on the root lpl is 2372 * correct, and update the hints of the children for the sake of 2373 * thoroughness 2374 */ 2375 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2376 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2377 } 2378 lpl_root->lpl_ncpu = sum; 2379 lpl_child_update(lpl_root, cp); 2380 2381 cp = cp->cp_next; 2382 } while (cp != cp_list_head); 2383 2384 return (levels); 2385 } 2386 2387 /* 2388 * Insert a lpl into the resource hierarchy and create any additional lpls that 2389 * are necessary to represent the varying states of locality for the cpu 2390 * resoruces newly added to the partition. 2391 * 2392 * This routine is clever enough that it can correctly add resources from the 2393 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2394 * those for which the lpl is a leaf as opposed to simply a named equally local 2395 * resource). 
The one special case that needs additional processing is when a 2396 * new intermediate lpl is introduced. Since the main loop only traverses 2397 * looking to add the leaf resource where it does not yet exist, additional work 2398 * is necessary to add other leaf resources that may need to exist in the newly 2399 * created intermediate. This is performed by the second inner loop, and is 2400 * only done when the check for more than one overlapping resource succeeds. 2401 */ 2402 2403 void 2404 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2405 { 2406 int i; 2407 int j; 2408 int hint; 2409 int rset_num_intersect; 2410 lgrp_t *lgrp_cur; 2411 lpl_t *lpl_cur; 2412 lpl_t *lpl_parent; 2413 lgrpid_t parent_id; 2414 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2415 2416 for (i = 0; i <= lgrp_alloc_max; i++) { 2417 lgrp_cur = lgrp_table[i]; 2418 2419 /* 2420 * Don't insert if the lgrp isn't there, if the leaf isn't 2421 * contained within the current lgrp, or if the current lgrp has 2422 * no leaves in this partition 2423 */ 2424 2425 if (!LGRP_EXISTS(lgrp_cur) || 2426 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2427 lpl_leaf->lpl_lgrpid) || 2428 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2429 cpupart->cp_lgrpset)) 2430 continue; 2431 2432 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2433 if (lgrp_cur->lgrp_parent != NULL) { 2434 /* if lgrp has a parent, assign it properly */ 2435 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2436 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2437 } else { 2438 /* if not, make sure parent ptr gets set to null */ 2439 lpl_parent = NULL; 2440 } 2441 2442 if (lpl_cur == lpl_leaf) { 2443 /* 2444 * Almost all leaf state was initialized elsewhere. The 2445 * only thing left to do is to set the parent. 2446 */ 2447 lpl_cur->lpl_parent = lpl_parent; 2448 continue; 2449 } 2450 2451 /* 2452 * Initialize intermediate lpl 2453 * Save this lpl's hint though. Since we're changing this 2454 * lpl's resources, we need to update the hint in this lpl's 2455 * children, but the hint in this lpl is unaffected and 2456 * should be preserved. 2457 */ 2458 hint = lpl_cur->lpl_hint; 2459 2460 lpl_clear(lpl_cur); 2461 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2462 2463 lpl_cur->lpl_hint = hint; 2464 lpl_cur->lpl_parent = lpl_parent; 2465 2466 /* does new lpl need to be populated with other resources? */ 2467 rset_intersect = 2468 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2469 cpupart->cp_lgrpset); 2470 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2471 2472 if (rset_num_intersect > 1) { 2473 /* 2474 * If so, figure out what lpls have resources that 2475 * intersect this one, and add them. 2476 */ 2477 for (j = 0; j <= lgrp_alloc_max; j++) { 2478 lgrp_t *lgrp_cand; /* candidate lgrp */ 2479 lpl_t *lpl_cand; /* candidate lpl */ 2480 2481 lgrp_cand = lgrp_table[j]; 2482 if (!LGRP_EXISTS(lgrp_cand) || 2483 !klgrpset_ismember(rset_intersect, 2484 lgrp_cand->lgrp_id)) 2485 continue; 2486 lpl_cand = 2487 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2488 lpl_rset_add(lpl_cur, lpl_cand); 2489 } 2490 } 2491 /* 2492 * This lpl's rset has changed. Update the hint in it's 2493 * children. 2494 */ 2495 lpl_child_update(lpl_cur, cpupart); 2496 } 2497 } 2498 2499 /* 2500 * remove a lpl from the hierarchy of resources, clearing its state when 2501 * finished. If the lpls at the intermediate levels of the hierarchy have no 2502 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2503 * delete them as well. 
2504 */ 2505 2506 void 2507 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2508 { 2509 int i; 2510 lgrp_t *lgrp_cur; 2511 lpl_t *lpl_cur; 2512 klgrpset_t leaf_intersect; /* intersection of leaves */ 2513 2514 for (i = 0; i <= lgrp_alloc_max; i++) { 2515 lgrp_cur = lgrp_table[i]; 2516 2517 /* 2518 * Don't attempt to remove from lgrps that aren't there, that 2519 * don't contain our leaf, or from the leaf itself. (We do that 2520 * later) 2521 */ 2522 2523 if (!LGRP_EXISTS(lgrp_cur)) 2524 continue; 2525 2526 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2527 2528 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2529 lpl_leaf->lpl_lgrpid) || 2530 (lpl_cur == lpl_leaf)) { 2531 continue; 2532 } 2533 2534 /* 2535 * This is a slightly sleazy simplification in that we have 2536 * already marked the cp_lgrpset as no longer containing the 2537 * leaf we've deleted. Any lpls that pass the above checks 2538 * based upon lgrp membership but not necessarily cpu-part 2539 * membership also get cleared by the checks below. Currently 2540 * this is harmless, as the lpls should be empty anyway. 2541 * 2542 * In particular, we want to preserve lpls that have additional 2543 * leaf resources, even though we don't yet have a processor 2544 * architecture that represents resources this way. 2545 */ 2546 2547 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2548 cpupart->cp_lgrpset); 2549 2550 lpl_rset_del(lpl_cur, lpl_leaf); 2551 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2552 lpl_clear(lpl_cur); 2553 } else { 2554 /* 2555 * Update this lpl's children 2556 */ 2557 lpl_child_update(lpl_cur, cpupart); 2558 } 2559 } 2560 lpl_clear(lpl_leaf); 2561 } 2562 2563 /* 2564 * add a cpu to a partition in terms of lgrp load avg bookkeeping 2565 * 2566 * The lpl (cpu partition load average information) is now arranged in a 2567 * hierarchical fashion whereby resources that are closest, ie. most local, to 2568 * the cpu in question are considered to be leaves in a tree of resources. 2569 * There are two general cases for cpu addition: 2570 * 2571 * 1. An lpl structure that contains resources already in the hierarchy tree. 2572 * In this case, all of the associated lpl relationships have been defined, and 2573 * all that is necessary is that we link the new cpu into the per-lpl list of 2574 * cpus, and increment the ncpu count of all places where this cpu resource will 2575 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2576 * pushing is accomplished by this routine. 2577 * 2578 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2579 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2580 * construct the hierarchy of state necessary to name its more distant 2581 * resources, if they should exist. The leaf structure is initialized by this 2582 * routine, as is the cpu-partition state for the lgrp membership. This routine 2583 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2584 * and builds all of the "ancestral" state necessary to identify resources at 2585 * differing levels of locality.
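 * The per-lpl list of cpus manipulated below is a circular, doubly linked ring threaded through cpu_next_lpl and cpu_prev_lpl; lpl_cpus points at one element of that ring and new cpus are inserted just before it.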
2586 */ 2587 void 2588 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2589 { 2590 cpupart_t *cpupart; 2591 lgrp_t *lgrp_leaf; 2592 lpl_t *lpl_leaf; 2593 2594 /* called sometimes w/ cpus paused - grab no locks */ 2595 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2596 2597 cpupart = cp->cpu_part; 2598 lgrp_leaf = lgrp_table[lgrpid]; 2599 2600 /* don't add non-existent lgrp */ 2601 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2602 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2603 cp->cpu_lpl = lpl_leaf; 2604 2605 /* only leaf lpls contain cpus */ 2606 2607 if (lpl_leaf->lpl_ncpu++ == 0) { 2608 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2609 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2610 lpl_leaf_insert(lpl_leaf, cpupart); 2611 } else { 2612 /* 2613 * the lpl should already exist in the parent, so just update 2614 * the count of available CPUs 2615 */ 2616 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2617 } 2618 2619 /* link cpu into list of cpus in lpl */ 2620 2621 if (lpl_leaf->lpl_cpus) { 2622 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2623 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2624 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2625 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2626 } else { 2627 /* 2628 * We increment ncpu immediately after we create a new leaf 2629 * lpl, so assert that ncpu == 1 for the case where we don't 2630 * have any cpu pointers yet. 2631 */ 2632 ASSERT(lpl_leaf->lpl_ncpu == 1); 2633 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2634 } 2635 2636 } 2637 2638 2639 /* 2640 * remove a cpu from a partition in terms of lgrp load avg bookkeeping 2641 * 2642 * The lpl (cpu partition load average information) is now arranged in a 2643 * hierarchical fashion whereby resources that are closest, ie. most local, to 2644 * the cpu in question are considered to be leaves in a tree of resources. 2645 * There are two removal cases in question: 2646 * 2647 * 1. Removal of the resource in the leaf leaves other resources remaining in 2648 * that leaf. (Another cpu still exists at this level of locality). In this 2649 * case, the count of available cpus is decremented in all associated lpls by 2650 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 2651 * from the per-lpl cpu list. 2652 * 2653 * 2. Removal of the resource results in the lpl containing no resources. (It's 2654 * empty) In this case, all of what has occurred for the first step must take 2655 * place; however, additionally we must remove the lpl structure itself, prune 2656 * out any stranded lpls that do not directly name a leaf resource, and mark the 2657 * cpu partition in question as no longer containing resources from the lgrp of 2658 * the lpl that has been deleted. Cpu-partition changes are handled by this 2659 * method, but the lpl_leaf_remove function deals with the details of pruning 2660 * out the empty lpl and any of its orphaned direct ancestors.
2661 */ 2662 void 2663 lgrp_part_del_cpu(cpu_t *cp) 2664 { 2665 lpl_t *lpl; 2666 lpl_t *leaf_lpl; 2667 lgrp_t *lgrp_leaf; 2668 2669 /* called sometimes w/ cpus paused - grab no locks */ 2670 2671 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2672 2673 lpl = leaf_lpl = cp->cpu_lpl; 2674 lgrp_leaf = leaf_lpl->lpl_lgrp; 2675 2676 /* don't delete a leaf that isn't there */ 2677 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2678 2679 /* no double-deletes */ 2680 ASSERT(lpl->lpl_ncpu); 2681 if (--lpl->lpl_ncpu == 0) { 2682 /* 2683 * This was the last cpu in this lgroup for this partition, 2684 * clear its bit in the partition's lgroup bitmask 2685 */ 2686 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2687 2688 /* eliminate remaning lpl link pointers in cpu, lpl */ 2689 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2690 2691 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2692 } else { 2693 2694 /* unlink cpu from lists of cpus in lpl */ 2695 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2696 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2697 if (lpl->lpl_cpus == cp) { 2698 lpl->lpl_cpus = cp->cpu_next_lpl; 2699 } 2700 2701 /* 2702 * Update the cpu count in the lpls associated with parent 2703 * lgroups. 2704 */ 2705 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2706 2707 } 2708 /* clear cpu's lpl ptr when we're all done */ 2709 cp->cpu_lpl = NULL; 2710 } 2711 2712 /* 2713 * Recompute load average for the specified partition/lgrp fragment. 2714 * 2715 * We rely on the fact that this routine is called from the clock thread 2716 * at a point before the clock thread can block (i.e. before its first 2717 * lock request). Since the clock thread can not be preempted (since it 2718 * runs at highest priority), we know that cpu partitions can not change 2719 * (since doing so would require either the repartition requester or the 2720 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2721 * without grabbing cpu_lock. 2722 */ 2723 void 2724 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2725 { 2726 uint_t ncpu; 2727 int64_t old, new, f; 2728 2729 /* 2730 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2731 */ 2732 static short expval[] = { 2733 0, 3196, 1618, 1083, 2734 814, 652, 543, 466, 2735 408, 363, 326, 297, 2736 272, 251, 233, 218, 2737 204, 192, 181, 172, 2738 163, 155, 148, 142, 2739 136, 130, 125, 121, 2740 116, 112, 109, 105 2741 }; 2742 2743 /* ASSERT (called from clock level) */ 2744 2745 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2746 ((ncpu = lpl->lpl_ncpu) == 0)) { 2747 return; 2748 } 2749 2750 for (;;) { 2751 2752 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2753 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2754 else 2755 f = expval[ncpu]; 2756 2757 /* 2758 * Modify the load average atomically to avoid losing 2759 * anticipatory load updates (see lgrp_move_thread()). 2760 */ 2761 if (ageflag) { 2762 /* 2763 * We're supposed to both update and age the load. 2764 * This happens 10 times/sec. per cpu. We do a 2765 * little hoop-jumping to avoid integer overflow. 
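 * Roughly speaking, the arithmetic below amounts to new = old + (f * ((nrcpus << 9) - old)) / 65536 (modulo rounding), i.e. the load decays exponentially toward a value proportional to nrcpus with a per-tick factor of f / 65536; splitting old into its upper (q) and lower (r) halves before multiplying by f keeps each intermediate product small.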
2766 */ 2767 int64_t q, r; 2768 2769 do { 2770 old = new = lpl->lpl_loadavg; 2771 q = (old >> 16) << 7; 2772 r = (old & 0xffff) << 7; 2773 new += ((long long)(nrcpus - q) * f - 2774 ((r * f) >> 16)) >> 7; 2775 2776 /* 2777 * Check for overflow 2778 */ 2779 if (new > LGRP_LOADAVG_MAX) 2780 new = LGRP_LOADAVG_MAX; 2781 else if (new < 0) 2782 new = 0; 2783 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2784 new) != old); 2785 } else { 2786 /* 2787 * We're supposed to update the load, but not age it. 2788 * This option is used to update the load (which either 2789 * has already been aged in this 1/10 sec. interval or 2790 * soon will be) to account for a remotely executing 2791 * thread. 2792 */ 2793 do { 2794 old = new = lpl->lpl_loadavg; 2795 new += f; 2796 /* 2797 * Check for overflow 2798 * Underflow not possible here 2799 */ 2800 if (new < old) 2801 new = LGRP_LOADAVG_MAX; 2802 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2803 new) != old); 2804 } 2805 2806 /* 2807 * Do the same for this lpl's parent 2808 */ 2809 if ((lpl = lpl->lpl_parent) == NULL) 2810 break; 2811 ncpu = lpl->lpl_ncpu; 2812 } 2813 } 2814 2815 /* 2816 * Initialize lpl topology in the target based on topology currently present in 2817 * lpl_bootstrap. 2818 * 2819 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2820 * initialize the cp_default list of lpls. Up to this point all topology operations 2821 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2822 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2823 * `target' points to the list of lpls in cp_default and `size' is the size of 2824 * this list. 2825 * 2826 * This function walks the lpl topology in lpl_bootstrap and does four things: 2827 * 2828 * 1) Copies all fields from lpl_bootstrap to the target. 2829 * 2830 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2831 * 2832 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2833 * instead of lpl_bootstrap. 2834 * 2835 * 4) Updates pointers in the resource list of the target to point to the lpls 2836 * in the target list instead of lpl_bootstrap. 2837 * 2838 * After lpl_topo_bootstrap() completes, target contains the same information 2839 * that would be present there if it were used during boot instead of 2840 * lpl_bootstrap. There is no need for the information in lpl_bootstrap after this 2841 * and it is bzeroed. 2842 */ 2843 void 2844 lpl_topo_bootstrap(lpl_t *target, int size) 2845 { 2846 lpl_t *lpl = lpl_bootstrap; 2847 lpl_t *target_lpl = target; 2848 int howmany; 2849 int id; 2850 int i; 2851 2852 /* 2853 * The only target that should be passed here is the cp_default lpl list. 2854 */ 2855 ASSERT(target == cp_default.cp_lgrploads); 2856 ASSERT(size == cp_default.cp_nlgrploads); 2857 ASSERT(!lgrp_topo_initialized); 2858 ASSERT(ncpus == 1); 2859 2860 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2861 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2862 /* 2863 * Copy all fields from lpl. 2864 */ 2865 2866 *target_lpl = *lpl; 2867 2868 /* 2869 * Substitute CPU0 lpl pointer with one relative to target. 2870 */ 2871 if (lpl->lpl_cpus == CPU) { 2872 ASSERT(CPU->cpu_lpl == lpl); 2873 CPU->cpu_lpl = target_lpl; 2874 } 2875 2876 /* 2877 * Substitute parent information with parent relative to target.
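 * The parent pointer is rebased by taking its byte offset from lpl_bootstrap and adding that same offset to target, so it refers to the corresponding element of the cp_default list.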
2878 */ 2879 if (lpl->lpl_parent != NULL) 2880 target_lpl->lpl_parent = (lpl_t *) 2881 (((uintptr_t)lpl->lpl_parent - 2882 (uintptr_t)lpl_bootstrap) + 2883 (uintptr_t)target); 2884 2885 /* 2886 * Walk over resource set substituting pointers relative to 2887 * lpl_bootstrap to pointers relative to target. 2888 */ 2889 ASSERT(lpl->lpl_nrset <= 1); 2890 2891 for (id = 0; id < lpl->lpl_nrset; id++) { 2892 if (lpl->lpl_rset[id] != NULL) { 2893 target_lpl->lpl_rset[id] = 2894 (lpl_t *) 2895 (((uintptr_t)lpl->lpl_rset[id] - 2896 (uintptr_t)lpl_bootstrap) + 2897 (uintptr_t)target); 2898 } 2899 } 2900 } 2901 2902 /* 2903 * Topology information in lpl_bootstrap is no longer needed. 2904 */ 2905 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2906 } 2907 2908 /* the maximum effect that a single thread can have on it's lgroup's load */ 2909 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 2910 ((lgrp_loadavg_max_effect) / (ncpu)) 2911 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 2912 2913 /* 2914 * If the lowest load among the lgroups a process' threads are currently 2915 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2916 * expanding the process to a new lgroup. 2917 */ 2918 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2919 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2920 2921 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2922 ((lgrp_expand_proc_thresh) / (ncpu)) 2923 2924 /* 2925 * A process will be expanded to a new lgroup only if the difference between 2926 * the lowest load on the lgroups the process' thread's are currently spread 2927 * across and the lowest load on the other lgroups in the process' partition 2928 * is greater than lgrp_expand_proc_diff. 2929 */ 2930 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2931 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2932 2933 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2934 ((lgrp_expand_proc_diff) / (ncpu)) 2935 2936 /* 2937 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2938 * be present due to impreciseness of the load average decay algorithm. 2939 * 2940 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2941 * tolerance is scaled by the number of cpus in the lgroup just like 2942 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2943 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2944 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2945 */ 2946 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2947 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2948 ((lgrp_loadavg_tolerance) / ncpu) 2949 2950 /* 2951 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2952 * average is above this threshold 2953 */ 2954 uint32_t lgrp_load_thresh = UINT32_MAX; 2955 2956 /* 2957 * lgrp_choose() will try to skip any lgroups with less memory 2958 * than this free when choosing a home lgroup 2959 */ 2960 pgcnt_t lgrp_mem_free_thresh = 0; 2961 2962 /* 2963 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2964 * one based on one of the following policies: 2965 * - Random selection 2966 * - Pseudo round robin placement 2967 * - Longest time since a thread was last placed 2968 */ 2969 #define LGRP_CHOOSE_RANDOM 1 2970 #define LGRP_CHOOSE_RR 2 2971 #define LGRP_CHOOSE_TIME 3 2972 2973 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2974 2975 /* 2976 * Choose a suitable leaf lgroup for a kthread. 
The kthread is assumed not to 2977 * be bound to a CPU or processor set. 2978 * 2979 * Arguments: 2980 * t The thread 2981 * cpupart The partition the thread belongs to. 2982 * 2983 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 2984 * disabled, or thread_lock held (at splhigh) to protect against the CPU 2985 * partitions changing out from under us and assumes that given thread is 2986 * protected. Also, called sometimes w/ cpus paused or kernel preemption 2987 * disabled, so don't grab any locks because we should never block under 2988 * those conditions. 2989 */ 2990 lpl_t * 2991 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 2992 { 2993 lgrp_load_t bestload, bestrload; 2994 int lgrpid_offset, lgrp_count; 2995 lgrp_id_t lgrpid, lgrpid_start; 2996 lpl_t *lpl, *bestlpl, *bestrlpl; 2997 klgrpset_t lgrpset; 2998 proc_t *p; 2999 3000 ASSERT(t != NULL); 3001 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3002 THREAD_LOCK_HELD(t)); 3003 ASSERT(cpupart != NULL); 3004 3005 p = t->t_procp; 3006 3007 /* A process should always be in an active partition */ 3008 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3009 3010 bestlpl = bestrlpl = NULL; 3011 bestload = bestrload = LGRP_LOADAVG_MAX; 3012 lgrpset = cpupart->cp_lgrpset; 3013 3014 switch (lgrp_choose_policy) { 3015 case LGRP_CHOOSE_RR: 3016 lgrpid = cpupart->cp_lgrp_hint; 3017 do { 3018 if (++lgrpid > lgrp_alloc_max) 3019 lgrpid = 0; 3020 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3021 3022 break; 3023 default: 3024 case LGRP_CHOOSE_TIME: 3025 case LGRP_CHOOSE_RANDOM: 3026 klgrpset_nlgrps(lgrpset, lgrp_count); 3027 lgrpid_offset = 3028 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3029 for (lgrpid = 0; ; lgrpid++) { 3030 if (klgrpset_ismember(lgrpset, lgrpid)) { 3031 if (--lgrpid_offset == 0) 3032 break; 3033 } 3034 } 3035 break; 3036 } 3037 3038 lgrpid_start = lgrpid; 3039 3040 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3041 lgrp_id_t, cpupart->cp_lgrp_hint); 3042 3043 /* 3044 * Use lgroup affinities (if any) to choose best lgroup 3045 * 3046 * NOTE: Assumes that thread is protected from going away and its 3047 * lgroup affinities won't change (ie. p_lock, or 3048 * thread_lock() being held and/or CPUs paused) 3049 */ 3050 if (t->t_lgrp_affinity) { 3051 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 3052 if (lpl != NULL) 3053 return (lpl); 3054 } 3055 3056 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3057 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3058 3059 do { 3060 pgcnt_t npgs; 3061 3062 /* 3063 * Skip any lgroups outside of thread's pset 3064 */ 3065 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3066 if (++lgrpid > lgrp_alloc_max) 3067 lgrpid = 0; /* wrap the search */ 3068 continue; 3069 } 3070 3071 /* 3072 * Skip any non-leaf lgroups 3073 */ 3074 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3075 continue; 3076 3077 /* 3078 * Skip any lgroups without enough free memory 3079 * (when threshold set to nonzero positive value) 3080 */ 3081 if (lgrp_mem_free_thresh > 0) { 3082 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3083 if (npgs < lgrp_mem_free_thresh) { 3084 if (++lgrpid > lgrp_alloc_max) 3085 lgrpid = 0; /* wrap the search */ 3086 continue; 3087 } 3088 } 3089 3090 lpl = &cpupart->cp_lgrploads[lgrpid]; 3091 if (klgrpset_isempty(p->p_lgrpset) || 3092 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3093 /* 3094 * Either this is a new process or the process already 3095 * has threads on this lgrp, so this is a preferred 3096 * lgroup for the thread. 
3097 */ 3098 if (lpl_pick(lpl, bestlpl)) { 3099 bestload = lpl->lpl_loadavg; 3100 bestlpl = lpl; 3101 } 3102 } else { 3103 /* 3104 * The process doesn't have any threads on this lgrp, 3105 * but we're willing to consider this lgrp if the load 3106 * difference is big enough to justify splitting up 3107 * the process' threads. 3108 */ 3109 if (lpl_pick(lpl, bestrlpl)) { 3110 bestrload = lpl->lpl_loadavg; 3111 bestrlpl = lpl; 3112 } 3113 } 3114 if (++lgrpid > lgrp_alloc_max) 3115 lgrpid = 0; /* wrap the search */ 3116 } while (lgrpid != lgrpid_start); 3117 3118 /* 3119 * Return root lgroup if threshold isn't set to maximum value and 3120 * lowest lgroup load average more than a certain threshold 3121 */ 3122 if (lgrp_load_thresh != UINT32_MAX && 3123 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3124 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3125 3126 /* 3127 * If all the lgroups over which the thread's process is spread are 3128 * heavily loaded, we'll consider placing the thread on one of the 3129 * other leaf lgroups in the thread's partition. 3130 */ 3131 if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3132 (bestrload < bestload) && /* paranoid about wraparound */ 3133 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3134 bestload)) { 3135 bestlpl = bestrlpl; 3136 } 3137 3138 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3139 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3140 3141 ASSERT(bestlpl->lpl_ncpu > 0); 3142 return (bestlpl); 3143 } 3144 3145 /* 3146 * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing. 3147 */ 3148 static int 3149 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3150 { 3151 lgrp_load_t l1, l2; 3152 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3153 3154 3155 if (lpl2 == NULL) 3156 return (1); 3157 3158 l1 = lpl1->lpl_loadavg; 3159 l2 = lpl2->lpl_loadavg; 3160 3161 if ((l1 + tolerance < l2) && (l1 < l2)) { 3162 /* lpl1 is significantly less loaded than lpl2 */ 3163 return (1); 3164 } 3165 3166 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3167 l1 + tolerance >= l2 && l1 < l2 && 3168 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3169 /* 3170 * lpl1's load is within the tolerance of lpl2. We're 3171 * willing to consider it be to better however if 3172 * it has been longer since we last homed a thread there 3173 */ 3174 return (1); 3175 } 3176 3177 return (0); 3178 } 3179 3180 /* 3181 * An LWP is expected to be assigned to an lgroup for at least this long 3182 * for its anticipatory load to be justified. NOTE that this value should 3183 * not be set extremely huge (say, larger than 100 years), to avoid problems 3184 * with overflow in the calculation that uses it. 3185 */ 3186 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3187 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3188 3189 /* 3190 * Routine to change a thread's lgroup affiliation. This routine updates 3191 * the thread's kthread_t struct and its process' proc_t struct to note the 3192 * thread's new lgroup affiliation, and its lgroup affinities. 3193 * 3194 * Note that this is the only routine that modifies a thread's t_lpl field, 3195 * and that adds in or removes anticipatory load. 3196 * 3197 * If the thread is exiting, newlpl is NULL. 
3198 * 3199 * Locking: 3200 * The following lock must be held on entry: 3201 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3202 * doesn't get removed from t's partition 3203 * 3204 * This routine is not allowed to grab any locks, since it may be called 3205 * with cpus paused (such as from cpu_offline). 3206 */ 3207 void 3208 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3209 { 3210 proc_t *p; 3211 lpl_t *lpl, *oldlpl; 3212 lgrp_id_t oldid; 3213 kthread_t *tp; 3214 uint_t ncpu; 3215 lgrp_load_t old, new; 3216 3217 ASSERT(t); 3218 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3219 THREAD_LOCK_HELD(t)); 3220 3221 /* 3222 * If not changing lpls, just return 3223 */ 3224 if ((oldlpl = t->t_lpl) == newlpl) 3225 return; 3226 3227 /* 3228 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3229 * associated with process 0 rather than with its original process). 3230 */ 3231 if (t->t_proc_flag & TP_LWPEXIT) { 3232 if (newlpl != NULL) { 3233 t->t_lpl = newlpl; 3234 } 3235 return; 3236 } 3237 3238 p = ttoproc(t); 3239 3240 /* 3241 * If the thread had a previous lgroup, update its process' p_lgrpset 3242 * to account for it being moved from its old lgroup. 3243 */ 3244 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3245 (p->p_tlist != NULL)) { 3246 oldid = oldlpl->lpl_lgrpid; 3247 3248 if (newlpl != NULL) 3249 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3250 3251 if ((do_lgrpset_delete) && 3252 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3253 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3254 /* 3255 * Check if a thread other than the thread 3256 * that's moving is assigned to the same 3257 * lgroup as the thread that's moving. Note 3258 * that we have to compare lgroup IDs, rather 3259 * than simply comparing t_lpl's, since the 3260 * threads may belong to different partitions 3261 * but be assigned to the same lgroup. 3262 */ 3263 ASSERT(tp->t_lpl != NULL); 3264 3265 if ((tp != t) && 3266 (tp->t_lpl->lpl_lgrpid == oldid)) { 3267 /* 3268 * Another thread is assigned to the 3269 * same lgroup as the thread that's 3270 * moving, p_lgrpset doesn't change. 3271 */ 3272 break; 3273 } else if (tp == p->p_tlist) { 3274 /* 3275 * No other thread is assigned to the 3276 * same lgroup as the exiting thread, 3277 * clear the lgroup's bit in p_lgrpset. 3278 */ 3279 klgrpset_del(p->p_lgrpset, oldid); 3280 break; 3281 } 3282 } 3283 } 3284 3285 /* 3286 * If this thread was assigned to its old lgroup for such a 3287 * short amount of time that the anticipatory load that was 3288 * added on its behalf has aged very little, remove that 3289 * anticipatory load. 3290 */ 3291 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3292 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3293 lpl = oldlpl; 3294 for (;;) { 3295 do { 3296 old = new = lpl->lpl_loadavg; 3297 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3298 if (new > old) { 3299 /* 3300 * this can happen if the load 3301 * average was aged since we 3302 * added in the anticipatory 3303 * load 3304 */ 3305 new = 0; 3306 } 3307 } while (cas32( 3308 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3309 new) != old); 3310 3311 lpl = lpl->lpl_parent; 3312 if (lpl == NULL) 3313 break; 3314 3315 ncpu = lpl->lpl_ncpu; 3316 ASSERT(ncpu > 0); 3317 } 3318 } 3319 } 3320 /* 3321 * If the thread has a new lgroup (i.e. it's not exiting), update its 3322 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3323 * to its new lgroup to account for its move to its new lgroup. 
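 * The anticipatory load applied below is LGRP_LOADAVG_MAX_EFFECT(ncpu) added at each level from the new lpl up to the root (skipped entirely when the thread is homed to the root lgroup); the code above backs the same amount out again if the thread moves before lgrp_min_nsec has elapsed.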
3324 */ 3325 if (newlpl != NULL) { 3326 /* 3327 * This thread is moving to a new lgroup 3328 */ 3329 t->t_lpl = newlpl; 3330 3331 /* 3332 * Reflect move in load average of new lgroup 3333 * unless it is root lgroup 3334 */ 3335 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3336 return; 3337 3338 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3339 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3340 } 3341 3342 /* 3343 * It'll take some time for the load on the new lgroup 3344 * to reflect this thread's placement on it. We'd 3345 * like not, however, to have all threads between now 3346 * and then also piling on to this lgroup. To avoid 3347 * this pileup, we anticipate the load this thread 3348 * will generate on its new lgroup. The goal is to 3349 * make the lgroup's load appear as though the thread 3350 * had been there all along. We're very conservative 3351 * in calculating this anticipatory load, we assume 3352 * the worst case case (100% CPU-bound thread). This 3353 * may be modified in the future to be more accurate. 3354 */ 3355 lpl = newlpl; 3356 for (;;) { 3357 ncpu = lpl->lpl_ncpu; 3358 ASSERT(ncpu > 0); 3359 do { 3360 old = new = lpl->lpl_loadavg; 3361 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3362 /* 3363 * Check for overflow 3364 * Underflow not possible here 3365 */ 3366 if (new < old) 3367 new = UINT32_MAX; 3368 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3369 new) != old); 3370 3371 lpl = lpl->lpl_parent; 3372 if (lpl == NULL) 3373 break; 3374 } 3375 t->t_anttime = gethrtime(); 3376 } 3377 } 3378 3379 /* 3380 * Return lgroup memory allocation policy given advice from madvise(3C) 3381 */ 3382 lgrp_mem_policy_t 3383 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3384 { 3385 switch (advice) { 3386 case MADV_ACCESS_LWP: 3387 return (LGRP_MEM_POLICY_NEXT); 3388 case MADV_ACCESS_MANY: 3389 return (LGRP_MEM_POLICY_RANDOM); 3390 default: 3391 return (lgrp_mem_policy_default(size, type)); 3392 } 3393 } 3394 3395 /* 3396 * Figure out default policy 3397 */ 3398 lgrp_mem_policy_t 3399 lgrp_mem_policy_default(size_t size, int type) 3400 { 3401 cpupart_t *cp; 3402 lgrp_mem_policy_t policy; 3403 size_t pset_mem_size; 3404 3405 /* 3406 * Randomly allocate memory across lgroups for shared memory 3407 * beyond a certain threshold 3408 */ 3409 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3410 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3411 /* 3412 * Get total memory size of current thread's pset 3413 */ 3414 kpreempt_disable(); 3415 cp = curthread->t_cpupart; 3416 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3417 kpreempt_enable(); 3418 3419 /* 3420 * Choose policy to randomly allocate memory across 3421 * lgroups in pset if it will fit and is not default 3422 * partition. Otherwise, allocate memory randomly 3423 * across machine. 3424 */ 3425 if (lgrp_mem_pset_aware && size < pset_mem_size) 3426 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3427 else 3428 policy = LGRP_MEM_POLICY_RANDOM; 3429 } else 3430 /* 3431 * Apply default policy for private memory and 3432 * shared memory under the respective random 3433 * threshold. 
3434 */ 3435 policy = lgrp_mem_default_policy; 3436 3437 return (policy); 3438 } 3439 3440 /* 3441 * Get memory allocation policy for this segment 3442 */ 3443 lgrp_mem_policy_info_t * 3444 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3445 { 3446 lgrp_mem_policy_info_t *policy_info; 3447 extern struct seg_ops segspt_ops; 3448 extern struct seg_ops segspt_shmops; 3449 3450 /* 3451 * This is for binary compatibility to protect against third party 3452 * segment drivers which haven't recompiled to allow for 3453 * SEGOP_GETPOLICY() 3454 */ 3455 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3456 seg->s_ops != &segspt_shmops) 3457 return (NULL); 3458 3459 policy_info = NULL; 3460 if (seg->s_ops->getpolicy != NULL) 3461 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3462 3463 return (policy_info); 3464 } 3465 3466 /* 3467 * Set policy for allocating private memory given desired policy, policy info, 3468 * size in bytes of memory that policy is being applied. 3469 * Return 0 if policy wasn't set already and 1 if policy was set already 3470 */ 3471 int 3472 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3473 lgrp_mem_policy_info_t *policy_info, size_t size) 3474 { 3475 3476 ASSERT(policy_info != NULL); 3477 3478 if (policy == LGRP_MEM_POLICY_DEFAULT) 3479 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3480 3481 /* 3482 * Policy set already? 3483 */ 3484 if (policy == policy_info->mem_policy) 3485 return (1); 3486 3487 /* 3488 * Set policy 3489 */ 3490 policy_info->mem_policy = policy; 3491 policy_info->mem_reserved = 0; 3492 3493 return (0); 3494 } 3495 3496 3497 /* 3498 * Get shared memory allocation policy with given tree and offset 3499 */ 3500 lgrp_mem_policy_info_t * 3501 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3502 u_offset_t vn_off) 3503 { 3504 u_offset_t off; 3505 lgrp_mem_policy_info_t *policy_info; 3506 lgrp_shm_policy_seg_t *policy_seg; 3507 lgrp_shm_locality_t *shm_locality; 3508 avl_tree_t *tree; 3509 avl_index_t where; 3510 3511 /* 3512 * Get policy segment tree from anon_map or vnode and use specified 3513 * anon index or vnode offset as offset 3514 * 3515 * Assume that no lock needs to be held on anon_map or vnode, since 3516 * they should be protected by their reference count which must be 3517 * nonzero for an existing segment 3518 */ 3519 if (amp) { 3520 ASSERT(amp->refcnt != 0); 3521 shm_locality = amp->locality; 3522 if (shm_locality == NULL) 3523 return (NULL); 3524 tree = shm_locality->loc_tree; 3525 off = ptob(anon_index); 3526 } else if (vp) { 3527 shm_locality = vp->v_locality; 3528 if (shm_locality == NULL) 3529 return (NULL); 3530 ASSERT(shm_locality->loc_count != 0); 3531 tree = shm_locality->loc_tree; 3532 off = vn_off; 3533 } 3534 3535 if (tree == NULL) 3536 return (NULL); 3537 3538 /* 3539 * Lookup policy segment for offset into shared object and return 3540 * policy info 3541 */ 3542 rw_enter(&shm_locality->loc_lock, RW_READER); 3543 policy_info = NULL; 3544 policy_seg = avl_find(tree, &off, &where); 3545 if (policy_seg) 3546 policy_info = &policy_seg->shm_policy; 3547 rw_exit(&shm_locality->loc_lock); 3548 3549 return (policy_info); 3550 } 3551 3552 /* 3553 * Return lgroup to use for allocating memory 3554 * given the segment and address 3555 * 3556 * There isn't any mutual exclusion that exists between calls 3557 * to this routine and DR, so this routine and whomever calls it 3558 * should be mindful of the possibility that the lgrp returned 3559 * may be deleted. 
If this happens, dereferences of the lgrp 3560 * pointer will still be safe, but the resources in the lgrp will 3561 * be gone, and LGRP_EXISTS() will no longer be true. 3562 */ 3563 lgrp_t * 3564 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3565 { 3566 int i; 3567 lgrp_t *lgrp; 3568 klgrpset_t lgrpset; 3569 int lgrps_spanned; 3570 unsigned long off; 3571 lgrp_mem_policy_t policy; 3572 lgrp_mem_policy_info_t *policy_info; 3573 ushort_t random; 3574 int stat = 0; 3575 3576 /* 3577 * Just return null if the lgrp framework hasn't finished 3578 * initializing or if this is a UMA machine. 3579 */ 3580 if (nlgrps == 1 || !lgrp_initialized) 3581 return (lgrp_root); 3582 3583 /* 3584 * Get memory allocation policy for this segment 3585 */ 3586 policy = lgrp_mem_default_policy; 3587 if (seg != NULL) { 3588 if (seg->s_as == &kas) { 3589 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3590 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3591 policy = LGRP_MEM_POLICY_RANDOM; 3592 } else { 3593 policy_info = lgrp_mem_policy_get(seg, vaddr); 3594 if (policy_info != NULL) 3595 policy = policy_info->mem_policy; 3596 } 3597 } 3598 lgrpset = 0; 3599 3600 /* 3601 * Initialize lgroup to home by default 3602 */ 3603 lgrp = lgrp_home_lgrp(); 3604 3605 /* 3606 * When homing threads on root lgrp, override default memory 3607 * allocation policies with root lgroup memory allocation policy 3608 */ 3609 if (lgrp == lgrp_root) 3610 policy = lgrp_mem_policy_root; 3611 3612 /* 3613 * Implement policy 3614 */ 3615 switch (policy) { 3616 case LGRP_MEM_POLICY_NEXT_CPU: 3617 3618 /* 3619 * Return lgroup of current CPU which faulted on memory 3620 */ 3621 lgrp = lgrp_cpu_to_lgrp(CPU); 3622 break; 3623 3624 case LGRP_MEM_POLICY_NEXT: 3625 case LGRP_MEM_POLICY_DEFAULT: 3626 default: 3627 3628 /* 3629 * Just return current thread's home lgroup 3630 * for default policy (next touch) 3631 * If the thread is homed to the root, 3632 * then the default policy is random across lgroups. 3633 * Fallthrough to the random case. 3634 */ 3635 if (lgrp != lgrp_root) { 3636 if (policy == LGRP_MEM_POLICY_NEXT) 3637 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3638 else 3639 lgrp_stat_add(lgrp->lgrp_id, 3640 LGRP_NUM_DEFAULT, 1); 3641 break; 3642 } 3643 /* LINTED fallthrough on case statement */ 3644 case LGRP_MEM_POLICY_RANDOM: 3645 3646 /* 3647 * Return a random leaf lgroup with memory 3648 */ 3649 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3650 /* 3651 * Count how many lgroups are spanned 3652 */ 3653 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3654 3655 /* 3656 * There may be no memnodes in the root lgroup during DR copy 3657 * rename on a system with only two boards (memnodes) 3658 * configured. In this case just return the root lgrp. 
3659 */ 3660 if (lgrps_spanned == 0) { 3661 lgrp = lgrp_root; 3662 break; 3663 } 3664 3665 /* 3666 * Pick a random offset within lgroups spanned 3667 * and return lgroup at that offset 3668 */ 3669 random = (ushort_t)gethrtime() >> 4; 3670 off = random % lgrps_spanned; 3671 ASSERT(off <= lgrp_alloc_max); 3672 3673 for (i = 0; i <= lgrp_alloc_max; i++) { 3674 if (!klgrpset_ismember(lgrpset, i)) 3675 continue; 3676 if (off) 3677 off--; 3678 else { 3679 lgrp = lgrp_table[i]; 3680 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3681 1); 3682 break; 3683 } 3684 } 3685 break; 3686 3687 case LGRP_MEM_POLICY_RANDOM_PROC: 3688 3689 /* 3690 * Grab copy of bitmask of lgroups spanned by 3691 * this process 3692 */ 3693 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3694 stat = LGRP_NUM_RANDOM_PROC; 3695 3696 /* LINTED fallthrough on case statement */ 3697 case LGRP_MEM_POLICY_RANDOM_PSET: 3698 3699 if (!stat) 3700 stat = LGRP_NUM_RANDOM_PSET; 3701 3702 if (klgrpset_isempty(lgrpset)) { 3703 /* 3704 * Grab copy of bitmask of lgroups spanned by 3705 * this processor set 3706 */ 3707 kpreempt_disable(); 3708 klgrpset_copy(lgrpset, 3709 curthread->t_cpupart->cp_lgrpset); 3710 kpreempt_enable(); 3711 } 3712 3713 /* 3714 * Count how many lgroups are spanned 3715 */ 3716 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3717 ASSERT(lgrps_spanned <= nlgrps); 3718 3719 /* 3720 * Probably lgrps_spanned should be always non-zero, but to be 3721 * on the safe side we return lgrp_root if it is empty. 3722 */ 3723 if (lgrps_spanned == 0) { 3724 lgrp = lgrp_root; 3725 break; 3726 } 3727 3728 /* 3729 * Pick a random offset within lgroups spanned 3730 * and return lgroup at that offset 3731 */ 3732 random = (ushort_t)gethrtime() >> 4; 3733 off = random % lgrps_spanned; 3734 ASSERT(off <= lgrp_alloc_max); 3735 3736 for (i = 0; i <= lgrp_alloc_max; i++) { 3737 if (!klgrpset_ismember(lgrpset, i)) 3738 continue; 3739 if (off) 3740 off--; 3741 else { 3742 lgrp = lgrp_table[i]; 3743 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3744 1); 3745 break; 3746 } 3747 } 3748 break; 3749 3750 case LGRP_MEM_POLICY_ROUNDROBIN: 3751 3752 /* 3753 * Use offset within segment to determine 3754 * offset from home lgroup to choose for 3755 * next lgroup to allocate memory from 3756 */ 3757 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3758 (lgrp_alloc_max + 1); 3759 3760 kpreempt_disable(); 3761 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3762 i = lgrp->lgrp_id; 3763 kpreempt_enable(); 3764 3765 while (off > 0) { 3766 i = (i + 1) % (lgrp_alloc_max + 1); 3767 lgrp = lgrp_table[i]; 3768 if (klgrpset_ismember(lgrpset, i)) 3769 off--; 3770 } 3771 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3772 3773 break; 3774 } 3775 3776 ASSERT(lgrp != NULL); 3777 return (lgrp); 3778 } 3779 3780 /* 3781 * Return the number of pages in an lgroup 3782 * 3783 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3784 * could cause tests that rely on the numat driver to fail.... 
3785 */ 3786 pgcnt_t 3787 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3788 { 3789 lgrp_t *lgrp; 3790 3791 lgrp = lgrp_table[lgrpid]; 3792 if (!LGRP_EXISTS(lgrp) || 3793 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3794 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3795 return (0); 3796 3797 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3798 } 3799 3800 /* 3801 * Initialize lgroup shared memory allocation policy support 3802 */ 3803 void 3804 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3805 { 3806 lgrp_shm_locality_t *shm_locality; 3807 3808 /* 3809 * Initialize locality field in anon_map 3810 * Don't need any locks because this is called when anon_map is 3811 * allocated, but not used anywhere yet. 3812 */ 3813 if (amp) { 3814 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3815 if (amp->locality == NULL) { 3816 /* 3817 * Allocate and initialize shared memory locality info 3818 * and set anon_map locality pointer to it 3819 * Drop lock across kmem_alloc(KM_SLEEP) 3820 */ 3821 ANON_LOCK_EXIT(&amp->a_rwlock); 3822 shm_locality = kmem_alloc(sizeof (*shm_locality), 3823 KM_SLEEP); 3824 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3825 NULL); 3826 shm_locality->loc_count = 1; /* not used for amp */ 3827 shm_locality->loc_tree = NULL; 3828 3829 /* 3830 * Reacquire lock and check to see whether anyone beat 3831 * us to initializing the locality info 3832 */ 3833 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3834 if (amp->locality != NULL) { 3835 rw_destroy(&shm_locality->loc_lock); 3836 kmem_free(shm_locality, 3837 sizeof (*shm_locality)); 3838 } else 3839 amp->locality = shm_locality; 3840 } 3841 ANON_LOCK_EXIT(&amp->a_rwlock); 3842 return; 3843 } 3844 3845 /* 3846 * Allocate shared vnode policy info if vnode is not locality aware yet 3847 */ 3848 mutex_enter(&vp->v_lock); 3849 if ((vp->v_flag & V_LOCALITY) == 0) { 3850 /* 3851 * Allocate and initialize shared memory locality info 3852 */ 3853 mutex_exit(&vp->v_lock); 3854 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3855 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3856 shm_locality->loc_count = 1; 3857 shm_locality->loc_tree = NULL; 3858 3859 /* 3860 * Point vnode locality field at shared vnode policy info 3861 * and set locality aware flag in vnode 3862 */ 3863 mutex_enter(&vp->v_lock); 3864 if ((vp->v_flag & V_LOCALITY) == 0) { 3865 vp->v_locality = shm_locality; 3866 vp->v_flag |= V_LOCALITY; 3867 } else { 3868 /* 3869 * Lost race so free locality info and increment count.
3870 */ 3871 rw_destroy(&shm_locality->loc_lock); 3872 kmem_free(shm_locality, sizeof (*shm_locality)); 3873 shm_locality = vp->v_locality; 3874 shm_locality->loc_count++; 3875 } 3876 mutex_exit(&vp->v_lock); 3877 3878 return; 3879 } 3880 3881 /* 3882 * Increment reference count of number of segments mapping this vnode 3883 * shared 3884 */ 3885 shm_locality = vp->v_locality; 3886 shm_locality->loc_count++; 3887 mutex_exit(&vp->v_lock); 3888 } 3889 3890 /* 3891 * Destroy the given shared memory policy segment tree 3892 */ 3893 void 3894 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3895 { 3896 lgrp_shm_policy_seg_t *cur; 3897 lgrp_shm_policy_seg_t *next; 3898 3899 if (tree == NULL) 3900 return; 3901 3902 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3903 while (cur != NULL) { 3904 next = AVL_NEXT(tree, cur); 3905 avl_remove(tree, cur); 3906 kmem_free(cur, sizeof (*cur)); 3907 cur = next; 3908 } 3909 kmem_free(tree, sizeof (avl_tree_t)); 3910 } 3911 3912 /* 3913 * Uninitialize lgroup shared memory allocation policy support 3914 */ 3915 void 3916 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3917 { 3918 lgrp_shm_locality_t *shm_locality; 3919 3920 /* 3921 * For anon_map, deallocate shared memory policy tree and 3922 * zero locality field 3923 * Don't need any locks because anon_map is being freed 3924 */ 3925 if (amp) { 3926 if (amp->locality == NULL) 3927 return; 3928 shm_locality = amp->locality; 3929 shm_locality->loc_count = 0; /* not really used for amp */ 3930 rw_destroy(&shm_locality->loc_lock); 3931 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3932 kmem_free(shm_locality, sizeof (*shm_locality)); 3933 amp->locality = 0; 3934 return; 3935 } 3936 3937 /* 3938 * For vnode, decrement reference count of segments mapping this vnode 3939 * shared and delete locality info if reference count drops to 0 3940 */ 3941 mutex_enter(&vp->v_lock); 3942 shm_locality = vp->v_locality; 3943 shm_locality->loc_count--; 3944 3945 if (shm_locality->loc_count == 0) { 3946 rw_destroy(&shm_locality->loc_lock); 3947 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3948 kmem_free(shm_locality, sizeof (*shm_locality)); 3949 vp->v_locality = 0; 3950 vp->v_flag &= ~V_LOCALITY; 3951 } 3952 mutex_exit(&vp->v_lock); 3953 } 3954 3955 /* 3956 * Compare two shared memory policy segments 3957 * Used by AVL tree code for searching 3958 */ 3959 int 3960 lgrp_shm_policy_compar(const void *x, const void *y) 3961 { 3962 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 3963 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 3964 3965 if (a->shm_off < b->shm_off) 3966 return (-1); 3967 if (a->shm_off >= b->shm_off + b->shm_size) 3968 return (1); 3969 return (0); 3970 } 3971 3972 /* 3973 * Concatenate seg1 with seg2 and remove seg2 3974 */ 3975 static int 3976 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 3977 lgrp_shm_policy_seg_t *seg2) 3978 { 3979 if (!seg1 || !seg2 || 3980 seg1->shm_off + seg1->shm_size != seg2->shm_off || 3981 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 3982 return (-1); 3983 3984 seg1->shm_size += seg2->shm_size; 3985 avl_remove(tree, seg2); 3986 kmem_free(seg2, sizeof (*seg2)); 3987 return (0); 3988 } 3989 3990 /* 3991 * Split segment at given offset and return rightmost (uppermost) segment 3992 * Assumes that there are no overlapping segments 3993 */ 3994 static lgrp_shm_policy_seg_t * 3995 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 3996 u_offset_t off) 3997 { 3998 lgrp_shm_policy_seg_t *newseg; 3999 
avl_index_t where; 4000 4001 ASSERT(seg != NULL); 4002 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4003 4004 if (!seg || off < seg->shm_off || off > seg->shm_off + 4005 seg->shm_size) 4006 return (NULL); 4007 4008 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4009 return (seg); 4010 4011 /* 4012 * Adjust size of left segment and allocate new (right) segment 4013 */ 4014 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4015 newseg->shm_policy = seg->shm_policy; 4016 newseg->shm_off = off; 4017 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4018 seg->shm_size = off - seg->shm_off; 4019 4020 /* 4021 * Find where to insert new segment in AVL tree and insert it 4022 */ 4023 (void) avl_find(tree, &off, &where); 4024 avl_insert(tree, newseg, where); 4025 4026 return (newseg); 4027 } 4028 4029 /* 4030 * Set shared memory allocation policy on specified shared object at given 4031 * offset and length 4032 * 4033 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4034 * -1 if can't set policy. 4035 */ 4036 int 4037 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, 4038 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) 4039 { 4040 u_offset_t eoff; 4041 lgrp_shm_policy_seg_t *next; 4042 lgrp_shm_policy_seg_t *newseg; 4043 u_offset_t off; 4044 u_offset_t oldeoff; 4045 lgrp_shm_policy_seg_t *prev; 4046 int retval; 4047 lgrp_shm_policy_seg_t *seg; 4048 lgrp_shm_locality_t *shm_locality; 4049 avl_tree_t *tree; 4050 avl_index_t where; 4051 4052 ASSERT(amp || vp); 4053 ASSERT((len & PAGEOFFSET) == 0); 4054 4055 if (len == 0) 4056 return (-1); 4057 4058 retval = 0; 4059 4060 /* 4061 * Get locality info and starting offset into shared object 4062 * Try anon map first and then vnode 4063 * Assume that no locks need to be held on anon_map or vnode, since 4064 * it should be protected by its reference count which must be nonzero 4065 * for an existing segment. 4066 */ 4067 if (amp) { 4068 /* 4069 * Get policy info from anon_map 4070 * 4071 */ 4072 ASSERT(amp->refcnt != 0); 4073 if (amp->locality == NULL) 4074 lgrp_shm_policy_init(amp, NULL); 4075 shm_locality = amp->locality; 4076 off = ptob(anon_index); 4077 } else if (vp) { 4078 /* 4079 * Get policy info from vnode 4080 */ 4081 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) 4082 lgrp_shm_policy_init(NULL, vp); 4083 shm_locality = vp->v_locality; 4084 ASSERT(shm_locality->loc_count != 0); 4085 off = vn_off; 4086 } else 4087 return (-1); 4088 4089 ASSERT((off & PAGEOFFSET) == 0); 4090 4091 /* 4092 * Figure out default policy 4093 */ 4094 if (policy == LGRP_MEM_POLICY_DEFAULT) 4095 policy = lgrp_mem_policy_default(len, MAP_SHARED); 4096 4097 /* 4098 * Create AVL tree if there isn't one yet 4099 * and set locality field to point at it 4100 */ 4101 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4102 tree = shm_locality->loc_tree; 4103 if (!tree) { 4104 rw_exit(&shm_locality->loc_lock); 4105 4106 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 4107 4108 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4109 if (shm_locality->loc_tree == NULL) { 4110 avl_create(tree, lgrp_shm_policy_compar, 4111 sizeof (lgrp_shm_policy_seg_t), 4112 offsetof(lgrp_shm_policy_seg_t, shm_tree)); 4113 shm_locality->loc_tree = tree; 4114 } else { 4115 /* 4116 * Another thread managed to set up the tree 4117 * before we could. Free the tree we allocated 4118 * and use the one that's already there. 
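 * This is the same drop-lock / kmem_alloc(KM_SLEEP) / re-check
 * pattern used in lgrp_shm_policy_init(): the allocation is done
 * without loc_lock held, so the loser of the race simply frees its
 * copy and uses the tree installed by the winner.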
4119 */ 4120 kmem_free(tree, sizeof (*tree)); 4121 tree = shm_locality->loc_tree; 4122 } 4123 } 4124 4125 /* 4126 * Set policy 4127 * 4128 * Need to maintain hold on writer's lock to keep tree from 4129 * changing out from under us 4130 */ 4131 while (len != 0) { 4132 /* 4133 * Find policy segment for specified offset into shared object 4134 */ 4135 seg = avl_find(tree, &off, &where); 4136 4137 /* 4138 * Didn't find any existing segment that contains specified 4139 * offset, so allocate new segment, insert it, and concatenate 4140 * with adjacent segments if possible 4141 */ 4142 if (seg == NULL) { 4143 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), 4144 KM_SLEEP); 4145 newseg->shm_policy.mem_policy = policy; 4146 newseg->shm_policy.mem_reserved = 0; 4147 newseg->shm_off = off; 4148 avl_insert(tree, newseg, where); 4149 4150 /* 4151 * Check to see whether new segment overlaps with next 4152 * one, set length of new segment accordingly, and 4153 * calculate remaining length and next offset 4154 */ 4155 seg = AVL_NEXT(tree, newseg); 4156 if (seg == NULL || off + len <= seg->shm_off) { 4157 newseg->shm_size = len; 4158 len = 0; 4159 } else { 4160 newseg->shm_size = seg->shm_off - off; 4161 off = seg->shm_off; 4162 len -= newseg->shm_size; 4163 } 4164 4165 /* 4166 * Try to concatenate new segment with next and 4167 * previous ones, since they might have the same policy 4168 * now. Grab previous and next segments first because 4169 * they will change on concatenation. 4170 */ 4171 prev = AVL_PREV(tree, newseg); 4172 next = AVL_NEXT(tree, newseg); 4173 (void) lgrp_shm_policy_concat(tree, newseg, next); 4174 (void) lgrp_shm_policy_concat(tree, prev, newseg); 4175 4176 continue; 4177 } 4178 4179 eoff = off + len; 4180 oldeoff = seg->shm_off + seg->shm_size; 4181 4182 /* 4183 * Policy set already? 4184 */ 4185 if (policy == seg->shm_policy.mem_policy) { 4186 /* 4187 * Nothing left to do if offset and length 4188 * fall within this segment 4189 */ 4190 if (eoff <= oldeoff) { 4191 retval = 1; 4192 break; 4193 } else { 4194 len = eoff - oldeoff; 4195 off = oldeoff; 4196 continue; 4197 } 4198 } 4199 4200 /* 4201 * Specified offset and length match existing segment exactly 4202 */ 4203 if (off == seg->shm_off && len == seg->shm_size) { 4204 /* 4205 * Set policy and update current length 4206 */ 4207 seg->shm_policy.mem_policy = policy; 4208 seg->shm_policy.mem_reserved = 0; 4209 len = 0; 4210 4211 /* 4212 * Try concatenating new segment with previous and next 4213 * segments, since they might have the same policy now. 4214 * Grab previous and next segments first because they 4215 * will change on concatenation. 
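 * (lgrp_shm_policy_concat() removes and frees its second argument
 * when it succeeds, so both neighbors must be looked up before
 * either call is made.)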
4216 */ 4217 prev = AVL_PREV(tree, seg); 4218 next = AVL_NEXT(tree, seg); 4219 (void) lgrp_shm_policy_concat(tree, seg, next); 4220 (void) lgrp_shm_policy_concat(tree, prev, seg); 4221 } else { 4222 /* 4223 * Specified offset and length only apply to part of 4224 * existing segment 4225 */ 4226 4227 /* 4228 * New segment starts in middle of old one, so split 4229 * new one off near beginning of old one 4230 */ 4231 newseg = NULL; 4232 if (off > seg->shm_off) { 4233 newseg = lgrp_shm_policy_split(tree, seg, off); 4234 4235 /* 4236 * New segment ends where old one did, so try 4237 * to concatenate with next segment 4238 */ 4239 if (eoff == oldeoff) { 4240 newseg->shm_policy.mem_policy = policy; 4241 newseg->shm_policy.mem_reserved = 0; 4242 (void) lgrp_shm_policy_concat(tree, 4243 newseg, AVL_NEXT(tree, newseg)); 4244 break; 4245 } 4246 } 4247 4248 /* 4249 * New segment ends before old one, so split off end of 4250 * old one 4251 */ 4252 if (eoff < oldeoff) { 4253 if (newseg) { 4254 (void) lgrp_shm_policy_split(tree, 4255 newseg, eoff); 4256 newseg->shm_policy.mem_policy = policy; 4257 newseg->shm_policy.mem_reserved = 0; 4258 } else { 4259 (void) lgrp_shm_policy_split(tree, seg, 4260 eoff); 4261 seg->shm_policy.mem_policy = policy; 4262 seg->shm_policy.mem_reserved = 0; 4263 } 4264 4265 if (off == seg->shm_off) 4266 (void) lgrp_shm_policy_concat(tree, 4267 AVL_PREV(tree, seg), seg); 4268 break; 4269 } 4270 4271 /* 4272 * Calculate remaining length and next offset 4273 */ 4274 len = eoff - oldeoff; 4275 off = oldeoff; 4276 } 4277 } 4278 4279 rw_exit(&shm_locality->loc_lock); 4280 return (retval); 4281 } 4282 4283 /* 4284 * Return the best memnode from which to allocate memory given 4285 * an lgroup. 4286 * 4287 * "c" is for cookie, which is good enough for me. 4288 * It references a cookie struct that should be zero'ed to initialize. 4289 * The cookie should live on the caller's stack. 4290 * 4291 * The routine returns -1 when: 4292 * - the search scope is LGRP_SRCH_LOCAL and all the memnodes in "lgrp" have been returned. 4293 * - the search may walk up the lgroup hierarchy and all the memnodes in the system 4294 * have been returned. 4295 */ 4296 int 4297 lgrp_memnode_choose(lgrp_mnode_cookie_t *c) 4298 { 4299 lgrp_t *lp = c->lmc_lgrp; 4300 mnodeset_t nodes = c->lmc_nodes; 4301 int cnt = c->lmc_cnt; 4302 int offset, mnode; 4303 4304 extern int max_mem_nodes; 4305 4306 /* 4307 * If the set is empty, and the caller is willing, traverse 4308 * up the hierarchy until we find a non-empty set. 4309 */ 4310 while (nodes == (mnodeset_t)0 || cnt <= 0) { 4311 if (c->lmc_scope == LGRP_SRCH_LOCAL || 4312 ((lp = lp->lgrp_parent) == NULL)) 4313 return (-1); 4314 4315 nodes = lp->lgrp_mnodes & ~(c->lmc_tried); 4316 cnt = lp->lgrp_nmnodes - c->lmc_ntried; 4317 } 4318 4319 /* 4320 * Select a memnode by picking one at a "random" offset. 4321 * Because of DR, memnodes can come and go at any time. 4322 * This code must be able to cope with the possibility 4323 * that the nodes count "cnt" is inconsistent with respect 4324 * to the number of elements actually in "nodes", and 4325 * therefore that the offset chosen could be greater than 4326 * the number of elements in the set (some memnodes may 4327 * have disappeared just before cnt was read). 4328 * If this happens, the search simply wraps back to the 4329 * beginning of the set.
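 * More than one pass over the set may then be needed, but the scan
 * always terminates because "nodes" is non-empty and "offset" keeps
 * decreasing each time a set bit is visited.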
4330 */ 4331 ASSERT(nodes != (mnodeset_t)0 && cnt > 0); 4332 offset = c->lmc_rand % cnt; 4333 do { 4334 for (mnode = 0; mnode < max_mem_nodes; mnode++) 4335 if (nodes & ((mnodeset_t)1 << mnode)) 4336 if (!offset--) 4337 break; 4338 } while (mnode >= max_mem_nodes); 4339 4340 /* Found a node. Store state before returning. */ 4341 c->lmc_lgrp = lp; 4342 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); 4343 c->lmc_cnt = cnt - 1; 4344 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); 4345 c->lmc_ntried++; 4346 4347 return (mnode); 4348 } 4349
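/*
 * Example usage (a sketch only; it assumes the LGRP_MNODE_COOKIE_INIT()
 * initializer and the LGRP_SRCH_HIER search scope from <sys/lgrp.h>):
 * the caller keeps the cookie on its own stack, initializes it for a
 * starting lgroup, and calls lgrp_memnode_choose() until it returns -1.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(try to allocate pages from memnode "mnode")
 *	}
 */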