/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/chip.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework.  It is protected from parallel
 * modifications by lgrp_kstat_mutex.  This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup.  Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads).  The list is allocated after the first CPU is
 * brought on-line when cp_default is initialized by
 * cpupart_initialize_default().  Configuring CPU0 may create a two-level
 * topology with root and one leaf node containing CPU0.  This topology is
 * initially constructed in a special statically allocated 2-element lpl list
 * lpl_bootstrap_list and later cloned to cp_default when cp_default is
 * initialized.  The lpl_bootstrap_list is used for all lpl operations until
 * cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c.  Any other
 * consumer that needs a default lpl should use lpl_bootstrap, which is a
 * pointer to the first element of lpl_bootstrap_list.
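 *
 * For example, the bootstrap arrangement described above amounts to the
 * following (an illustrative sketch only, not code that appears verbatim
 * in this file):
 *
 *	lpl_bootstrap_list[0]	- lpl for the root lgroup (lgroup ID 0)
 *	lpl_bootstrap_list[1]	- lpl for cpu0's leaf lgroup, if one is
 *				  created (lgroup ID 1)
 *	lpl_bootstrap		== &lpl_bootstrap_list[0]
 *
 * so early code can simply point t0.t_lpl at lpl_bootstrap (as
 * lgrp_root_init() below does) until cp_default takes over.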
153 */ 154 #define LPL_BOOTSTRAP_SIZE 2 155 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 156 lpl_t *lpl_bootstrap; 157 158 static lgrp_t lroot; 159 160 161 /* 162 * Size, in bytes, beyond which random memory allocation policy is applied 163 * to non-shared memory. Default is the maximum size, so random memory 164 * allocation won't be used for non-shared memory by default. 165 */ 166 size_t lgrp_privm_random_thresh = (size_t)(-1); 167 168 /* 169 * Size, in bytes, beyond which random memory allocation policy is applied to 170 * shared memory. Default is 8MB (2 ISM pages). 171 */ 172 size_t lgrp_shm_random_thresh = 8*1024*1024; 173 174 /* 175 * Whether to do processor set aware memory allocation by default 176 */ 177 int lgrp_mem_pset_aware = 0; 178 179 /* 180 * Set the default memory allocation policy for root lgroup 181 */ 182 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 183 184 /* 185 * Set the default memory allocation policy. For most platforms, 186 * next touch is sufficient, but some platforms may wish to override 187 * this. 188 */ 189 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 190 191 192 /* 193 * lgroup CPU event handlers 194 */ 195 static void lgrp_cpu_init(struct cpu *); 196 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 197 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 198 199 static void lgrp_latency_change(u_longlong_t, u_longlong_t); 200 201 /* 202 * lgroup memory event handlers 203 */ 204 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 205 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 206 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 207 208 /* 209 * lgroup CPU partition event handlers 210 */ 211 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 212 static void lgrp_part_del_cpu(struct cpu *); 213 214 static void lgrp_root_init(void); 215 216 /* 217 * lpl topology 218 */ 219 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 220 static void lpl_clear(lpl_t *); 221 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 222 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 223 static void lpl_rset_add(lpl_t *, lpl_t *); 224 static void lpl_rset_del(lpl_t *, lpl_t *); 225 static int lpl_rset_contains(lpl_t *, lpl_t *); 226 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 227 static void lpl_child_update(lpl_t *, struct cpupart *); 228 static int lpl_pick(lpl_t *, lpl_t *); 229 static void lpl_verify_wrapper(struct cpupart *); 230 231 /* 232 * defines for lpl topology verifier return codes 233 */ 234 235 #define LPL_TOPO_CORRECT 0 236 #define LPL_TOPO_PART_HAS_NO_LPL -1 237 #define LPL_TOPO_CPUS_NOT_EMPTY -2 238 #define LPL_TOPO_LGRP_MISMATCH -3 239 #define LPL_TOPO_MISSING_PARENT -4 240 #define LPL_TOPO_PARENT_MISMATCH -5 241 #define LPL_TOPO_BAD_CPUCNT -6 242 #define LPL_TOPO_RSET_MISMATCH -7 243 #define LPL_TOPO_LPL_ORPHANED -8 244 #define LPL_TOPO_LPL_BAD_NCPU -9 245 #define LPL_TOPO_RSET_MSSNG_LF -10 246 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 247 #define LPL_TOPO_BOGUS_HINT -12 248 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 249 #define LPL_TOPO_LGRP_NOT_LEAF -14 250 #define LPL_TOPO_BAD_RSETCNT -15 251 252 /* 253 * Return whether lgroup optimizations should be enabled on this system 254 */ 255 int 256 lgrp_optimizations(void) 257 { 258 /* 259 * System must have more than 2 lgroups to enable lgroup optimizations 260 * 261 * XXX This assumes that a 2 lgroup system has an empty root lgroup 262 * with one child lgroup containing all the resources. 
	 * A 2 lgroup system with a root lgroup directly containing CPUs or
	 * memory might need lgroup optimizations with its child lgroup, but
	 * there isn't such a machine for now....
	 */
	if (nlgrps > 2)
		return (1);

	return (0);
}

/*
 * Build full lgroup topology
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_chips = NULL;
	lgrp_root->lgrp_chipcnt = 0;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap.  It is used for
	 * all topology operations until cp_default is initialized, at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}

/*
 * Lgroup initialization is split in two parts.  The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main.  The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized.  The
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * True when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
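 *
 * Roughly, the boot-time ordering of the lgroup entry points is as follows
 * (a sketch based on the comments above; the actual call sites live in
 * main() and the platform startup code, not in this file):
 *
 *	lgrp_init()		- platform init, sets nlgrpsmax
 *	lgrp_setup()		- root lgroup, cpu0's lgroup, t0's home
 *	startup(), /etc/system processing
 *	lgrp_main_init()	- sets lgrp_initialized
 *	start_other_cpus()
 *	lgrp_main_mp_init()	- sets lgrp_topo_initialized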
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;

	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here.  If mpo should be disabled, move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups.  This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();

	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUs are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated.  lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
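	 *
	 * A typical call, e.g. from the CPU online path (see lgrp_setup()
	 * above), looks like:
	 *
	 *	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)cp, 0);
	 *
	 * where "resource" carries the cpu_t pointer for the CPU coming
	 * online and "where" is unused for this particular event.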
481 */ 482 case LGRP_CONFIG_CPU_ADD: 483 lgrp_plat_config(event, resource); 484 atomic_add_32(&lgrp_gen, 1); 485 486 break; 487 case LGRP_CONFIG_CPU_DEL: 488 lgrp_plat_config(event, resource); 489 atomic_add_32(&lgrp_gen, 1); 490 491 break; 492 case LGRP_CONFIG_CPU_ONLINE: 493 cp = (cpu_t *)resource; 494 lgrp_cpu_init(cp); 495 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 496 rc = lpl_topo_verify(cp->cpu_part); 497 if (rc != LPL_TOPO_CORRECT) { 498 panic("lpl_topo_verify failed: %d", rc); 499 } 500 lgrp_plat_config(event, resource); 501 atomic_add_32(&lgrp_gen, 1); 502 503 break; 504 case LGRP_CONFIG_CPU_OFFLINE: 505 cp = (cpu_t *)resource; 506 id = cp->cpu_lpl->lpl_lgrpid; 507 lgrp_part_del_cpu(cp); 508 lgrp_cpu_fini(cp, id); 509 rc = lpl_topo_verify(cp->cpu_part); 510 if (rc != LPL_TOPO_CORRECT) { 511 panic("lpl_topo_verify failed: %d", rc); 512 } 513 lgrp_plat_config(event, resource); 514 atomic_add_32(&lgrp_gen, 1); 515 516 break; 517 case LGRP_CONFIG_CPUPART_ADD: 518 cp = (cpu_t *)resource; 519 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 520 rc = lpl_topo_verify(cp->cpu_part); 521 if (rc != LPL_TOPO_CORRECT) { 522 panic("lpl_topo_verify failed: %d", rc); 523 } 524 lgrp_plat_config(event, resource); 525 526 break; 527 case LGRP_CONFIG_CPUPART_DEL: 528 cp = (cpu_t *)resource; 529 lgrp_part_del_cpu((cpu_t *)resource); 530 rc = lpl_topo_verify(cp->cpu_part); 531 if (rc != LPL_TOPO_CORRECT) { 532 panic("lpl_topo_verify failed: %d", rc); 533 } 534 lgrp_plat_config(event, resource); 535 536 break; 537 /* 538 * The following events are initiated by the memnode 539 * subsystem. 540 */ 541 case LGRP_CONFIG_MEM_ADD: 542 lgrp_mem_init((int)resource, where, B_FALSE); 543 atomic_add_32(&lgrp_gen, 1); 544 545 break; 546 case LGRP_CONFIG_MEM_DEL: 547 lgrp_mem_fini((int)resource, where, B_FALSE); 548 atomic_add_32(&lgrp_gen, 1); 549 550 break; 551 case LGRP_CONFIG_MEM_RENAME: { 552 lgrp_config_mem_rename_t *ren_arg = 553 (lgrp_config_mem_rename_t *)where; 554 555 lgrp_mem_rename((int)resource, 556 ren_arg->lmem_rename_from, 557 ren_arg->lmem_rename_to); 558 atomic_add_32(&lgrp_gen, 1); 559 560 break; 561 } 562 case LGRP_CONFIG_GEN_UPDATE: 563 atomic_add_32(&lgrp_gen, 1); 564 565 break; 566 case LGRP_CONFIG_FLATTEN: 567 if (where == 0) 568 lgrp_topo_levels = (int)resource; 569 else 570 (void) lgrp_topo_flatten(resource, 571 lgrp_table, lgrp_alloc_max, &changed); 572 573 break; 574 /* 575 * Initiated by platform latency probing code 576 */ 577 case LGRP_CONFIG_LATENCY_CHANGE: 578 lgrp_latency_change((u_longlong_t)resource, 579 (u_longlong_t)where); 580 581 break; 582 case LGRP_CONFIG_NOP: 583 584 break; 585 default: 586 break; 587 } 588 589 } 590 591 /* 592 * Called to add lgrp info into cpu structure from cpu_add_unit; 593 * do not assume cpu is in cpu[] yet! 594 * 595 * CPUs are brought online with all other CPUs paused so we can't 596 * allocate memory or we could deadlock the system, so we rely on 597 * the platform to statically allocate as much space as we need 598 * for the lgrp structs and stats. 599 */ 600 static void 601 lgrp_cpu_init(struct cpu *cp) 602 { 603 klgrpset_t changed; 604 int count; 605 lgrp_handle_t hand; 606 int first_cpu; 607 lgrp_t *my_lgrp; 608 lgrp_id_t lgrpid; 609 struct cpu *cptr; 610 struct chip *chp; 611 612 /* 613 * This is the first time through if the resource set 614 * for the root lgroup is empty. 
After cpu0 has been 615 * initially added to an lgroup, the root's CPU resource 616 * set can never be empty, since the system's last CPU 617 * cannot be offlined. 618 */ 619 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 620 /* 621 * First time through. 622 */ 623 first_cpu = 1; 624 } else { 625 /* 626 * If cpu0 needs to move lgroups, we may come 627 * through here again, at which time cpu_lock won't 628 * be held, and lgrp_initialized will be false. 629 */ 630 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 631 ASSERT(cp->cpu_part != NULL); 632 first_cpu = 0; 633 } 634 635 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 636 my_lgrp = lgrp_hand_to_lgrp(hand); 637 638 if (my_lgrp == NULL) { 639 /* 640 * Create new lgrp and add it to lgroup topology 641 */ 642 my_lgrp = lgrp_create(); 643 my_lgrp->lgrp_plathand = hand; 644 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 645 lgrpid = my_lgrp->lgrp_id; 646 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 647 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 648 649 count = 0; 650 klgrpset_clear(changed); 651 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 652 &changed); 653 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 654 > 0) { 655 /* 656 * Leaf lgroup was created, but latency wasn't available 657 * then. So, set latency for it and fill in rest of lgroup 658 * topology now that we know how far it is from other leaf 659 * lgroups. 660 */ 661 lgrpid = my_lgrp->lgrp_id; 662 klgrpset_clear(changed); 663 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 664 lgrpid)) 665 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 666 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 667 &changed); 668 669 /* 670 * May have added new intermediate lgroups, so need to add 671 * resources other than CPUs which are added below 672 */ 673 (void) lgrp_mnode_update(changed, NULL); 674 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 675 my_lgrp->lgrp_id)) { 676 int i; 677 678 /* 679 * Update existing lgroup and lgroups containing it with CPU 680 * resource 681 */ 682 lgrpid = my_lgrp->lgrp_id; 683 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 684 for (i = 0; i <= lgrp_alloc_max; i++) { 685 lgrp_t *lgrp; 686 687 lgrp = lgrp_table[i]; 688 if (!LGRP_EXISTS(lgrp) || 689 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 690 continue; 691 692 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 693 } 694 } 695 696 lgrpid = my_lgrp->lgrp_id; 697 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 698 699 /* 700 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 701 * end up in lpl for lgroup 0 whether it is supposed to be in there or 702 * not since none of lgroup IDs in the lpl's have been set yet. 
703 */ 704 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 705 cp->cpu_lpl->lpl_lgrpid = lgrpid; 706 707 /* 708 * link the CPU into the lgrp's CPU list 709 */ 710 if (my_lgrp->lgrp_cpucnt == 0) { 711 my_lgrp->lgrp_cpu = cp; 712 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 713 } else { 714 cptr = my_lgrp->lgrp_cpu; 715 cp->cpu_next_lgrp = cptr; 716 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 717 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 718 cptr->cpu_prev_lgrp = cp; 719 } 720 my_lgrp->lgrp_cpucnt++; 721 722 /* 723 * Add this cpu's chip to the per lgroup list 724 * if necessary 725 */ 726 if (cp->cpu_chip->chip_lgrp == NULL) { 727 struct chip *lcpr; 728 729 chp = cp->cpu_chip; 730 731 if (my_lgrp->lgrp_chipcnt == 0) { 732 my_lgrp->lgrp_chips = chp; 733 chp->chip_next_lgrp = 734 chp->chip_prev_lgrp = chp; 735 } else { 736 lcpr = my_lgrp->lgrp_chips; 737 chp->chip_next_lgrp = lcpr; 738 chp->chip_prev_lgrp = 739 lcpr->chip_prev_lgrp; 740 lcpr->chip_prev_lgrp->chip_next_lgrp = 741 chp; 742 lcpr->chip_prev_lgrp = chp; 743 } 744 chp->chip_lgrp = my_lgrp; 745 chp->chip_balance = chp->chip_next_lgrp; 746 my_lgrp->lgrp_chipcnt++; 747 } 748 } 749 750 lgrp_t * 751 lgrp_create(void) 752 { 753 lgrp_t *my_lgrp; 754 lgrp_id_t lgrpid; 755 int i; 756 757 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 758 759 /* 760 * Find an open slot in the lgroup table and recycle unused lgroup 761 * left there if any 762 */ 763 my_lgrp = NULL; 764 if (lgrp_alloc_hint == -1) 765 /* 766 * Allocate from end when hint not set yet because no lgroups 767 * have been deleted yet 768 */ 769 lgrpid = nlgrps++; 770 else { 771 /* 772 * Start looking for next open slot from hint and leave hint 773 * at slot allocated 774 */ 775 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 776 my_lgrp = lgrp_table[i]; 777 if (!LGRP_EXISTS(my_lgrp)) { 778 lgrpid = i; 779 nlgrps++; 780 break; 781 } 782 } 783 lgrp_alloc_hint = lgrpid; 784 } 785 786 /* 787 * Keep track of max lgroup ID allocated so far to cut down on searches 788 */ 789 if (lgrpid > lgrp_alloc_max) 790 lgrp_alloc_max = lgrpid; 791 792 /* 793 * Need to allocate new lgroup if next open slot didn't have one 794 * for recycling 795 */ 796 if (my_lgrp == NULL) 797 my_lgrp = lgrp_plat_alloc(lgrpid); 798 799 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 800 panic("Too many lgrps for platform (%d)", nlgrps); 801 802 my_lgrp->lgrp_id = lgrpid; 803 my_lgrp->lgrp_latency = 0; 804 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 805 my_lgrp->lgrp_parent = NULL; 806 my_lgrp->lgrp_childcnt = 0; 807 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 808 my_lgrp->lgrp_nmnodes = 0; 809 klgrpset_clear(my_lgrp->lgrp_children); 810 klgrpset_clear(my_lgrp->lgrp_leaves); 811 for (i = 0; i < LGRP_RSRC_COUNT; i++) 812 klgrpset_clear(my_lgrp->lgrp_set[i]); 813 814 my_lgrp->lgrp_cpu = NULL; 815 my_lgrp->lgrp_cpucnt = 0; 816 my_lgrp->lgrp_chips = NULL; 817 my_lgrp->lgrp_chipcnt = 0; 818 819 if (my_lgrp->lgrp_kstat != NULL) 820 lgrp_kstat_reset(lgrpid); 821 822 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 823 824 return (my_lgrp); 825 } 826 827 void 828 lgrp_destroy(lgrp_t *lgrp) 829 { 830 int i; 831 832 /* 833 * Unless this lgroup is being destroyed on behalf of 834 * the boot CPU, cpu_lock must be held 835 */ 836 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 837 838 if (nlgrps == 1) 839 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 840 841 if (!LGRP_EXISTS(lgrp)) 842 return; 843 844 /* 845 * Set hint to lgroup being deleted and try to keep lower numbered 846 * hints to facilitate finding empty slots 847 */ 
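	/*
	 * For example (illustrative only): if lgroups 5 and then 3 are
	 * destroyed, lgrp_alloc_hint ends up at 3, so the next
	 * lgrp_create() starts its slot search at index 3 and recycles
	 * the lowest numbered free slot first.
	 */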
	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
		lgrp_alloc_hint = lgrp->lgrp_id;

	/*
	 * Mark this lgroup to be recycled by setting its lgroup ID to
	 * LGRP_NONE and clear relevant fields
	 */
	lgrp->lgrp_id = LGRP_NONE;
	lgrp->lgrp_latency = 0;
	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
	lgrp->lgrp_parent = NULL;
	lgrp->lgrp_childcnt = 0;

	klgrpset_clear(lgrp->lgrp_children);
	klgrpset_clear(lgrp->lgrp_leaves);
	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp->lgrp_set[i]);

	lgrp->lgrp_mnodes = (mnodeset_t)0;
	lgrp->lgrp_nmnodes = 0;

	lgrp->lgrp_cpu = NULL;
	lgrp->lgrp_cpucnt = 0;
	lgrp->lgrp_chipcnt = 0;
	lgrp->lgrp_chips = NULL;

	nlgrps--;
}

/*
 * Initialize kstat data.  Called from lgrp initialization code.
 */
static void
lgrp_kstat_init(void)
{
	lgrp_stat_t stat;

	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);

	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
		kstat_named_init(&lgrp_kstat_data[stat],
		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
}

/*
 * initialize an lgrp's kstats if needed
 * called with cpu_lock held but not with cpus paused.
 * we don't tear these down now because we don't know about
 * memory leaving the lgrp yet...
 */

void
lgrp_kstat_create(cpu_t *cp)
{
	kstat_t *lgrp_kstat;
	lgrp_id_t lgrpid;
	lgrp_t *my_lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	my_lgrp = lgrp_table[lgrpid];

	if (my_lgrp->lgrp_kstat != NULL)
		return;			/* already initialized */

	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

	if (lgrp_kstat != NULL) {
		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
		lgrp_kstat->ks_private = my_lgrp;
		lgrp_kstat->ks_data = &lgrp_kstat_data;
		lgrp_kstat->ks_update = lgrp_kstat_extract;
		my_lgrp->lgrp_kstat = lgrp_kstat;
		kstat_install(lgrp_kstat);
	}
}

/*
 * this will do something when we manage to remove now unused lgrps
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}

/*
 * Called when a CPU is off-lined.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t *my_lgrp;
	struct cpu *prev;
	struct cpu *next;
	chip_t  *chp;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * If the last CPU on its chip is being offlined
	 * then remove this chip from the per lgroup list.
	 *
	 * This is also done for the boot CPU when it needs
	 * to move between lgroups as a consequence of
	 * null proc lpa.
974 */ 975 chp = cp->cpu_chip; 976 if (chp->chip_ncpu == 0 || !lgrp_initialized) { 977 978 chip_t *chpp; 979 980 if (--my_lgrp->lgrp_chipcnt == 0) 981 my_lgrp->lgrp_chips = NULL; 982 else if (my_lgrp->lgrp_chips == chp) 983 my_lgrp->lgrp_chips = chp->chip_next_lgrp; 984 985 /* 986 * Walk this lgroup's chip list looking for chips that 987 * may try to balance against the one that's leaving 988 */ 989 for (chpp = chp->chip_next_lgrp; chpp != chp; 990 chpp = chpp->chip_next_lgrp) { 991 if (chpp->chip_balance == chp) 992 chpp->chip_balance = chp->chip_next_lgrp; 993 } 994 995 chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp; 996 chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp; 997 998 chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL; 999 chp->chip_lgrp = NULL; 1000 chp->chip_balance = NULL; 1001 } 1002 1003 /* 1004 * Removing last CPU in lgroup, so update lgroup topology 1005 */ 1006 if (my_lgrp->lgrp_cpucnt == 0) { 1007 klgrpset_t changed; 1008 int count; 1009 int i; 1010 1011 my_lgrp->lgrp_cpu = NULL; 1012 1013 /* 1014 * Remove this lgroup from its lgroup CPU resources and remove 1015 * lgroup from lgroup topology if it doesn't have any more 1016 * resources in it now 1017 */ 1018 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1019 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1020 count = 0; 1021 klgrpset_clear(changed); 1022 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1023 lgrp_alloc_max + 1, &changed); 1024 return; 1025 } 1026 1027 /* 1028 * This lgroup isn't empty, so just remove it from CPU 1029 * resources of any lgroups that contain it as such 1030 */ 1031 for (i = 0; i <= lgrp_alloc_max; i++) { 1032 lgrp_t *lgrp; 1033 1034 lgrp = lgrp_table[i]; 1035 if (!LGRP_EXISTS(lgrp) || 1036 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1037 lgrpid)) 1038 continue; 1039 1040 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1041 } 1042 return; 1043 } 1044 1045 if (my_lgrp->lgrp_cpu == cp) 1046 my_lgrp->lgrp_cpu = next; 1047 1048 } 1049 1050 /* 1051 * Update memory nodes in target lgroups and return ones that get changed 1052 */ 1053 int 1054 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1055 { 1056 int count; 1057 int i; 1058 int j; 1059 lgrp_t *lgrp; 1060 lgrp_t *lgrp_rsrc; 1061 1062 count = 0; 1063 if (changed) 1064 klgrpset_clear(*changed); 1065 1066 if (klgrpset_isempty(target)) 1067 return (0); 1068 1069 /* 1070 * Find each lgroup in target lgroups 1071 */ 1072 for (i = 0; i <= lgrp_alloc_max; i++) { 1073 /* 1074 * Skip any lgroups that don't exist or aren't in target group 1075 */ 1076 lgrp = lgrp_table[i]; 1077 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1078 continue; 1079 } 1080 1081 /* 1082 * Initialize memnodes for intermediate lgroups to 0 1083 * and update them from scratch since they may have completely 1084 * changed 1085 */ 1086 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1087 lgrp->lgrp_mnodes = (mnodeset_t)0; 1088 lgrp->lgrp_nmnodes = 0; 1089 } 1090 1091 /* 1092 * Update memory nodes of of target lgroup with memory nodes 1093 * from each lgroup in its lgroup memory resource set 1094 */ 1095 for (j = 0; j <= lgrp_alloc_max; j++) { 1096 int k; 1097 1098 /* 1099 * Skip any lgroups that don't exist or aren't in 1100 * memory resources of target lgroup 1101 */ 1102 lgrp_rsrc = lgrp_table[j]; 1103 if (!LGRP_EXISTS(lgrp_rsrc) || 1104 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1105 j)) 1106 continue; 1107 1108 /* 1109 * Update target lgroup's memnodes to include memnodes 1110 * of this lgroup 1111 */ 1112 for (k = 
			    0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}

/*
 * Memory copy-rename.  Called when the "mnode" containing the kernel cage
 * memory is moved from one board to another.  The "from" and "to" arguments
 * specify the source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another.  It removes the mnode from the source lgroup and re-inserts it in
 * the target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling.  If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from
 * the lgroup hierarchy.  This mnode is soon re-inserted back in the hierarchy
 * by lgrp_mem_init(), but there is a window when the system has no memory in
 * the lgroup hierarchy.  If another thread tries to allocate memory during
 * this window, the allocation will fail, although the system has physical
 * memory.  This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held which prevents lgrp_mem_init() from
 * re-inserting the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes.  To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed.  The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky.  Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers.  During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
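 *
 * To see why the root must never appear memory-empty, consider the kind of
 * upward walk a memory consumer does (a rough sketch only; the real
 * lgrp_memnode_choose() differs in detail):
 *
 *	lgrp = lgrp_table[lgrp_home_id(curthread)];
 *	while (lgrp->lgrp_nmnodes == 0 && lgrp != lgrp_root)
 *		lgrp = lgrp->lgrp_parent;
 *
 * If lgrp_root->lgrp_mnodes were cleared during the copy-rename window,
 * such a walk would find no memory anywhere in the hierarchy.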
1190 */ 1191 void 1192 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1193 { 1194 klgrpset_t changed; 1195 int count; 1196 int i; 1197 lgrp_t *my_lgrp; 1198 lgrp_id_t lgrpid; 1199 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1200 boolean_t drop_lock = B_FALSE; 1201 boolean_t need_synch = B_FALSE; 1202 1203 /* 1204 * Grab CPU lock (if we haven't already) 1205 */ 1206 if (!MUTEX_HELD(&cpu_lock)) { 1207 mutex_enter(&cpu_lock); 1208 drop_lock = B_TRUE; 1209 } 1210 1211 /* 1212 * This routine may be called from a context where we already 1213 * hold cpu_lock, and have already paused cpus. 1214 */ 1215 if (!cpus_paused()) 1216 need_synch = B_TRUE; 1217 1218 /* 1219 * Check if this mnode is already configured and return immediately if 1220 * it is. 1221 * 1222 * NOTE: in special case of copy-rename of the only remaining mnode, 1223 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1224 * recognize this case and continue as usual, but skip the update to 1225 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1226 * in topology, temporarily introduced by lgrp_mem_fini(). 1227 */ 1228 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1229 lgrp_root->lgrp_mnodes & mnodes_mask) { 1230 if (drop_lock) 1231 mutex_exit(&cpu_lock); 1232 return; 1233 } 1234 1235 /* 1236 * Update lgroup topology with new memory resources, keeping track of 1237 * which lgroups change 1238 */ 1239 count = 0; 1240 klgrpset_clear(changed); 1241 my_lgrp = lgrp_hand_to_lgrp(hand); 1242 if (my_lgrp == NULL) { 1243 /* new lgrp */ 1244 my_lgrp = lgrp_create(); 1245 lgrpid = my_lgrp->lgrp_id; 1246 my_lgrp->lgrp_plathand = hand; 1247 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1248 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1249 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1250 1251 if (need_synch) 1252 pause_cpus(NULL); 1253 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1254 &changed); 1255 if (need_synch) 1256 start_cpus(); 1257 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1258 > 0) { 1259 /* 1260 * Leaf lgroup was created, but latency wasn't available 1261 * then. So, set latency for it and fill in rest of lgroup 1262 * topology now that we know how far it is from other leaf 1263 * lgroups. 
1264 */ 1265 klgrpset_clear(changed); 1266 lgrpid = my_lgrp->lgrp_id; 1267 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1268 lgrpid)) 1269 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1270 if (need_synch) 1271 pause_cpus(NULL); 1272 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1273 &changed); 1274 if (need_synch) 1275 start_cpus(); 1276 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1277 my_lgrp->lgrp_id)) { 1278 klgrpset_add(changed, lgrpid); 1279 count = 1; 1280 1281 lgrpid = my_lgrp->lgrp_id; 1282 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1283 klgrpset_add(changed, lgrpid); 1284 count++; 1285 for (i = 0; i <= lgrp_alloc_max; i++) { 1286 lgrp_t *lgrp; 1287 1288 lgrp = lgrp_table[i]; 1289 if (!LGRP_EXISTS(lgrp) || 1290 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1291 continue; 1292 1293 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1294 klgrpset_add(changed, lgrp->lgrp_id); 1295 count++; 1296 } 1297 } 1298 1299 /* 1300 * Add memory node to lgroup and remove lgroup from ones that need 1301 * to be updated 1302 */ 1303 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1304 my_lgrp->lgrp_mnodes |= mnodes_mask; 1305 my_lgrp->lgrp_nmnodes++; 1306 } 1307 klgrpset_del(changed, lgrpid); 1308 1309 /* 1310 * Update memory node information for all lgroups that changed and 1311 * contain new memory node as a resource 1312 */ 1313 if (count) 1314 (void) lgrp_mnode_update(changed, NULL); 1315 1316 if (drop_lock) 1317 mutex_exit(&cpu_lock); 1318 } 1319 1320 /* 1321 * Called to indicate that the lgroup associated with the platform 1322 * handle "hand" no longer contains given memory node 1323 * 1324 * LOCKING for this routine is a bit tricky. Usually it is called without 1325 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1326 * callers. During DR of the board containing the caged memory it may be called 1327 * with cpu_lock already held and CPUs paused. 1328 * 1329 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1330 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1331 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1332 * the same mnode back into the topology. See lgrp_mem_rename() and 1333 * lgrp_mem_init() for additional details. 1334 */ 1335 void 1336 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1337 { 1338 klgrpset_t changed; 1339 int count; 1340 int i; 1341 lgrp_t *my_lgrp; 1342 lgrp_id_t lgrpid; 1343 mnodeset_t mnodes_mask; 1344 boolean_t drop_lock = B_FALSE; 1345 boolean_t need_synch = B_FALSE; 1346 1347 /* 1348 * Grab CPU lock (if we haven't already) 1349 */ 1350 if (!MUTEX_HELD(&cpu_lock)) { 1351 mutex_enter(&cpu_lock); 1352 drop_lock = B_TRUE; 1353 } 1354 1355 /* 1356 * This routine may be called from a context where we already 1357 * hold cpu_lock and have already paused cpus. 
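	 *
	 * The locking pattern used here (and in lgrp_mem_init() above) is
	 * roughly:
	 *
	 *	if (!MUTEX_HELD(&cpu_lock)) {
	 *		mutex_enter(&cpu_lock);
	 *		drop_lock = B_TRUE;
	 *	}
	 *	if (!cpus_paused())
	 *		need_synch = B_TRUE;
	 *	... pause_cpus(NULL)/start_cpus() only around topology
	 *	    changes, and only when need_synch ...
	 *	if (drop_lock)
	 *		mutex_exit(&cpu_lock);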
1358 */ 1359 if (!cpus_paused()) 1360 need_synch = B_TRUE; 1361 1362 my_lgrp = lgrp_hand_to_lgrp(hand); 1363 1364 /* 1365 * The lgrp *must* be pre-existing 1366 */ 1367 ASSERT(my_lgrp != NULL); 1368 1369 /* 1370 * Delete memory node from lgroups which contain it 1371 */ 1372 mnodes_mask = ((mnodeset_t)1 << mnode); 1373 for (i = 0; i <= lgrp_alloc_max; i++) { 1374 lgrp_t *lgrp = lgrp_table[i]; 1375 /* 1376 * Skip any non-existent lgroups and any lgroups that don't 1377 * contain leaf lgroup of memory as a memory resource 1378 */ 1379 if (!LGRP_EXISTS(lgrp) || 1380 !(lgrp->lgrp_mnodes & mnodes_mask)) 1381 continue; 1382 1383 /* 1384 * Avoid removing the last mnode from the root in the DR 1385 * copy-rename case. See lgrp_mem_rename() for details. 1386 */ 1387 if (is_copy_rename && 1388 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1389 continue; 1390 1391 /* 1392 * Remove memory node from lgroup. 1393 */ 1394 lgrp->lgrp_mnodes &= ~mnodes_mask; 1395 lgrp->lgrp_nmnodes--; 1396 ASSERT(lgrp->lgrp_nmnodes >= 0); 1397 } 1398 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1399 1400 /* 1401 * Don't need to update lgroup topology if this lgroup still has memory. 1402 * 1403 * In the special case of DR copy-rename with the only mnode being 1404 * removed, the lgrp_mnodes for the root is always non-zero, but we 1405 * still need to update the lgroup topology. 1406 */ 1407 if ((my_lgrp->lgrp_nmnodes > 0) && 1408 !(is_copy_rename && 1409 (my_lgrp == lgrp_root) && 1410 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1411 if (drop_lock) 1412 mutex_exit(&cpu_lock); 1413 return; 1414 } 1415 1416 /* 1417 * This lgroup does not contain any memory now 1418 */ 1419 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1420 1421 /* 1422 * Remove this lgroup from lgroup topology if it does not contain any 1423 * resources now 1424 */ 1425 lgrpid = my_lgrp->lgrp_id; 1426 count = 0; 1427 klgrpset_clear(changed); 1428 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1429 /* 1430 * Delete lgroup when no more resources 1431 */ 1432 if (need_synch) 1433 pause_cpus(NULL); 1434 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1435 lgrp_alloc_max + 1, &changed); 1436 ASSERT(count > 0); 1437 if (need_synch) 1438 start_cpus(); 1439 } else { 1440 /* 1441 * Remove lgroup from memory resources of any lgroups that 1442 * contain it as such 1443 */ 1444 for (i = 0; i <= lgrp_alloc_max; i++) { 1445 lgrp_t *lgrp; 1446 1447 lgrp = lgrp_table[i]; 1448 if (!LGRP_EXISTS(lgrp) || 1449 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1450 lgrpid)) 1451 continue; 1452 1453 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1454 } 1455 } 1456 if (drop_lock) 1457 mutex_exit(&cpu_lock); 1458 } 1459 1460 /* 1461 * Return lgroup with given platform handle 1462 */ 1463 lgrp_t * 1464 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1465 { 1466 int i; 1467 lgrp_t *lgrp; 1468 1469 if (hand == LGRP_NULL_HANDLE) 1470 return (NULL); 1471 1472 for (i = 0; i <= lgrp_alloc_max; i++) { 1473 lgrp = lgrp_table[i]; 1474 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1475 return (lgrp); 1476 } 1477 return (NULL); 1478 } 1479 1480 /* 1481 * Return the home lgroup of the current thread. 1482 * We must do this with kernel preemption disabled, since we don't want our 1483 * thread to be re-homed while we're poking around with its lpl, and the lpl 1484 * should never be NULL. 1485 * 1486 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1487 * is enabled because of DR. 
Callers can use disable kernel preemption 1488 * around this call to guarantee that the lgroup will be valid beyond this 1489 * routine, since kernel preemption can be recursive. 1490 */ 1491 lgrp_t * 1492 lgrp_home_lgrp(void) 1493 { 1494 lgrp_t *lgrp; 1495 lpl_t *lpl; 1496 1497 kpreempt_disable(); 1498 1499 lpl = curthread->t_lpl; 1500 ASSERT(lpl != NULL); 1501 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1502 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1503 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1504 1505 kpreempt_enable(); 1506 1507 return (lgrp); 1508 } 1509 1510 /* 1511 * Return ID of home lgroup for given thread 1512 * (See comments for lgrp_home_lgrp() for special care and handling 1513 * instructions) 1514 */ 1515 lgrp_id_t 1516 lgrp_home_id(kthread_t *t) 1517 { 1518 lgrp_id_t lgrp; 1519 lpl_t *lpl; 1520 1521 ASSERT(t != NULL); 1522 /* 1523 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1524 * cannot since the HAT layer can call into this routine to 1525 * determine the locality for its data structures in the context 1526 * of a page fault. 1527 */ 1528 1529 kpreempt_disable(); 1530 1531 lpl = t->t_lpl; 1532 ASSERT(lpl != NULL); 1533 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1534 lgrp = lpl->lpl_lgrpid; 1535 1536 kpreempt_enable(); 1537 1538 return (lgrp); 1539 } 1540 1541 /* 1542 * Return lgroup containing the physical memory for the given page frame number 1543 */ 1544 lgrp_t * 1545 lgrp_pfn_to_lgrp(pfn_t pfn) 1546 { 1547 lgrp_handle_t hand; 1548 int i; 1549 lgrp_t *lgrp; 1550 1551 hand = lgrp_plat_pfn_to_hand(pfn); 1552 if (hand != LGRP_NULL_HANDLE) 1553 for (i = 0; i <= lgrp_alloc_max; i++) { 1554 lgrp = lgrp_table[i]; 1555 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1556 return (lgrp); 1557 } 1558 return (NULL); 1559 } 1560 1561 /* 1562 * Return lgroup containing the physical memory for the given page frame number 1563 */ 1564 lgrp_t * 1565 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1566 { 1567 lgrp_handle_t hand; 1568 int i; 1569 lgrp_t *lgrp; 1570 pfn_t pfn; 1571 1572 pfn = btop(physaddr); 1573 hand = lgrp_plat_pfn_to_hand(pfn); 1574 if (hand != LGRP_NULL_HANDLE) 1575 for (i = 0; i <= lgrp_alloc_max; i++) { 1576 lgrp = lgrp_table[i]; 1577 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1578 return (lgrp); 1579 } 1580 return (NULL); 1581 } 1582 1583 /* 1584 * Return the leaf lgroup containing the given CPU 1585 */ 1586 static lgrp_t * 1587 lgrp_cpu_to_lgrp(cpu_t *cpu) 1588 { 1589 return (cpu->cpu_chip->chip_lgrp); 1590 } 1591 1592 /* 1593 * Return the sum of the partition loads in an lgrp divided by 1594 * the number of CPUs in the lgrp. This is our best approximation 1595 * of an 'lgroup load average' for a useful per-lgroup kstat. 
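 *
 * The value computed here is exported through the per-lgroup kstats set up
 * by lgrp_kstat_create() (kstat module "lgrp"), so it can be examined from
 * userland with kstat(1M), e.g. "kstat -m lgrp" (shown for illustration).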
1596 */ 1597 static uint64_t 1598 lgrp_sum_loadavgs(lgrp_t *lgrp) 1599 { 1600 cpu_t *cpu; 1601 int ncpu; 1602 uint64_t loads = 0; 1603 1604 mutex_enter(&cpu_lock); 1605 1606 cpu = lgrp->lgrp_cpu; 1607 ncpu = lgrp->lgrp_cpucnt; 1608 1609 if (cpu == NULL || ncpu == 0) { 1610 mutex_exit(&cpu_lock); 1611 return (0ull); 1612 } 1613 1614 do { 1615 loads += cpu->cpu_lpl->lpl_loadavg; 1616 cpu = cpu->cpu_next_lgrp; 1617 } while (cpu != lgrp->lgrp_cpu); 1618 1619 mutex_exit(&cpu_lock); 1620 1621 return (loads / ncpu); 1622 } 1623 1624 void 1625 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1626 { 1627 struct lgrp_stats *pstats; 1628 1629 /* 1630 * Verify that the caller isn't trying to add to 1631 * a statistic for an lgroup that has gone away 1632 */ 1633 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1634 return; 1635 1636 pstats = &lgrp_stats[lgrpid]; 1637 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1638 } 1639 1640 int64_t 1641 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1642 { 1643 uint64_t val; 1644 struct lgrp_stats *pstats; 1645 1646 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1647 return ((int64_t)0); 1648 1649 pstats = &lgrp_stats[lgrpid]; 1650 LGRP_STAT_READ(pstats, stat, val); 1651 return (val); 1652 } 1653 1654 /* 1655 * Reset all kstats for lgrp specified by its lgrpid. 1656 */ 1657 static void 1658 lgrp_kstat_reset(lgrp_id_t lgrpid) 1659 { 1660 lgrp_stat_t stat; 1661 1662 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1663 return; 1664 1665 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1666 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1667 } 1668 } 1669 1670 /* 1671 * Collect all per-lgrp statistics for the lgrp associated with this 1672 * kstat, and store them in the ks_data array. 1673 * 1674 * The superuser can reset all the running counter statistics for an 1675 * lgrp by writing to any of the lgrp's stats. 1676 */ 1677 static int 1678 lgrp_kstat_extract(kstat_t *ksp, int rw) 1679 { 1680 lgrp_stat_t stat; 1681 struct kstat_named *ksd; 1682 lgrp_t *lgrp; 1683 lgrp_id_t lgrpid; 1684 1685 lgrp = (lgrp_t *)ksp->ks_private; 1686 1687 ksd = (struct kstat_named *)ksp->ks_data; 1688 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1689 1690 lgrpid = lgrp->lgrp_id; 1691 1692 if (lgrpid == LGRP_NONE) { 1693 /* 1694 * Return all zeroes as stats for freed lgrp. 
1695 */ 1696 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1697 ksd[stat].value.i64 = 0; 1698 } 1699 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1700 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1701 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1702 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1703 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1704 } else if (rw != KSTAT_WRITE) { 1705 /* 1706 * Handle counter stats 1707 */ 1708 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1709 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1710 } 1711 1712 /* 1713 * Handle kernel data snapshot stats 1714 */ 1715 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1716 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1717 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1718 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1719 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1720 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1721 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1722 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1723 } else { 1724 lgrp_kstat_reset(lgrpid); 1725 } 1726 1727 return (0); 1728 } 1729 1730 int 1731 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1732 { 1733 cpu_t *cp; 1734 1735 mutex_enter(&cpu_lock); 1736 1737 if ((cp = cpu_get(id)) == NULL) { 1738 mutex_exit(&cpu_lock); 1739 return (EINVAL); 1740 } 1741 1742 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1743 mutex_exit(&cpu_lock); 1744 return (EINVAL); 1745 } 1746 1747 ASSERT(cp->cpu_lpl != NULL); 1748 1749 *lp = cp->cpu_lpl->lpl_lgrpid; 1750 1751 mutex_exit(&cpu_lock); 1752 1753 return (0); 1754 } 1755 1756 int 1757 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1758 { 1759 cpu_t *cp; 1760 1761 mutex_enter(&cpu_lock); 1762 1763 if ((cp = cpu_get(id)) == NULL) { 1764 mutex_exit(&cpu_lock); 1765 return (EINVAL); 1766 } 1767 1768 ASSERT(cp->cpu_lpl != NULL); 1769 1770 *lp = cp->cpu_lpl->lpl_loadavg; 1771 1772 mutex_exit(&cpu_lock); 1773 1774 return (0); 1775 } 1776 1777 void 1778 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime) 1779 { 1780 lgrp_t *lgrp; 1781 int i; 1782 1783 for (i = 0; i <= lgrp_alloc_max; i++) { 1784 lgrp = lgrp_table[i]; 1785 1786 if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime)) 1787 lgrp->lgrp_latency = (int)newtime; 1788 } 1789 } 1790 1791 /* 1792 * Add a resource named by lpl_leaf to rset of lpl_target 1793 * 1794 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1795 * resource. It is adjusted here, as this is presently the only place that we 1796 * can be certain a resource addition has succeeded. 1797 * 1798 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1799 * list in order until it reaches a NULL. (This list is required to be NULL 1800 * terminated, too). This is done so that we can mark start pos + 1, so that 1801 * each lpl is traversed sequentially, but in a different order. We hope this 1802 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 
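 *
 * For example (illustrative): if lpl_target's rset currently holds the
 * leaves of lgroups { 1, 3, 5 } and lpl_leaf belongs to lgroup 4, the leaf
 * is inserted in sorted position to give { 1, 3, 4, 5 }, lpl_nrset goes
 * from 3 to 4, and lpl_ncpu grows by lpl_leaf->lpl_ncpu.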
1803 */ 1804 1805 void 1806 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1807 { 1808 int i; 1809 int entry_slot = 0; 1810 1811 /* return if leaf is already present */ 1812 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1813 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1814 return; 1815 } 1816 1817 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1818 lpl_leaf->lpl_lgrpid) { 1819 break; 1820 } 1821 } 1822 1823 /* insert leaf, update counts */ 1824 entry_slot = i; 1825 i = lpl_target->lpl_nrset++; 1826 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1827 panic("More leaf lgrps in system than are supported!\n"); 1828 } 1829 1830 /* 1831 * Start at the end of the rset array and work backwards towards the 1832 * slot into which the new lpl will be inserted. This effectively 1833 * preserves the current ordering by scooting everybody over one entry, 1834 * and placing the new entry into the space created. 1835 */ 1836 1837 while (i-- > entry_slot) { 1838 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1839 } 1840 1841 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1842 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1843 } 1844 1845 /* 1846 * Update each of lpl_parent's children with a proper hint and 1847 * a reference to their parent. 1848 * The lgrp topology is used as the reference since it is fully 1849 * consistent and correct at this point. 1850 * 1851 * Each child's hint will reference an element in lpl_parent's 1852 * rset that designates where the child should start searching 1853 * for CPU resources. The hint selected is the highest order leaf present 1854 * in the child's lineage. 1855 * 1856 * This should be called after any potential change in lpl_parent's 1857 * rset. 1858 */ 1859 static void 1860 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1861 { 1862 klgrpset_t children, leaves; 1863 lpl_t *lpl; 1864 int hint; 1865 int i, j; 1866 1867 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1868 if (klgrpset_isempty(children)) 1869 return; /* nothing to do */ 1870 1871 for (i = 0; i <= lgrp_alloc_max; i++) { 1872 if (klgrpset_ismember(children, i)) { 1873 1874 /* 1875 * Given the set of leaves in this child's lineage, 1876 * find the highest order leaf present in the parent's 1877 * rset. Select this as the hint for the child. 1878 */ 1879 leaves = lgrp_table[i]->lgrp_leaves; 1880 hint = 0; 1881 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1882 lpl = lpl_parent->lpl_rset[j]; 1883 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1884 hint = j; 1885 } 1886 cp->cp_lgrploads[i].lpl_hint = hint; 1887 1888 /* 1889 * (Re)set the parent. It may be incorrect if 1890 * lpl_parent is new in the topology. 1891 */ 1892 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1893 } 1894 } 1895 } 1896 1897 /* 1898 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1899 * 1900 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1901 * resource. The values are adjusted here, as this is the only place that we can 1902 * be certain a resource was successfully deleted. 
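 *
 * For example (illustrative): deleting the lgroup 4 leaf from an rset of
 * { 1, 3, 4, 5 } compresses the array back to { 1, 3, 5 }, keeps it NULL
 * terminated, and decrements lpl_nrset and lpl_ncpu.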
1903 */ 1904 void 1905 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1906 { 1907 int i; 1908 1909 /* find leaf in intermediate node */ 1910 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1911 if (lpl_target->lpl_rset[i] == lpl_leaf) 1912 break; 1913 } 1914 1915 /* return if leaf not found */ 1916 if (lpl_target->lpl_rset[i] != lpl_leaf) 1917 return; 1918 1919 /* prune leaf, compress array */ 1920 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1921 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1922 lpl_target->lpl_ncpu--; 1923 do { 1924 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1925 } while (i++ < lpl_target->lpl_nrset); 1926 } 1927 1928 /* 1929 * Check to see if the resource set of the target lpl contains the 1930 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1931 */ 1932 1933 int 1934 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1935 { 1936 int i; 1937 1938 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1939 if (lpl_target->lpl_rset[i] == lpl_leaf) 1940 return (1); 1941 } 1942 1943 return (0); 1944 } 1945 1946 /* 1947 * Called when we change cpu lpl membership. This increments or decrements the 1948 * per-cpu counter in every lpl in which our leaf appears. 1949 */ 1950 void 1951 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1952 { 1953 cpupart_t *cpupart; 1954 lgrp_t *lgrp_leaf; 1955 lgrp_t *lgrp_cur; 1956 lpl_t *lpl_leaf; 1957 lpl_t *lpl_cur; 1958 int i; 1959 1960 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1961 1962 cpupart = cp->cpu_part; 1963 lpl_leaf = cp->cpu_lpl; 1964 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1965 1966 for (i = 0; i <= lgrp_alloc_max; i++) { 1967 lgrp_cur = lgrp_table[i]; 1968 1969 /* 1970 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 1971 * for the cpu in question, or if the current lgrp and leaf 1972 * don't share the same resources. 1973 */ 1974 1975 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 1976 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 1977 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 1978 continue; 1979 1980 1981 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 1982 1983 if (lpl_cur->lpl_nrset > 0) { 1984 if (act == LPL_INCREMENT) { 1985 lpl_cur->lpl_ncpu++; 1986 } else if (act == LPL_DECREMENT) { 1987 lpl_cur->lpl_ncpu--; 1988 } 1989 } 1990 } 1991 } 1992 1993 /* 1994 * Initialize lpl with given resources and specified lgrp 1995 */ 1996 1997 void 1998 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 1999 { 2000 lpl->lpl_lgrpid = lgrp->lgrp_id; 2001 lpl->lpl_loadavg = 0; 2002 if (lpl == lpl_leaf) 2003 lpl->lpl_ncpu = 1; 2004 else 2005 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2006 lpl->lpl_nrset = 1; 2007 lpl->lpl_rset[0] = lpl_leaf; 2008 lpl->lpl_lgrp = lgrp; 2009 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2010 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2011 } 2012 2013 /* 2014 * Clear an unused lpl 2015 */ 2016 2017 void 2018 lpl_clear(lpl_t *lpl) 2019 { 2020 lgrpid_t lid; 2021 2022 /* save lid for debugging purposes */ 2023 lid = lpl->lpl_lgrpid; 2024 bzero(lpl, sizeof (lpl_t)); 2025 lpl->lpl_lgrpid = lid; 2026 } 2027 2028 /* 2029 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2030 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2031 * make full use of all of the lgroup topology, but this checks to make sure 2032 * that for the parts that it does use, it has correctly understood the 2033 * relationships that exist. 
This function returns 2034 * 0 if the topology is correct, and a non-zero error code, for non-debug 2035 * kernels if incorrect. Asserts are spread throughout the code to aid in 2036 * debugging on a DEBUG kernel. 2037 */ 2038 int 2039 lpl_topo_verify(cpupart_t *cpupart) 2040 { 2041 lgrp_t *lgrp; 2042 lpl_t *lpl; 2043 klgrpset_t rset; 2044 klgrpset_t cset; 2045 cpu_t *cpu; 2046 cpu_t *cp_start; 2047 int i; 2048 int j; 2049 int sum; 2050 2051 /* topology can't be incorrect if it doesn't exist */ 2052 if (!lgrp_topo_initialized || !lgrp_initialized) 2053 return (LPL_TOPO_CORRECT); 2054 2055 ASSERT(cpupart != NULL); 2056 2057 for (i = 0; i <= lgrp_alloc_max; i++) { 2058 lgrp = lgrp_table[i]; 2059 lpl = NULL; 2060 /* make sure lpls are allocated */ 2061 ASSERT(cpupart->cp_lgrploads); 2062 if (!cpupart->cp_lgrploads) 2063 return (LPL_TOPO_PART_HAS_NO_LPL); 2064 2065 lpl = &cpupart->cp_lgrploads[i]; 2066 /* make sure our index is good */ 2067 ASSERT(i < cpupart->cp_nlgrploads); 2068 2069 /* if lgroup doesn't exist, make sure lpl is empty */ 2070 if (!LGRP_EXISTS(lgrp)) { 2071 ASSERT(lpl->lpl_ncpu == 0); 2072 if (lpl->lpl_ncpu > 0) { 2073 return (LPL_TOPO_CPUS_NOT_EMPTY); 2074 } else { 2075 continue; 2076 } 2077 } 2078 2079 /* verify that lgroup and lpl are identically numbered */ 2080 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2081 2082 /* if lgroup isn't in our partition, make sure lpl is empty */ 2083 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2084 cpupart->cp_lgrpset)) { 2085 ASSERT(lpl->lpl_ncpu == 0); 2086 if (lpl->lpl_ncpu > 0) { 2087 return (LPL_TOPO_CPUS_NOT_EMPTY); 2088 } 2089 /* 2090 * lpl is empty, and lgroup isn't in partition. verify 2091 * that lpl doesn't show up in anyone else's rsets (in 2092 * this partition, anyway) 2093 */ 2094 2095 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2096 lpl_t *i_lpl; /* lpl we're iterating over */ 2097 2098 i_lpl = &cpupart->cp_lgrploads[j]; 2099 2100 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2101 if (lpl_rset_contains(i_lpl, lpl)) { 2102 return (LPL_TOPO_LPL_ORPHANED); 2103 } 2104 } 2105 /* lgroup is empty, and everything is ok. continue */ 2106 continue; 2107 } 2108 2109 2110 /* lgroup is in this partition, now check it against lpl */ 2111 2112 /* do both have matching lgrps? */ 2113 ASSERT(lgrp == lpl->lpl_lgrp); 2114 if (lgrp != lpl->lpl_lgrp) { 2115 return (LPL_TOPO_LGRP_MISMATCH); 2116 } 2117 2118 /* do the parent lgroups exist and do they match? */ 2119 if (lgrp->lgrp_parent) { 2120 ASSERT(lpl->lpl_parent); 2121 ASSERT(lgrp->lgrp_parent->lgrp_id == 2122 lpl->lpl_parent->lpl_lgrpid); 2123 2124 if (!lpl->lpl_parent) { 2125 return (LPL_TOPO_MISSING_PARENT); 2126 } else if (lgrp->lgrp_parent->lgrp_id != 2127 lpl->lpl_parent->lpl_lgrpid) { 2128 return (LPL_TOPO_PARENT_MISMATCH); 2129 } 2130 } 2131 2132 /* only leaf lgroups keep a cpucnt, only check leaves */ 2133 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2134 2135 /* verify that lgrp is also a leaf */ 2136 ASSERT((lgrp->lgrp_childcnt == 0) && 2137 (klgrpset_ismember(lgrp->lgrp_leaves, 2138 lpl->lpl_lgrpid))); 2139 2140 if ((lgrp->lgrp_childcnt > 0) || 2141 (!klgrpset_ismember(lgrp->lgrp_leaves, 2142 lpl->lpl_lgrpid))) { 2143 return (LPL_TOPO_LGRP_NOT_LEAF); 2144 } 2145 2146 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2147 (lpl->lpl_ncpu > 0)); 2148 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2149 (lpl->lpl_ncpu <= 0)) { 2150 return (LPL_TOPO_BAD_CPUCNT); 2151 } 2152 2153 /* 2154 * Check that lpl_ncpu also matches the number of 2155 * cpus in the lpl's linked list. 
This only exists in 2156 * leaves, but they should always match. 2157 */ 2158 j = 0; 2159 cpu = cp_start = lpl->lpl_cpus; 2160 while (cpu != NULL) { 2161 j++; 2162 2163 /* check to make sure cpu's lpl is leaf lpl */ 2164 ASSERT(cpu->cpu_lpl == lpl); 2165 if (cpu->cpu_lpl != lpl) { 2166 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2167 } 2168 2169 /* check next cpu */ 2170 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2171 continue; 2172 } else { 2173 cpu = NULL; 2174 } 2175 } 2176 2177 ASSERT(j == lpl->lpl_ncpu); 2178 if (j != lpl->lpl_ncpu) { 2179 return (LPL_TOPO_LPL_BAD_NCPU); 2180 } 2181 2182 /* 2183 * Also, check that leaf lpl is contained in all 2184 * intermediate lpls that name the leaf as a descendant 2185 */ 2186 2187 for (j = 0; j <= lgrp_alloc_max; j++) { 2188 klgrpset_t intersect; 2189 lgrp_t *lgrp_cand; 2190 lpl_t *lpl_cand; 2191 2192 lgrp_cand = lgrp_table[j]; 2193 intersect = klgrpset_intersects( 2194 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2195 cpupart->cp_lgrpset); 2196 2197 if (!LGRP_EXISTS(lgrp_cand) || 2198 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2199 cpupart->cp_lgrpset) || 2200 (intersect == 0)) 2201 continue; 2202 2203 lpl_cand = 2204 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2205 2206 if (klgrpset_ismember(intersect, 2207 lgrp->lgrp_id)) { 2208 ASSERT(lpl_rset_contains(lpl_cand, 2209 lpl)); 2210 2211 if (!lpl_rset_contains(lpl_cand, lpl)) { 2212 return (LPL_TOPO_RSET_MSSNG_LF); 2213 } 2214 } 2215 } 2216 2217 } else { /* non-leaf specific checks */ 2218 2219 /* 2220 * Non-leaf lpls should have lpl_cpus == NULL 2221 * verify that this is so 2222 */ 2223 ASSERT(lpl->lpl_cpus == NULL); 2224 if (lpl->lpl_cpus != NULL) { 2225 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2226 } 2227 2228 /* 2229 * verify that the sum of the cpus in the leaf resources 2230 * is equal to the total ncpu in the intermediate 2231 */ 2232 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2233 sum += lpl->lpl_rset[j]->lpl_ncpu; 2234 } 2235 2236 ASSERT(sum == lpl->lpl_ncpu); 2237 if (sum != lpl->lpl_ncpu) { 2238 return (LPL_TOPO_LPL_BAD_NCPU); 2239 } 2240 } 2241 2242 /* 2243 * check on lpl_hint. Don't check root, since it has no parent. 2244 */ 2245 if (lpl->lpl_parent != NULL) { 2246 int hint; 2247 lpl_t *hint_lpl; 2248 2249 /* make sure hint is within limits of nrset */ 2250 hint = lpl->lpl_hint; 2251 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2252 if (lpl->lpl_parent->lpl_nrset < hint) { 2253 return (LPL_TOPO_BOGUS_HINT); 2254 } 2255 2256 /* make sure hint points to valid lpl */ 2257 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2258 ASSERT(hint_lpl->lpl_ncpu > 0); 2259 if (hint_lpl->lpl_ncpu <= 0) { 2260 return (LPL_TOPO_BOGUS_HINT); 2261 } 2262 } 2263 2264 /* 2265 * Check the rset of the lpl in question. Make sure that each 2266 * rset contains a subset of the resources in 2267 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2268 * sure that each rset doesn't include resources that are 2269 * outside of that set. (Which would be resources somehow not 2270 * accounted for). 
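 *
 * In set terms, the checks below amount to (an illustrative restatement
 * of the code, not an additional requirement):
 *
 *	rset = union of lpl->lpl_rset[j]->lpl_lgrpid, 0 <= j < lpl_nrset
 *	rset - lgrp->lgrp_set[LGRP_RSRC_CPU] == empty
 *	rset - cpupart->cp_lgrpset == empty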
2271 */ 2272 2273 klgrpset_clear(rset); 2274 for (j = 0; j < lpl->lpl_nrset; j++) { 2275 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2276 } 2277 klgrpset_copy(cset, rset); 2278 /* make sure lpl rset matches lgrp rset */ 2279 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2280 /* make sure rset is contained with in partition, too */ 2281 klgrpset_diff(cset, cpupart->cp_lgrpset); 2282 2283 ASSERT(klgrpset_isempty(rset) && 2284 klgrpset_isempty(cset)); 2285 if (!klgrpset_isempty(rset) || 2286 !klgrpset_isempty(cset)) { 2287 return (LPL_TOPO_RSET_MISMATCH); 2288 } 2289 2290 /* 2291 * check to make sure lpl_nrset matches the number of rsets 2292 * contained in the lpl 2293 */ 2294 2295 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2296 j++); 2297 2298 ASSERT(j == lpl->lpl_nrset); 2299 if (j != lpl->lpl_nrset) { 2300 return (LPL_TOPO_BAD_RSETCNT); 2301 } 2302 2303 } 2304 return (LPL_TOPO_CORRECT); 2305 } 2306 2307 /* 2308 * Flatten lpl topology to given number of levels. This is presently only 2309 * implemented for a flatten to 2 levels, which will prune out the intermediates 2310 * and home the leaf lpls to the root lpl. 2311 */ 2312 int 2313 lpl_topo_flatten(int levels) 2314 { 2315 int i; 2316 uint_t sum; 2317 lgrp_t *lgrp_cur; 2318 lpl_t *lpl_cur; 2319 lpl_t *lpl_root; 2320 cpupart_t *cp; 2321 2322 if (levels != 2) 2323 return (0); 2324 2325 /* called w/ cpus paused - grab no locks! */ 2326 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2327 !lgrp_initialized); 2328 2329 cp = cp_list_head; 2330 do { 2331 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2332 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2333 2334 for (i = 0; i <= lgrp_alloc_max; i++) { 2335 lgrp_cur = lgrp_table[i]; 2336 lpl_cur = &cp->cp_lgrploads[i]; 2337 2338 if ((lgrp_cur == lgrp_root) || 2339 (!LGRP_EXISTS(lgrp_cur) && 2340 (lpl_cur->lpl_ncpu == 0))) 2341 continue; 2342 2343 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2344 /* 2345 * this should be a deleted intermediate, so 2346 * clear it 2347 */ 2348 lpl_clear(lpl_cur); 2349 } else if ((lpl_cur->lpl_nrset == 1) && 2350 (lpl_cur->lpl_rset[0] == lpl_cur) && 2351 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2352 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2353 /* 2354 * this is a leaf whose parent was deleted, or 2355 * whose parent had their lgrp deleted. (And 2356 * whose parent will soon be deleted). Point 2357 * this guy back to the root lpl. 2358 */ 2359 lpl_cur->lpl_parent = lpl_root; 2360 lpl_rset_add(lpl_root, lpl_cur); 2361 } 2362 2363 } 2364 2365 /* 2366 * Now that we're done, make sure the count on the root lpl is 2367 * correct, and update the hints of the children for the sake of 2368 * thoroughness 2369 */ 2370 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2371 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2372 } 2373 lpl_root->lpl_ncpu = sum; 2374 lpl_child_update(lpl_root, cp); 2375 2376 cp = cp->cp_next; 2377 } while (cp != cp_list_head); 2378 2379 return (levels); 2380 } 2381 2382 /* 2383 * Insert a lpl into the resource hierarchy and create any additional lpls that 2384 * are necessary to represent the varying states of locality for the cpu 2385 * resoruces newly added to the partition. 2386 * 2387 * This routine is clever enough that it can correctly add resources from the 2388 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2389 * those for which the lpl is a leaf as opposed to simply a named equally local 2390 * resource). 
The one special case that needs additional processing is when a 2391 * new intermediate lpl is introduced. Since the main loop only traverses 2392 * looking to add the leaf resource where it does not yet exist, additional work 2393 * is necessary to add other leaf resources that may need to exist in the newly 2394 * created intermediate. This is performed by the second inner loop, and is 2395 * only done when the check for more than one overlapping resource succeeds. 2396 */ 2397 2398 void 2399 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2400 { 2401 int i; 2402 int j; 2403 int hint; 2404 int rset_num_intersect; 2405 lgrp_t *lgrp_cur; 2406 lpl_t *lpl_cur; 2407 lpl_t *lpl_parent; 2408 lgrpid_t parent_id; 2409 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2410 2411 for (i = 0; i <= lgrp_alloc_max; i++) { 2412 lgrp_cur = lgrp_table[i]; 2413 2414 /* 2415 * Don't insert if the lgrp isn't there, if the leaf isn't 2416 * contained within the current lgrp, or if the current lgrp has 2417 * no leaves in this partition 2418 */ 2419 2420 if (!LGRP_EXISTS(lgrp_cur) || 2421 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2422 lpl_leaf->lpl_lgrpid) || 2423 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2424 cpupart->cp_lgrpset)) 2425 continue; 2426 2427 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2428 if (lgrp_cur->lgrp_parent != NULL) { 2429 /* if lgrp has a parent, assign it properly */ 2430 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2431 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2432 } else { 2433 /* if not, make sure parent ptr gets set to null */ 2434 lpl_parent = NULL; 2435 } 2436 2437 if (lpl_cur == lpl_leaf) { 2438 /* 2439 * Almost all leaf state was initialized elsewhere. The 2440 * only thing left to do is to set the parent. 2441 */ 2442 lpl_cur->lpl_parent = lpl_parent; 2443 continue; 2444 } 2445 2446 /* 2447 * Initialize intermediate lpl 2448 * Save this lpl's hint though. Since we're changing this 2449 * lpl's resources, we need to update the hint in this lpl's 2450 * children, but the hint in this lpl is unaffected and 2451 * should be preserved. 2452 */ 2453 hint = lpl_cur->lpl_hint; 2454 2455 lpl_clear(lpl_cur); 2456 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2457 2458 lpl_cur->lpl_hint = hint; 2459 lpl_cur->lpl_parent = lpl_parent; 2460 2461 /* does new lpl need to be populated with other resources? */ 2462 rset_intersect = 2463 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2464 cpupart->cp_lgrpset); 2465 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2466 2467 if (rset_num_intersect > 1) { 2468 /* 2469 * If so, figure out what lpls have resources that 2470 * intersect this one, and add them. 2471 */ 2472 for (j = 0; j <= lgrp_alloc_max; j++) { 2473 lgrp_t *lgrp_cand; /* candidate lgrp */ 2474 lpl_t *lpl_cand; /* candidate lpl */ 2475 2476 lgrp_cand = lgrp_table[j]; 2477 if (!LGRP_EXISTS(lgrp_cand) || 2478 !klgrpset_ismember(rset_intersect, 2479 lgrp_cand->lgrp_id)) 2480 continue; 2481 lpl_cand = 2482 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2483 lpl_rset_add(lpl_cur, lpl_cand); 2484 } 2485 } 2486 /* 2487 * This lpl's rset has changed. Update the hint in it's 2488 * children. 2489 */ 2490 lpl_child_update(lpl_cur, cpupart); 2491 } 2492 } 2493 2494 /* 2495 * remove a lpl from the hierarchy of resources, clearing its state when 2496 * finished. If the lpls at the intermediate levels of the hierarchy have no 2497 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2498 * delete them as well. 
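 *
 * The expected caller is lgrp_part_del_cpu() below; in rough outline
 * (a simplified sketch, not verbatim):
 *
 *	if (--lpl_leaf->lpl_ncpu == 0) {
 *		klgrpset_del(cpupart->cp_lgrpset, lpl_leaf->lpl_lgrpid);
 *		lpl_leaf_remove(lpl_leaf, cpupart);
 *	}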
2499 */ 2500 2501 void 2502 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2503 { 2504 int i; 2505 lgrp_t *lgrp_cur; 2506 lpl_t *lpl_cur; 2507 klgrpset_t leaf_intersect; /* intersection of leaves */ 2508 2509 for (i = 0; i <= lgrp_alloc_max; i++) { 2510 lgrp_cur = lgrp_table[i]; 2511 2512 /* 2513 * Don't attempt to remove from lgrps that aren't there, that 2514 * don't contain our leaf, or from the leaf itself. (We do that 2515 * later) 2516 */ 2517 2518 if (!LGRP_EXISTS(lgrp_cur)) 2519 continue; 2520 2521 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2522 2523 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2524 lpl_leaf->lpl_lgrpid) || 2525 (lpl_cur == lpl_leaf)) { 2526 continue; 2527 } 2528 2529 /* 2530 * This is a slightly sleazy simplification in that we have 2531 * already marked the cp_lgrpset as no longer containing the 2532 * leaf we've deleted. Any lpls that pass the above checks 2533 * based upon lgrp membership but not necessarily cpu-part 2534 * membership also get cleared by the checks below. Currently 2535 * this is harmless, as the lpls should be empty anyway. 2536 * 2537 * In particular, we want to preserve lpls that have additional 2538 * leaf resources, even though we don't yet have a processor 2539 * architecture that represents resources this way. 2540 */ 2541 2542 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2543 cpupart->cp_lgrpset); 2544 2545 lpl_rset_del(lpl_cur, lpl_leaf); 2546 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2547 lpl_clear(lpl_cur); 2548 } else { 2549 /* 2550 * Update this lpl's children 2551 */ 2552 lpl_child_update(lpl_cur, cpupart); 2553 } 2554 } 2555 lpl_clear(lpl_leaf); 2556 } 2557 2558 /* 2559 * add a cpu to a partition in terms of lgrp load avg bookeeping 2560 * 2561 * The lpl (cpu partition load average information) is now arranged in a 2562 * hierarchical fashion whereby resources that are closest, ie. most local, to 2563 * the cpu in question are considered to be leaves in a tree of resources. 2564 * There are two general cases for cpu additon: 2565 * 2566 * 1. A lpl structure that contains resources already in the hierarchy tree. 2567 * In this case, all of the associated lpl relationships have been defined, and 2568 * all that is necessary is that we link the new cpu into the per-lpl list of 2569 * cpus, and increment the ncpu count of all places where this cpu resource will 2570 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2571 * pushing is accomplished by this routine. 2572 * 2573 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2574 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2575 * construct the hierarchy of state necessary to name it's more distant 2576 * resources, if they should exist. The leaf structure is initialized by this 2577 * routine, as is the cpu-partition state for the lgrp membership. This routine 2578 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2579 * and builds all of the "ancestoral" state necessary to identify resources at 2580 * differing levels of locality. 
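 *
 * In either case the net effect can be summarized by these illustrative
 * postconditions (not verbatim from any caller):
 *
 *	lgrp_part_add_cpu(cp, lgrpid);
 *	ASSERT(cp->cpu_lpl == &cp->cpu_part->cp_lgrploads[lgrpid]);
 *	ASSERT(klgrpset_ismember(cp->cpu_part->cp_lgrpset, lgrpid));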
 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
        cpupart_t       *cpupart;
        lgrp_t          *lgrp_leaf;
        lpl_t           *lpl_leaf;

        /* called sometimes w/ cpus paused - grab no locks */
        ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

        cpupart = cp->cpu_part;
        lgrp_leaf = lgrp_table[lgrpid];

        /* don't add non-existent lgrp */
        ASSERT(LGRP_EXISTS(lgrp_leaf));
        lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
        cp->cpu_lpl = lpl_leaf;

        /* only leaf lpls contain cpus */

        if (lpl_leaf->lpl_ncpu++ == 0) {
                lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
                klgrpset_add(cpupart->cp_lgrpset, lgrpid);
                lpl_leaf_insert(lpl_leaf, cpupart);
        } else {
                /*
                 * the lpl should already exist in the parent, so just update
                 * the count of available CPUs
                 */
                lpl_cpu_adjcnt(LPL_INCREMENT, cp);
        }

        /* link cpu into list of cpus in lpl */

        if (lpl_leaf->lpl_cpus) {
                cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
                cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
                lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
                lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
        } else {
                /*
                 * We increment ncpu immediately after we create a new leaf
                 * lpl, so assert that ncpu == 1 for the case where we don't
                 * have any cpu pointers yet.
                 */
                ASSERT(lpl_leaf->lpl_ncpu == 1);
                lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
        }

}


/*
 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, i.e. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two removal cases in question:
 *
 * 1. Removal of the resource in the leaf leaves other resources remaining in
 * that leaf. (Another cpu still exists at this level of locality.) In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
 * from the per-cpu lpl list.
 *
 * 2. Removal of the resource results in the lpl containing no resources. (It's
 * empty.) In this case, all of what has occurred for the first step must take
 * place; however, additionally we must remove the lpl structure itself, prune
 * out any stranded lpls that do not directly name a leaf resource, and mark the
 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted. Cpu-partition changes are handled by this
 * method, but the lpl_leaf_remove function deals with the details of pruning
 * out the empty lpl and any of its orphaned direct ancestors.
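 *
 * Net effect (illustrative summary of the code below): after
 *
 *	lgrp_part_del_cpu(cp);
 *
 * cp->cpu_lpl is NULL and, if cp was the last CPU of its lgroup in this
 * partition, the lgroup's bit has been cleared from cp_lgrpset and the
 * leaf lpl has been cleared via lpl_clear().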
2656 */ 2657 void 2658 lgrp_part_del_cpu(cpu_t *cp) 2659 { 2660 lpl_t *lpl; 2661 lpl_t *leaf_lpl; 2662 lgrp_t *lgrp_leaf; 2663 2664 /* called sometimes w/ cpus paused - grab no locks */ 2665 2666 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2667 2668 lpl = leaf_lpl = cp->cpu_lpl; 2669 lgrp_leaf = leaf_lpl->lpl_lgrp; 2670 2671 /* don't delete a leaf that isn't there */ 2672 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2673 2674 /* no double-deletes */ 2675 ASSERT(lpl->lpl_ncpu); 2676 if (--lpl->lpl_ncpu == 0) { 2677 /* 2678 * This was the last cpu in this lgroup for this partition, 2679 * clear its bit in the partition's lgroup bitmask 2680 */ 2681 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2682 2683 /* eliminate remaning lpl link pointers in cpu, lpl */ 2684 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2685 2686 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2687 } else { 2688 2689 /* unlink cpu from lists of cpus in lpl */ 2690 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2691 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2692 if (lpl->lpl_cpus == cp) { 2693 lpl->lpl_cpus = cp->cpu_next_lpl; 2694 } 2695 2696 /* 2697 * Update the cpu count in the lpls associated with parent 2698 * lgroups. 2699 */ 2700 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2701 2702 } 2703 /* clear cpu's lpl ptr when we're all done */ 2704 cp->cpu_lpl = NULL; 2705 } 2706 2707 /* 2708 * Recompute load average for the specified partition/lgrp fragment. 2709 * 2710 * We rely on the fact that this routine is called from the clock thread 2711 * at a point before the clock thread can block (i.e. before its first 2712 * lock request). Since the clock thread can not be preempted (since it 2713 * runs at highest priority), we know that cpu partitions can not change 2714 * (since doing so would require either the repartition requester or the 2715 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2716 * without grabbing cpu_lock. 2717 */ 2718 void 2719 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2720 { 2721 uint_t ncpu; 2722 int64_t old, new, f; 2723 2724 /* 2725 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2726 */ 2727 static short expval[] = { 2728 0, 3196, 1618, 1083, 2729 814, 652, 543, 466, 2730 408, 363, 326, 297, 2731 272, 251, 233, 218, 2732 204, 192, 181, 172, 2733 163, 155, 148, 142, 2734 136, 130, 125, 121, 2735 116, 112, 109, 105 2736 }; 2737 2738 /* ASSERT (called from clock level) */ 2739 2740 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2741 ((ncpu = lpl->lpl_ncpu) == 0)) { 2742 return; 2743 } 2744 2745 for (;;) { 2746 2747 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2748 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2749 else 2750 f = expval[ncpu]; 2751 2752 /* 2753 * Modify the load average atomically to avoid losing 2754 * anticipatory load updates (see lgrp_move_thread()). 2755 */ 2756 if (ageflag) { 2757 /* 2758 * We're supposed to both update and age the load. 2759 * This happens 10 times/sec. per cpu. We do a 2760 * little hoop-jumping to avoid integer overflow. 
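 *
 * Ignoring integer truncation, the update below works out to
 *
 *	new = old - ((old * f) >> 16) + ((nrcpus * f) >> 7)
 *
 * i.e. the old load decays by roughly a factor of exp(-1/(20 * ncpu))
 * per call while the current demand (nrcpus) is blended in. Splitting
 * old into its upper and lower 16 bits before multiplying presumably
 * keeps the intermediate products from overflowing.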
2761 */ 2762 int64_t q, r; 2763 2764 do { 2765 old = new = lpl->lpl_loadavg; 2766 q = (old >> 16) << 7; 2767 r = (old & 0xffff) << 7; 2768 new += ((long long)(nrcpus - q) * f - 2769 ((r * f) >> 16)) >> 7; 2770 2771 /* 2772 * Check for overflow 2773 */ 2774 if (new > LGRP_LOADAVG_MAX) 2775 new = LGRP_LOADAVG_MAX; 2776 else if (new < 0) 2777 new = 0; 2778 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2779 new) != old); 2780 } else { 2781 /* 2782 * We're supposed to update the load, but not age it. 2783 * This option is used to update the load (which either 2784 * has already been aged in this 1/10 sec. interval or 2785 * soon will be) to account for a remotely executing 2786 * thread. 2787 */ 2788 do { 2789 old = new = lpl->lpl_loadavg; 2790 new += f; 2791 /* 2792 * Check for overflow 2793 * Underflow not possible here 2794 */ 2795 if (new < old) 2796 new = LGRP_LOADAVG_MAX; 2797 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2798 new) != old); 2799 } 2800 2801 /* 2802 * Do the same for this lpl's parent 2803 */ 2804 if ((lpl = lpl->lpl_parent) == NULL) 2805 break; 2806 ncpu = lpl->lpl_ncpu; 2807 } 2808 } 2809 2810 /* 2811 * Initialize lpl topology in the target based on topology currently present in 2812 * lpl_bootstrap. 2813 * 2814 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2815 * initialize cp_default list of lpls. Up to this point all topology operations 2816 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2817 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2818 * `target' points to the list of lpls in cp_default and `size' is the size of 2819 * this list. 2820 * 2821 * This function walks the lpl topology in lpl_bootstrap and does for things: 2822 * 2823 * 1) Copies all fields from lpl_bootstrap to the target. 2824 * 2825 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2826 * 2827 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2828 * instead of lpl_bootstrap. 2829 * 2830 * 4) Updates pointers in the resource list of the target to point to the lpls 2831 * in the target list instead of lpl_bootstrap. 2832 * 2833 * After lpl_topo_bootstrap() completes, target contains the same information 2834 * that would be present there if it were used during boot instead of 2835 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2836 * and it is bzeroed. 2837 */ 2838 void 2839 lpl_topo_bootstrap(lpl_t *target, int size) 2840 { 2841 lpl_t *lpl = lpl_bootstrap; 2842 lpl_t *target_lpl = target; 2843 int howmany; 2844 int id; 2845 int i; 2846 2847 /* 2848 * The only target that should be passed here is cp_default lpl list. 2849 */ 2850 ASSERT(target == cp_default.cp_lgrploads); 2851 ASSERT(size == cp_default.cp_nlgrploads); 2852 ASSERT(!lgrp_topo_initialized); 2853 ASSERT(ncpus == 1); 2854 2855 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2856 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2857 /* 2858 * Copy all fields from lpl. 2859 */ 2860 2861 *target_lpl = *lpl; 2862 2863 /* 2864 * Substitute CPU0 lpl pointer with one relative to target. 2865 */ 2866 if (lpl->lpl_cpus == CPU) { 2867 ASSERT(CPU->cpu_lpl == lpl); 2868 CPU->cpu_lpl = target_lpl; 2869 } 2870 2871 /* 2872 * Substitute parent information with parent relative to target. 
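 *
 * In other words (new_parent/old_parent are just illustrative names),
 * the rewritten pointer sits at the same offset within `target' as the
 * old one did within lpl_bootstrap:
 *
 *	new_parent = target + (old_parent - lpl_bootstrap)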
2873 */ 2874 if (lpl->lpl_parent != NULL) 2875 target_lpl->lpl_parent = (lpl_t *) 2876 (((uintptr_t)lpl->lpl_parent - 2877 (uintptr_t)lpl_bootstrap) + 2878 (uintptr_t)target); 2879 2880 /* 2881 * Walk over resource set substituting pointers relative to 2882 * lpl_bootstrap to pointers relative to target. 2883 */ 2884 ASSERT(lpl->lpl_nrset <= 1); 2885 2886 for (id = 0; id < lpl->lpl_nrset; id++) { 2887 if (lpl->lpl_rset[id] != NULL) { 2888 target_lpl->lpl_rset[id] = 2889 (lpl_t *) 2890 (((uintptr_t)lpl->lpl_rset[id] - 2891 (uintptr_t)lpl_bootstrap) + 2892 (uintptr_t)target); 2893 } 2894 } 2895 } 2896 2897 /* 2898 * Topology information in lpl_bootstrap is no longer needed. 2899 */ 2900 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2901 } 2902 2903 /* the maximum effect that a single thread can have on it's lgroup's load */ 2904 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 2905 ((lgrp_loadavg_max_effect) / (ncpu)) 2906 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 2907 2908 /* 2909 * If the lowest load among the lgroups a process' threads are currently 2910 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2911 * expanding the process to a new lgroup. 2912 */ 2913 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2914 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2915 2916 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2917 ((lgrp_expand_proc_thresh) / (ncpu)) 2918 2919 /* 2920 * A process will be expanded to a new lgroup only if the difference between 2921 * the lowest load on the lgroups the process' thread's are currently spread 2922 * across and the lowest load on the other lgroups in the process' partition 2923 * is greater than lgrp_expand_proc_diff. 2924 */ 2925 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2926 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2927 2928 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2929 ((lgrp_expand_proc_diff) / (ncpu)) 2930 2931 /* 2932 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2933 * be present due to impreciseness of the load average decay algorithm. 2934 * 2935 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2936 * tolerance is scaled by the number of cpus in the lgroup just like 2937 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2938 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2939 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2940 */ 2941 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2942 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2943 ((lgrp_loadavg_tolerance) / ncpu) 2944 2945 /* 2946 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2947 * average is above this threshold 2948 */ 2949 uint32_t lgrp_load_thresh = UINT32_MAX; 2950 2951 /* 2952 * lgrp_choose() will try to skip any lgroups with less memory 2953 * than this free when choosing a home lgroup 2954 */ 2955 pgcnt_t lgrp_mem_free_thresh = 0; 2956 2957 /* 2958 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2959 * one based on one of the following policies: 2960 * - Random selection 2961 * - Pseudo round robin placement 2962 * - Longest time since a thread was last placed 2963 */ 2964 #define LGRP_CHOOSE_RANDOM 1 2965 #define LGRP_CHOOSE_RR 2 2966 #define LGRP_CHOOSE_TIME 3 2967 2968 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2969 2970 /* 2971 * Choose a suitable leaf lgroup for a kthread. 
The kthread is assumed not to 2972 * be bound to a CPU or processor set. 2973 * 2974 * Arguments: 2975 * t The thread 2976 * cpupart The partition the thread belongs to. 2977 * 2978 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 2979 * disabled, or thread_lock held (at splhigh) to protect against the CPU 2980 * partitions changing out from under us and assumes that given thread is 2981 * protected. Also, called sometimes w/ cpus paused or kernel preemption 2982 * disabled, so don't grab any locks because we should never block under 2983 * those conditions. 2984 */ 2985 lpl_t * 2986 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 2987 { 2988 lgrp_load_t bestload, bestrload; 2989 int lgrpid_offset, lgrp_count; 2990 lgrp_id_t lgrpid, lgrpid_start; 2991 lpl_t *lpl, *bestlpl, *bestrlpl; 2992 klgrpset_t lgrpset; 2993 proc_t *p; 2994 2995 ASSERT(t != NULL); 2996 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2997 THREAD_LOCK_HELD(t)); 2998 ASSERT(cpupart != NULL); 2999 3000 p = t->t_procp; 3001 3002 /* A process should always be in an active partition */ 3003 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3004 3005 bestlpl = bestrlpl = NULL; 3006 bestload = bestrload = LGRP_LOADAVG_MAX; 3007 lgrpset = cpupart->cp_lgrpset; 3008 3009 switch (lgrp_choose_policy) { 3010 case LGRP_CHOOSE_RR: 3011 lgrpid = cpupart->cp_lgrp_hint; 3012 do { 3013 if (++lgrpid > lgrp_alloc_max) 3014 lgrpid = 0; 3015 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3016 3017 break; 3018 default: 3019 case LGRP_CHOOSE_TIME: 3020 case LGRP_CHOOSE_RANDOM: 3021 klgrpset_nlgrps(lgrpset, lgrp_count); 3022 lgrpid_offset = 3023 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3024 for (lgrpid = 0; ; lgrpid++) { 3025 if (klgrpset_ismember(lgrpset, lgrpid)) { 3026 if (--lgrpid_offset == 0) 3027 break; 3028 } 3029 } 3030 break; 3031 } 3032 3033 lgrpid_start = lgrpid; 3034 3035 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3036 lgrp_id_t, cpupart->cp_lgrp_hint); 3037 3038 /* 3039 * Use lgroup affinities (if any) to choose best lgroup 3040 * 3041 * NOTE: Assumes that thread is protected from going away and its 3042 * lgroup affinities won't change (ie. p_lock, or 3043 * thread_lock() being held and/or CPUs paused) 3044 */ 3045 if (t->t_lgrp_affinity) { 3046 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 3047 if (lpl != NULL) 3048 return (lpl); 3049 } 3050 3051 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3052 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3053 3054 do { 3055 pgcnt_t npgs; 3056 3057 /* 3058 * Skip any lgroups outside of thread's pset 3059 */ 3060 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3061 if (++lgrpid > lgrp_alloc_max) 3062 lgrpid = 0; /* wrap the search */ 3063 continue; 3064 } 3065 3066 /* 3067 * Skip any non-leaf lgroups 3068 */ 3069 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3070 continue; 3071 3072 /* 3073 * Skip any lgroups without enough free memory 3074 * (when threshold set to nonzero positive value) 3075 */ 3076 if (lgrp_mem_free_thresh > 0) { 3077 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3078 if (npgs < lgrp_mem_free_thresh) { 3079 if (++lgrpid > lgrp_alloc_max) 3080 lgrpid = 0; /* wrap the search */ 3081 continue; 3082 } 3083 } 3084 3085 lpl = &cpupart->cp_lgrploads[lgrpid]; 3086 if (klgrpset_isempty(p->p_lgrpset) || 3087 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3088 /* 3089 * Either this is a new process or the process already 3090 * has threads on this lgrp, so this is a preferred 3091 * lgroup for the thread. 
3092 */ 3093 if (lpl_pick(lpl, bestlpl)) { 3094 bestload = lpl->lpl_loadavg; 3095 bestlpl = lpl; 3096 } 3097 } else { 3098 /* 3099 * The process doesn't have any threads on this lgrp, 3100 * but we're willing to consider this lgrp if the load 3101 * difference is big enough to justify splitting up 3102 * the process' threads. 3103 */ 3104 if (lpl_pick(lpl, bestrlpl)) { 3105 bestrload = lpl->lpl_loadavg; 3106 bestrlpl = lpl; 3107 } 3108 } 3109 if (++lgrpid > lgrp_alloc_max) 3110 lgrpid = 0; /* wrap the search */ 3111 } while (lgrpid != lgrpid_start); 3112 3113 /* 3114 * Return root lgroup if threshold isn't set to maximum value and 3115 * lowest lgroup load average more than a certain threshold 3116 */ 3117 if (lgrp_load_thresh != UINT32_MAX && 3118 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3119 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3120 3121 /* 3122 * If all the lgroups over which the thread's process is spread are 3123 * heavily loaded, we'll consider placing the thread on one of the 3124 * other leaf lgroups in the thread's partition. 3125 */ 3126 if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3127 (bestrload < bestload) && /* paranoid about wraparound */ 3128 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3129 bestload)) { 3130 bestlpl = bestrlpl; 3131 } 3132 3133 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3134 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3135 3136 ASSERT(bestlpl->lpl_ncpu > 0); 3137 return (bestlpl); 3138 } 3139 3140 /* 3141 * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing. 3142 */ 3143 static int 3144 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3145 { 3146 lgrp_load_t l1, l2; 3147 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3148 3149 3150 if (lpl2 == NULL) 3151 return (1); 3152 3153 l1 = lpl1->lpl_loadavg; 3154 l2 = lpl2->lpl_loadavg; 3155 3156 if ((l1 + tolerance < l2) && (l1 < l2)) { 3157 /* lpl1 is significantly less loaded than lpl2 */ 3158 return (1); 3159 } 3160 3161 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3162 l1 + tolerance >= l2 && l1 < l2 && 3163 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3164 /* 3165 * lpl1's load is within the tolerance of lpl2. We're 3166 * willing to consider it be to better however if 3167 * it has been longer since we last homed a thread there 3168 */ 3169 return (1); 3170 } 3171 3172 return (0); 3173 } 3174 3175 /* 3176 * An LWP is expected to be assigned to an lgroup for at least this long 3177 * for its anticipatory load to be justified. NOTE that this value should 3178 * not be set extremely huge (say, larger than 100 years), to avoid problems 3179 * with overflow in the calculation that uses it. 3180 */ 3181 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3182 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3183 3184 /* 3185 * Routine to change a thread's lgroup affiliation. This routine updates 3186 * the thread's kthread_t struct and its process' proc_t struct to note the 3187 * thread's new lgroup affiliation, and its lgroup affinities. 3188 * 3189 * Note that this is the only routine that modifies a thread's t_lpl field, 3190 * and that adds in or removes anticipatory load. 3191 * 3192 * If the thread is exiting, newlpl is NULL. 
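 *
 * A hypothetical rehoming sequence (shown only as an example; see the
 * locking notes below) might look like:
 *
 *	thread_lock(t);
 *	lgrp_move_thread(t, lgrp_choose(t, t->t_cpupart), 1);
 *	thread_unlock(t);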
3193 * 3194 * Locking: 3195 * The following lock must be held on entry: 3196 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3197 * doesn't get removed from t's partition 3198 * 3199 * This routine is not allowed to grab any locks, since it may be called 3200 * with cpus paused (such as from cpu_offline). 3201 */ 3202 void 3203 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3204 { 3205 proc_t *p; 3206 lpl_t *lpl, *oldlpl; 3207 lgrp_id_t oldid; 3208 kthread_t *tp; 3209 uint_t ncpu; 3210 lgrp_load_t old, new; 3211 3212 ASSERT(t); 3213 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3214 THREAD_LOCK_HELD(t)); 3215 3216 /* 3217 * If not changing lpls, just return 3218 */ 3219 if ((oldlpl = t->t_lpl) == newlpl) 3220 return; 3221 3222 /* 3223 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3224 * associated with process 0 rather than with its original process). 3225 */ 3226 if (t->t_proc_flag & TP_LWPEXIT) { 3227 if (newlpl != NULL) { 3228 t->t_lpl = newlpl; 3229 } 3230 return; 3231 } 3232 3233 p = ttoproc(t); 3234 3235 /* 3236 * If the thread had a previous lgroup, update its process' p_lgrpset 3237 * to account for it being moved from its old lgroup. 3238 */ 3239 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3240 (p->p_tlist != NULL)) { 3241 oldid = oldlpl->lpl_lgrpid; 3242 3243 if (newlpl != NULL) 3244 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3245 3246 if ((do_lgrpset_delete) && 3247 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3248 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3249 /* 3250 * Check if a thread other than the thread 3251 * that's moving is assigned to the same 3252 * lgroup as the thread that's moving. Note 3253 * that we have to compare lgroup IDs, rather 3254 * than simply comparing t_lpl's, since the 3255 * threads may belong to different partitions 3256 * but be assigned to the same lgroup. 3257 */ 3258 ASSERT(tp->t_lpl != NULL); 3259 3260 if ((tp != t) && 3261 (tp->t_lpl->lpl_lgrpid == oldid)) { 3262 /* 3263 * Another thread is assigned to the 3264 * same lgroup as the thread that's 3265 * moving, p_lgrpset doesn't change. 3266 */ 3267 break; 3268 } else if (tp == p->p_tlist) { 3269 /* 3270 * No other thread is assigned to the 3271 * same lgroup as the exiting thread, 3272 * clear the lgroup's bit in p_lgrpset. 3273 */ 3274 klgrpset_del(p->p_lgrpset, oldid); 3275 break; 3276 } 3277 } 3278 } 3279 3280 /* 3281 * If this thread was assigned to its old lgroup for such a 3282 * short amount of time that the anticipatory load that was 3283 * added on its behalf has aged very little, remove that 3284 * anticipatory load. 3285 */ 3286 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3287 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3288 lpl = oldlpl; 3289 for (;;) { 3290 do { 3291 old = new = lpl->lpl_loadavg; 3292 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3293 if (new > old) { 3294 /* 3295 * this can happen if the load 3296 * average was aged since we 3297 * added in the anticipatory 3298 * load 3299 */ 3300 new = 0; 3301 } 3302 } while (cas32( 3303 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3304 new) != old); 3305 3306 lpl = lpl->lpl_parent; 3307 if (lpl == NULL) 3308 break; 3309 3310 ncpu = lpl->lpl_ncpu; 3311 ASSERT(ncpu > 0); 3312 } 3313 } 3314 } 3315 /* 3316 * If the thread has a new lgroup (i.e. it's not exiting), update its 3317 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3318 * to its new lgroup to account for its move to its new lgroup. 
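 *
 * Unless the new lgroup is the root (in which case no load is applied),
 * the anticipatory load added below is, for each lpl on the path from
 * newlpl up through the root:
 *
 *	lpl_loadavg += LGRP_LOADAVG_MAX_EFFECT(lpl_ncpu)
 *	            == lgrp_loadavg_max_effect / lpl_ncpu
 *
 * and is removed again (see above) if the thread moves once more within
 * lgrp_min_nsec.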
3319 */ 3320 if (newlpl != NULL) { 3321 /* 3322 * This thread is moving to a new lgroup 3323 */ 3324 t->t_lpl = newlpl; 3325 3326 /* 3327 * Reflect move in load average of new lgroup 3328 * unless it is root lgroup 3329 */ 3330 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3331 return; 3332 3333 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3334 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3335 } 3336 3337 /* 3338 * It'll take some time for the load on the new lgroup 3339 * to reflect this thread's placement on it. We'd 3340 * like not, however, to have all threads between now 3341 * and then also piling on to this lgroup. To avoid 3342 * this pileup, we anticipate the load this thread 3343 * will generate on its new lgroup. The goal is to 3344 * make the lgroup's load appear as though the thread 3345 * had been there all along. We're very conservative 3346 * in calculating this anticipatory load, we assume 3347 * the worst case case (100% CPU-bound thread). This 3348 * may be modified in the future to be more accurate. 3349 */ 3350 lpl = newlpl; 3351 for (;;) { 3352 ncpu = lpl->lpl_ncpu; 3353 ASSERT(ncpu > 0); 3354 do { 3355 old = new = lpl->lpl_loadavg; 3356 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3357 /* 3358 * Check for overflow 3359 * Underflow not possible here 3360 */ 3361 if (new < old) 3362 new = UINT32_MAX; 3363 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3364 new) != old); 3365 3366 lpl = lpl->lpl_parent; 3367 if (lpl == NULL) 3368 break; 3369 } 3370 t->t_anttime = gethrtime(); 3371 } 3372 } 3373 3374 /* 3375 * Return lgroup memory allocation policy given advice from madvise(3C) 3376 */ 3377 lgrp_mem_policy_t 3378 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3379 { 3380 switch (advice) { 3381 case MADV_ACCESS_LWP: 3382 return (LGRP_MEM_POLICY_NEXT); 3383 case MADV_ACCESS_MANY: 3384 return (LGRP_MEM_POLICY_RANDOM); 3385 default: 3386 return (lgrp_mem_policy_default(size, type)); 3387 } 3388 } 3389 3390 /* 3391 * Figure out default policy 3392 */ 3393 lgrp_mem_policy_t 3394 lgrp_mem_policy_default(size_t size, int type) 3395 { 3396 cpupart_t *cp; 3397 lgrp_mem_policy_t policy; 3398 size_t pset_mem_size; 3399 3400 /* 3401 * Randomly allocate memory across lgroups for shared memory 3402 * beyond a certain threshold 3403 */ 3404 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3405 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3406 /* 3407 * Get total memory size of current thread's pset 3408 */ 3409 kpreempt_disable(); 3410 cp = curthread->t_cpupart; 3411 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3412 kpreempt_enable(); 3413 3414 /* 3415 * Choose policy to randomly allocate memory across 3416 * lgroups in pset if it will fit and is not default 3417 * partition. Otherwise, allocate memory randomly 3418 * across machine. 3419 */ 3420 if (lgrp_mem_pset_aware && size < pset_mem_size) 3421 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3422 else 3423 policy = LGRP_MEM_POLICY_RANDOM; 3424 } else 3425 /* 3426 * Apply default policy for private memory and 3427 * shared memory under the respective random 3428 * threshold. 
3429 */ 3430 policy = lgrp_mem_default_policy; 3431 3432 return (policy); 3433 } 3434 3435 /* 3436 * Get memory allocation policy for this segment 3437 */ 3438 lgrp_mem_policy_info_t * 3439 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3440 { 3441 lgrp_mem_policy_info_t *policy_info; 3442 extern struct seg_ops segspt_ops; 3443 extern struct seg_ops segspt_shmops; 3444 3445 /* 3446 * This is for binary compatibility to protect against third party 3447 * segment drivers which haven't recompiled to allow for 3448 * SEGOP_GETPOLICY() 3449 */ 3450 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3451 seg->s_ops != &segspt_shmops) 3452 return (NULL); 3453 3454 policy_info = NULL; 3455 if (seg->s_ops->getpolicy != NULL) 3456 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3457 3458 return (policy_info); 3459 } 3460 3461 /* 3462 * Set policy for allocating private memory given desired policy, policy info, 3463 * size in bytes of memory that policy is being applied. 3464 * Return 0 if policy wasn't set already and 1 if policy was set already 3465 */ 3466 int 3467 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3468 lgrp_mem_policy_info_t *policy_info, size_t size) 3469 { 3470 3471 ASSERT(policy_info != NULL); 3472 3473 if (policy == LGRP_MEM_POLICY_DEFAULT) 3474 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3475 3476 /* 3477 * Policy set already? 3478 */ 3479 if (policy == policy_info->mem_policy) 3480 return (1); 3481 3482 /* 3483 * Set policy 3484 */ 3485 policy_info->mem_policy = policy; 3486 policy_info->mem_reserved = 0; 3487 3488 return (0); 3489 } 3490 3491 3492 /* 3493 * Get shared memory allocation policy with given tree and offset 3494 */ 3495 lgrp_mem_policy_info_t * 3496 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3497 u_offset_t vn_off) 3498 { 3499 u_offset_t off; 3500 lgrp_mem_policy_info_t *policy_info; 3501 lgrp_shm_policy_seg_t *policy_seg; 3502 lgrp_shm_locality_t *shm_locality; 3503 avl_tree_t *tree; 3504 avl_index_t where; 3505 3506 /* 3507 * Get policy segment tree from anon_map or vnode and use specified 3508 * anon index or vnode offset as offset 3509 * 3510 * Assume that no lock needs to be held on anon_map or vnode, since 3511 * they should be protected by their reference count which must be 3512 * nonzero for an existing segment 3513 */ 3514 if (amp) { 3515 ASSERT(amp->refcnt != 0); 3516 shm_locality = amp->locality; 3517 if (shm_locality == NULL) 3518 return (NULL); 3519 tree = shm_locality->loc_tree; 3520 off = ptob(anon_index); 3521 } else if (vp) { 3522 shm_locality = vp->v_locality; 3523 if (shm_locality == NULL) 3524 return (NULL); 3525 ASSERT(shm_locality->loc_count != 0); 3526 tree = shm_locality->loc_tree; 3527 off = vn_off; 3528 } 3529 3530 if (tree == NULL) 3531 return (NULL); 3532 3533 /* 3534 * Lookup policy segment for offset into shared object and return 3535 * policy info 3536 */ 3537 rw_enter(&shm_locality->loc_lock, RW_READER); 3538 policy_info = NULL; 3539 policy_seg = avl_find(tree, &off, &where); 3540 if (policy_seg) 3541 policy_info = &policy_seg->shm_policy; 3542 rw_exit(&shm_locality->loc_lock); 3543 3544 return (policy_info); 3545 } 3546 3547 /* 3548 * Return lgroup to use for allocating memory 3549 * given the segment and address 3550 * 3551 * There isn't any mutual exclusion that exists between calls 3552 * to this routine and DR, so this routine and whomever calls it 3553 * should be mindful of the possibility that the lgrp returned 3554 * may be deleted. 
If this happens, dereferences of the lgrp 3555 * pointer will still be safe, but the resources in the lgrp will 3556 * be gone, and LGRP_EXISTS() will no longer be true. 3557 */ 3558 lgrp_t * 3559 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3560 { 3561 int i; 3562 lgrp_t *lgrp; 3563 klgrpset_t lgrpset; 3564 int lgrps_spanned; 3565 unsigned long off; 3566 lgrp_mem_policy_t policy; 3567 lgrp_mem_policy_info_t *policy_info; 3568 ushort_t random; 3569 int stat = 0; 3570 3571 /* 3572 * Just return null if the lgrp framework hasn't finished 3573 * initializing or if this is a UMA machine. 3574 */ 3575 if (nlgrps == 1 || !lgrp_initialized) 3576 return (lgrp_root); 3577 3578 /* 3579 * Get memory allocation policy for this segment 3580 */ 3581 policy = lgrp_mem_default_policy; 3582 if (seg != NULL) { 3583 if (seg->s_as == &kas) { 3584 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3585 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3586 policy = LGRP_MEM_POLICY_RANDOM; 3587 } else { 3588 policy_info = lgrp_mem_policy_get(seg, vaddr); 3589 if (policy_info != NULL) 3590 policy = policy_info->mem_policy; 3591 } 3592 } 3593 lgrpset = 0; 3594 3595 /* 3596 * Initialize lgroup to home by default 3597 */ 3598 lgrp = lgrp_home_lgrp(); 3599 3600 /* 3601 * When homing threads on root lgrp, override default memory 3602 * allocation policies with root lgroup memory allocation policy 3603 */ 3604 if (lgrp == lgrp_root) 3605 policy = lgrp_mem_policy_root; 3606 3607 /* 3608 * Implement policy 3609 */ 3610 switch (policy) { 3611 case LGRP_MEM_POLICY_NEXT_CPU: 3612 3613 /* 3614 * Return lgroup of current CPU which faulted on memory 3615 */ 3616 lgrp = lgrp_cpu_to_lgrp(CPU); 3617 break; 3618 3619 case LGRP_MEM_POLICY_NEXT: 3620 case LGRP_MEM_POLICY_DEFAULT: 3621 default: 3622 3623 /* 3624 * Just return current thread's home lgroup 3625 * for default policy (next touch) 3626 * If the thread is homed to the root, 3627 * then the default policy is random across lgroups. 3628 * Fallthrough to the random case. 3629 */ 3630 if (lgrp != lgrp_root) { 3631 if (policy == LGRP_MEM_POLICY_NEXT) 3632 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3633 else 3634 lgrp_stat_add(lgrp->lgrp_id, 3635 LGRP_NUM_DEFAULT, 1); 3636 break; 3637 } 3638 /* LINTED fallthrough on case statement */ 3639 case LGRP_MEM_POLICY_RANDOM: 3640 3641 /* 3642 * Return a random leaf lgroup with memory 3643 */ 3644 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3645 /* 3646 * Count how many lgroups are spanned 3647 */ 3648 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3649 3650 /* 3651 * There may be no memnodes in the root lgroup during DR copy 3652 * rename on a system with only two boards (memnodes) 3653 * configured. In this case just return the root lgrp. 
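 *
 * Otherwise, the pick that follows selects the off'th member of
 * lgrpset, with off drawn (roughly uniformly) from [0, lgrps_spanned);
 * the scan is bounded by lgrp_alloc_max.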
3654 */ 3655 if (lgrps_spanned == 0) { 3656 lgrp = lgrp_root; 3657 break; 3658 } 3659 3660 /* 3661 * Pick a random offset within lgroups spanned 3662 * and return lgroup at that offset 3663 */ 3664 random = (ushort_t)gethrtime() >> 4; 3665 off = random % lgrps_spanned; 3666 ASSERT(off <= lgrp_alloc_max); 3667 3668 for (i = 0; i <= lgrp_alloc_max; i++) { 3669 if (!klgrpset_ismember(lgrpset, i)) 3670 continue; 3671 if (off) 3672 off--; 3673 else { 3674 lgrp = lgrp_table[i]; 3675 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3676 1); 3677 break; 3678 } 3679 } 3680 break; 3681 3682 case LGRP_MEM_POLICY_RANDOM_PROC: 3683 3684 /* 3685 * Grab copy of bitmask of lgroups spanned by 3686 * this process 3687 */ 3688 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3689 stat = LGRP_NUM_RANDOM_PROC; 3690 3691 /* LINTED fallthrough on case statement */ 3692 case LGRP_MEM_POLICY_RANDOM_PSET: 3693 3694 if (!stat) 3695 stat = LGRP_NUM_RANDOM_PSET; 3696 3697 if (klgrpset_isempty(lgrpset)) { 3698 /* 3699 * Grab copy of bitmask of lgroups spanned by 3700 * this processor set 3701 */ 3702 kpreempt_disable(); 3703 klgrpset_copy(lgrpset, 3704 curthread->t_cpupart->cp_lgrpset); 3705 kpreempt_enable(); 3706 } 3707 3708 /* 3709 * Count how many lgroups are spanned 3710 */ 3711 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3712 ASSERT(lgrps_spanned <= nlgrps); 3713 3714 /* 3715 * Probably lgrps_spanned should be always non-zero, but to be 3716 * on the safe side we return lgrp_root if it is empty. 3717 */ 3718 if (lgrps_spanned == 0) { 3719 lgrp = lgrp_root; 3720 break; 3721 } 3722 3723 /* 3724 * Pick a random offset within lgroups spanned 3725 * and return lgroup at that offset 3726 */ 3727 random = (ushort_t)gethrtime() >> 4; 3728 off = random % lgrps_spanned; 3729 ASSERT(off <= lgrp_alloc_max); 3730 3731 for (i = 0; i <= lgrp_alloc_max; i++) { 3732 if (!klgrpset_ismember(lgrpset, i)) 3733 continue; 3734 if (off) 3735 off--; 3736 else { 3737 lgrp = lgrp_table[i]; 3738 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3739 1); 3740 break; 3741 } 3742 } 3743 break; 3744 3745 case LGRP_MEM_POLICY_ROUNDROBIN: 3746 3747 /* 3748 * Use offset within segment to determine 3749 * offset from home lgroup to choose for 3750 * next lgroup to allocate memory from 3751 */ 3752 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3753 (lgrp_alloc_max + 1); 3754 3755 kpreempt_disable(); 3756 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3757 i = lgrp->lgrp_id; 3758 kpreempt_enable(); 3759 3760 while (off > 0) { 3761 i = (i + 1) % (lgrp_alloc_max + 1); 3762 lgrp = lgrp_table[i]; 3763 if (klgrpset_ismember(lgrpset, i)) 3764 off--; 3765 } 3766 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3767 3768 break; 3769 } 3770 3771 ASSERT(lgrp != NULL); 3772 return (lgrp); 3773 } 3774 3775 /* 3776 * Return the number of pages in an lgroup 3777 * 3778 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3779 * could cause tests that rely on the numat driver to fail.... 
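 *
 * Example use (this mirrors how lgrp_choose() above applies its
 * free-memory threshold):
 *
 *	if (lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE) < lgrp_mem_free_thresh)
 *		continue;	(skip lgroups that are low on free memory)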
3780 */ 3781 pgcnt_t 3782 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3783 { 3784 lgrp_t *lgrp; 3785 3786 lgrp = lgrp_table[lgrpid]; 3787 if (!LGRP_EXISTS(lgrp) || 3788 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3789 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3790 return (0); 3791 3792 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3793 } 3794 3795 /* 3796 * Initialize lgroup shared memory allocation policy support 3797 */ 3798 void 3799 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3800 { 3801 lgrp_shm_locality_t *shm_locality; 3802 3803 /* 3804 * Initialize locality field in anon_map 3805 * Don't need any locks because this is called when anon_map is 3806 * allocated, but not used anywhere yet. 3807 */ 3808 if (amp) { 3809 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3810 if (amp->locality == NULL) { 3811 /* 3812 * Allocate and initialize shared memory locality info 3813 * and set anon_map locality pointer to it 3814 * Drop lock across kmem_alloc(KM_SLEEP) 3815 */ 3816 ANON_LOCK_EXIT(&->a_rwlock); 3817 shm_locality = kmem_alloc(sizeof (*shm_locality), 3818 KM_SLEEP); 3819 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3820 NULL); 3821 shm_locality->loc_count = 1; /* not used for amp */ 3822 shm_locality->loc_tree = NULL; 3823 3824 /* 3825 * Reacquire lock and check to see whether anyone beat 3826 * us to initializing the locality info 3827 */ 3828 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3829 if (amp->locality != NULL) { 3830 rw_destroy(&shm_locality->loc_lock); 3831 kmem_free(shm_locality, 3832 sizeof (*shm_locality)); 3833 } else 3834 amp->locality = shm_locality; 3835 } 3836 ANON_LOCK_EXIT(&->a_rwlock); 3837 return; 3838 } 3839 3840 /* 3841 * Allocate shared vnode policy info if vnode is not locality aware yet 3842 */ 3843 mutex_enter(&vp->v_lock); 3844 if ((vp->v_flag & V_LOCALITY) == 0) { 3845 /* 3846 * Allocate and initialize shared memory locality info 3847 */ 3848 mutex_exit(&vp->v_lock); 3849 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3850 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3851 shm_locality->loc_count = 1; 3852 shm_locality->loc_tree = NULL; 3853 3854 /* 3855 * Point vnode locality field at shared vnode policy info 3856 * and set locality aware flag in vnode 3857 */ 3858 mutex_enter(&vp->v_lock); 3859 if ((vp->v_flag & V_LOCALITY) == 0) { 3860 vp->v_locality = shm_locality; 3861 vp->v_flag |= V_LOCALITY; 3862 } else { 3863 /* 3864 * Lost race so free locality info and increment count. 
3865 */ 3866 rw_destroy(&shm_locality->loc_lock); 3867 kmem_free(shm_locality, sizeof (*shm_locality)); 3868 shm_locality = vp->v_locality; 3869 shm_locality->loc_count++; 3870 } 3871 mutex_exit(&vp->v_lock); 3872 3873 return; 3874 } 3875 3876 /* 3877 * Increment reference count of number of segments mapping this vnode 3878 * shared 3879 */ 3880 shm_locality = vp->v_locality; 3881 shm_locality->loc_count++; 3882 mutex_exit(&vp->v_lock); 3883 } 3884 3885 /* 3886 * Destroy the given shared memory policy segment tree 3887 */ 3888 void 3889 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3890 { 3891 lgrp_shm_policy_seg_t *cur; 3892 lgrp_shm_policy_seg_t *next; 3893 3894 if (tree == NULL) 3895 return; 3896 3897 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3898 while (cur != NULL) { 3899 next = AVL_NEXT(tree, cur); 3900 avl_remove(tree, cur); 3901 kmem_free(cur, sizeof (*cur)); 3902 cur = next; 3903 } 3904 kmem_free(tree, sizeof (avl_tree_t)); 3905 } 3906 3907 /* 3908 * Uninitialize lgroup shared memory allocation policy support 3909 */ 3910 void 3911 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3912 { 3913 lgrp_shm_locality_t *shm_locality; 3914 3915 /* 3916 * For anon_map, deallocate shared memory policy tree and 3917 * zero locality field 3918 * Don't need any locks because anon_map is being freed 3919 */ 3920 if (amp) { 3921 if (amp->locality == NULL) 3922 return; 3923 shm_locality = amp->locality; 3924 shm_locality->loc_count = 0; /* not really used for amp */ 3925 rw_destroy(&shm_locality->loc_lock); 3926 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3927 kmem_free(shm_locality, sizeof (*shm_locality)); 3928 amp->locality = 0; 3929 return; 3930 } 3931 3932 /* 3933 * For vnode, decrement reference count of segments mapping this vnode 3934 * shared and delete locality info if reference count drops to 0 3935 */ 3936 mutex_enter(&vp->v_lock); 3937 shm_locality = vp->v_locality; 3938 shm_locality->loc_count--; 3939 3940 if (shm_locality->loc_count == 0) { 3941 rw_destroy(&shm_locality->loc_lock); 3942 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3943 kmem_free(shm_locality, sizeof (*shm_locality)); 3944 vp->v_locality = 0; 3945 vp->v_flag &= ~V_LOCALITY; 3946 } 3947 mutex_exit(&vp->v_lock); 3948 } 3949 3950 /* 3951 * Compare two shared memory policy segments 3952 * Used by AVL tree code for searching 3953 */ 3954 int 3955 lgrp_shm_policy_compar(const void *x, const void *y) 3956 { 3957 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 3958 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 3959 3960 if (a->shm_off < b->shm_off) 3961 return (-1); 3962 if (a->shm_off >= b->shm_off + b->shm_size) 3963 return (1); 3964 return (0); 3965 } 3966 3967 /* 3968 * Concatenate seg1 with seg2 and remove seg2 3969 */ 3970 static int 3971 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 3972 lgrp_shm_policy_seg_t *seg2) 3973 { 3974 if (!seg1 || !seg2 || 3975 seg1->shm_off + seg1->shm_size != seg2->shm_off || 3976 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 3977 return (-1); 3978 3979 seg1->shm_size += seg2->shm_size; 3980 avl_remove(tree, seg2); 3981 kmem_free(seg2, sizeof (*seg2)); 3982 return (0); 3983 } 3984 3985 /* 3986 * Split segment at given offset and return rightmost (uppermost) segment 3987 * Assumes that there are no overlapping segments 3988 */ 3989 static lgrp_shm_policy_seg_t * 3990 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 3991 u_offset_t off) 3992 { 3993 lgrp_shm_policy_seg_t *newseg; 3994 
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}

/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
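			 * This race is possible because the writer's lock had
			 * to be dropped across the KM_SLEEP allocation above.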
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
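			 * (lgrp_shm_policy_concat() frees its right-hand
			 * argument when it succeeds, so fetching the pointers
			 * afterwards would be unsafe.)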
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- the scope is LGRP_SRCH_LOCAL and all the memnodes in the given
 *	  lgroup have been returned, or
 *	- the scope allows traversal of the hierarchy and all the memnodes
 *	  in the system have been returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
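	 * (The do-while below keeps rescanning from mnode 0, decrementing
	 * "offset" across passes, until it lands on a bit that is still set.)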
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
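
/*
 * Illustrative sketch only (not a definitive interface description): a
 * caller that wants to visit each candidate memnode for an lgroup could
 * drive the cookie roughly as below, zeroing it on its own stack and
 * seeding by hand the fields that lgrp_memnode_choose() reads.  The lgrp
 * header may provide an initialization macro that does this seeding;
 * use_mnode() is a hypothetical stand-in for whatever the caller does
 * with each memnode, and gethrtime() is just one possible randomness
 * source for lmc_rand.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	bzero(&c, sizeof (c));
 *	c.lmc_lgrp = lgrp;
 *	c.lmc_nodes = lgrp->lgrp_mnodes;
 *	c.lmc_cnt = lgrp->lgrp_nmnodes;
 *	c.lmc_scope = LGRP_SRCH_LOCAL;	(or a hierarchy-wide scope value
 *					 to let the walk climb to parents)
 *	c.lmc_rand = (int)gethrtime();
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1)
 *		use_mnode(mnode);
 */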