/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not mean that groupings must be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.
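 *
 * As an illustration only (not a description of any particular platform),
 * a two-socket NUMA machine might be represented by three lgroups:
 *
 *                     root lgroup
 *         (all CPUs and memory, system-wide latency)
 *              /                      \
 *       leaf lgroup 0            leaf lgroup 1
 *    (socket 0 CPUs and       (socket 1 CPUs and
 *     socket 0 memory)         socket 1 memory)
 *
 * A thread homed to leaf lgroup 0 is preferentially dispatched on a CPU
 * in socket 0.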
Physical memory 64 * allocation is lgroup aware too, so memory will be allocated from the current 65 * thread's home lgroup if possible. If the desired resources are not 66 * available, the kernel traverses the lgroup hierarchy going to the parent 67 * lgroup to find resources at the next level of locality until it reaches the 68 * root lgroup. 69 */ 70 71 #include <sys/lgrp.h> 72 #include <sys/lgrp_user.h> 73 #include <sys/types.h> 74 #include <sys/mman.h> 75 #include <sys/param.h> 76 #include <sys/var.h> 77 #include <sys/thread.h> 78 #include <sys/cpuvar.h> 79 #include <sys/cpupart.h> 80 #include <sys/kmem.h> 81 #include <vm/seg.h> 82 #include <vm/seg_kmem.h> 83 #include <vm/seg_spt.h> 84 #include <vm/seg_vn.h> 85 #include <vm/as.h> 86 #include <sys/atomic.h> 87 #include <sys/systm.h> 88 #include <sys/errno.h> 89 #include <sys/cmn_err.h> 90 #include <sys/kstat.h> 91 #include <sys/sysmacros.h> 92 #include <sys/pg.h> 93 #include <sys/promif.h> 94 #include <sys/sdt.h> 95 96 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ 97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ 98 /* indexed by lgrp_id */ 99 int nlgrps; /* number of lgroups in machine */ 100 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */ 101 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */ 102 103 /* 104 * Kstat data for lgroups. 105 * 106 * Actual kstat data is collected in lgrp_stats array. 107 * The lgrp_kstat_data array of named kstats is used to extract data from 108 * lgrp_stats and present it to kstat framework. It is protected from partallel 109 * modifications by lgrp_kstat_mutex. This may cause some contention when 110 * several kstat commands run in parallel but this is not the 111 * performance-critical path. 112 */ 113 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */ 114 115 /* 116 * Declare kstat names statically for enums as defined in the header file. 117 */ 118 LGRP_KSTAT_NAMES; 119 120 static void lgrp_kstat_init(void); 121 static int lgrp_kstat_extract(kstat_t *, int); 122 static void lgrp_kstat_reset(lgrp_id_t); 123 124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS]; 125 static kmutex_t lgrp_kstat_mutex; 126 127 128 /* 129 * max number of lgroups supported by the platform 130 */ 131 int nlgrpsmax = 0; 132 133 /* 134 * The root lgroup. Represents the set of resources at the system wide 135 * level of locality. 136 */ 137 lgrp_t *lgrp_root = NULL; 138 139 /* 140 * During system bootstrap cp_default does not contain the list of lgrp load 141 * averages (cp_lgrploads). The list is allocated after the first CPU is brought 142 * on-line when cp_default is initialized by cpupart_initialize_default(). 143 * Configuring CPU0 may create a two-level topology with root and one leaf node 144 * containing CPU0. This topology is initially constructed in a special 145 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned 146 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used 147 * for all lpl operations until cp_default is fully constructed. 148 * 149 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other 150 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to 151 * the first element of lpl_bootstrap_list. 152 * 153 * CPUs that are added to the system, but have not yet been assigned to an 154 * lgrp will use lpl_bootstrap as a default lpl. 
This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
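 *
 * As a sketch only (myplat_policy_init() is a hypothetical routine used for
 * illustration; the variable and the policy value are the real ones declared
 * here and in the lgroup headers), a platform could override the default from
 * its own initialization code:
 *
 *	void
 *	myplat_policy_init(void)
 *	{
 *		lgrp_mem_default_policy = LGRP_MEM_POLICY_RANDOM;
 *	}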
204 */ 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 206 207 208 /* 209 * lgroup CPU event handlers 210 */ 211 static void lgrp_cpu_init(struct cpu *); 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 214 215 /* 216 * lgroup memory event handlers 217 */ 218 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 219 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 220 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 221 222 /* 223 * lgroup CPU partition event handlers 224 */ 225 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 226 static void lgrp_part_del_cpu(struct cpu *); 227 228 static void lgrp_root_init(void); 229 230 /* 231 * lpl topology 232 */ 233 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 234 static void lpl_clear(lpl_t *); 235 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 236 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 237 static void lpl_rset_add(lpl_t *, lpl_t *); 238 static void lpl_rset_del(lpl_t *, lpl_t *); 239 static int lpl_rset_contains(lpl_t *, lpl_t *); 240 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 241 static void lpl_child_update(lpl_t *, struct cpupart *); 242 static int lpl_pick(lpl_t *, lpl_t *); 243 static void lpl_verify_wrapper(struct cpupart *); 244 245 /* 246 * defines for lpl topology verifier return codes 247 */ 248 249 #define LPL_TOPO_CORRECT 0 250 #define LPL_TOPO_PART_HAS_NO_LPL -1 251 #define LPL_TOPO_CPUS_NOT_EMPTY -2 252 #define LPL_TOPO_LGRP_MISMATCH -3 253 #define LPL_TOPO_MISSING_PARENT -4 254 #define LPL_TOPO_PARENT_MISMATCH -5 255 #define LPL_TOPO_BAD_CPUCNT -6 256 #define LPL_TOPO_RSET_MISMATCH -7 257 #define LPL_TOPO_LPL_ORPHANED -8 258 #define LPL_TOPO_LPL_BAD_NCPU -9 259 #define LPL_TOPO_RSET_MSSNG_LF -10 260 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 261 #define LPL_TOPO_BOGUS_HINT -12 262 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 263 #define LPL_TOPO_LGRP_NOT_LEAF -14 264 #define LPL_TOPO_BAD_RSETCNT -15 265 266 /* 267 * Return whether lgroup optimizations should be enabled on this system 268 */ 269 int 270 lgrp_optimizations(void) 271 { 272 /* 273 * System must have more than 2 lgroups to enable lgroup optimizations 274 * 275 * XXX This assumes that a 2 lgroup system has an empty root lgroup 276 * with one child lgroup containing all the resources. A 2 lgroup 277 * system with a root lgroup directly containing CPUs or memory might 278 * need lgroup optimizations with its child lgroup, but there 279 * isn't such a machine for now.... 
280 */ 281 if (nlgrps > 2) 282 return (1); 283 284 return (0); 285 } 286 287 /* 288 * Build full lgroup topology 289 */ 290 static void 291 lgrp_root_init(void) 292 { 293 lgrp_handle_t hand; 294 int i; 295 lgrp_id_t id; 296 297 /* 298 * Create the "root" lgroup 299 */ 300 ASSERT(nlgrps == 0); 301 id = nlgrps++; 302 303 lgrp_root = &lroot; 304 305 lgrp_root->lgrp_cpu = NULL; 306 lgrp_root->lgrp_mnodes = 0; 307 lgrp_root->lgrp_nmnodes = 0; 308 hand = lgrp_plat_root_hand(); 309 lgrp_root->lgrp_plathand = hand; 310 311 lgrp_root->lgrp_id = id; 312 lgrp_root->lgrp_cpucnt = 0; 313 lgrp_root->lgrp_childcnt = 0; 314 klgrpset_clear(lgrp_root->lgrp_children); 315 klgrpset_clear(lgrp_root->lgrp_leaves); 316 lgrp_root->lgrp_parent = NULL; 317 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 318 319 for (i = 0; i < LGRP_RSRC_COUNT; i++) 320 klgrpset_clear(lgrp_root->lgrp_set[i]); 321 322 lgrp_root->lgrp_kstat = NULL; 323 324 lgrp_table[id] = lgrp_root; 325 326 /* 327 * Setup initial lpl list for CPU0 and initial t0 home. 328 * The only lpl space we have so far is lpl_bootstrap. It is used for 329 * all topology operations until cp_default is initialized at which 330 * point t0.t_lpl will be updated. 331 */ 332 lpl_bootstrap = lpl_bootstrap_list; 333 t0.t_lpl = lpl_bootstrap; 334 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 335 lpl_bootstrap_list[1].lpl_lgrpid = 1; 336 cp_default.cp_lgrploads = lpl_bootstrap; 337 } 338 339 /* 340 * Initialize the lgroup framework and allow the platform to do the same 341 */ 342 void 343 lgrp_init(void) 344 { 345 /* 346 * Initialize the platform 347 */ 348 lgrp_plat_init(); 349 350 /* 351 * Set max number of lgroups supported on this platform which must be 352 * less than the max number of lgroups supported by the common lgroup 353 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 354 */ 355 nlgrpsmax = lgrp_plat_max_lgrps(); 356 ASSERT(nlgrpsmax <= NLGRPS_MAX); 357 } 358 359 /* 360 * Create the root and cpu0's lgroup, and set t0's home. 361 */ 362 void 363 lgrp_setup(void) 364 { 365 /* 366 * Setup the root lgroup 367 */ 368 lgrp_root_init(); 369 370 /* 371 * Add cpu0 to an lgroup 372 */ 373 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 374 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 375 } 376 377 /* 378 * Lgroup initialization is split in two parts. The first part 379 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 380 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 381 * when all CPUs are brought online and all distance information is available. 382 * 383 * When lgrp_main_init() is complete it sets lgrp_initialized. The 384 * lgrp_main_mp_init() sets lgrp_topo_initialized. 385 */ 386 387 /* 388 * true when lgrp initialization has been completed. 389 */ 390 int lgrp_initialized = 0; 391 392 /* 393 * True when lgrp topology is constructed. 394 */ 395 int lgrp_topo_initialized = 0; 396 397 /* 398 * Init routine called after startup(), /etc/system has been processed, 399 * and cpu0 has been added to an lgroup. 400 */ 401 void 402 lgrp_main_init(void) 403 { 404 cpu_t *cp = CPU; 405 lgrp_id_t lgrpid; 406 int i; 407 /* 408 * Enforce a valid lgrp_mem_default_policy 409 */ 410 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 411 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES)) 412 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 413 414 /* 415 * See if mpo should be disabled. 416 * This may happen in the case of null proc LPA on Starcat. 
417 * The platform won't be able to detect null proc LPA until after 418 * cpu0 and memory have already been added to lgroups. 419 * When and if it is detected, the Starcat platform will return 420 * a different platform handle for cpu0 which is what we check for 421 * here. If mpo should be disabled move cpu0 to it's rightful place 422 * (the root), and destroy the remaining lgroups. This effectively 423 * provides an UMA lgroup topology. 424 */ 425 lgrpid = cp->cpu_lpl->lpl_lgrpid; 426 if (lgrp_table[lgrpid]->lgrp_plathand != 427 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 428 lgrp_part_del_cpu(cp); 429 lgrp_cpu_fini(cp, lgrpid); 430 431 lgrp_cpu_init(cp); 432 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 433 434 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 435 436 /* 437 * Destroy all lgroups except for root 438 */ 439 for (i = 0; i <= lgrp_alloc_max; i++) { 440 if (LGRP_EXISTS(lgrp_table[i]) && 441 lgrp_table[i] != lgrp_root) 442 lgrp_destroy(lgrp_table[i]); 443 } 444 445 /* 446 * Fix up root to point at itself for leaves and resources 447 * and not have any children 448 */ 449 lgrp_root->lgrp_childcnt = 0; 450 klgrpset_clear(lgrp_root->lgrp_children); 451 klgrpset_clear(lgrp_root->lgrp_leaves); 452 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 453 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 454 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 455 } 456 457 /* 458 * Initialize kstats framework. 459 */ 460 lgrp_kstat_init(); 461 /* 462 * cpu0 is finally where it should be, so create it's lgroup's kstats 463 */ 464 mutex_enter(&cpu_lock); 465 lgrp_kstat_create(cp); 466 mutex_exit(&cpu_lock); 467 468 lgrp_plat_main_init(); 469 lgrp_initialized = 1; 470 } 471 472 /* 473 * Finish lgrp initialization after all CPUS are brought on-line. 474 * This routine is called after start_other_cpus(). 475 */ 476 void 477 lgrp_main_mp_init(void) 478 { 479 klgrpset_t changed; 480 481 /* 482 * Update lgroup topology (if necessary) 483 */ 484 klgrpset_clear(changed); 485 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 486 lgrp_topo_initialized = 1; 487 } 488 489 /* 490 * Change latency of lgroup with specified lgroup platform handle (if one is 491 * given) or change all lgroups with old latency to new latency 492 */ 493 void 494 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, 495 u_longlong_t newtime) 496 { 497 lgrp_t *lgrp; 498 int i; 499 500 for (i = 0; i <= lgrp_alloc_max; i++) { 501 lgrp = lgrp_table[i]; 502 503 if (!LGRP_EXISTS(lgrp)) 504 continue; 505 506 if ((hand == LGRP_NULL_HANDLE && 507 lgrp->lgrp_latency == oldtime) || 508 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) 509 lgrp->lgrp_latency = (int)newtime; 510 } 511 } 512 513 /* 514 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 515 */ 516 void 517 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 518 { 519 klgrpset_t changed; 520 cpu_t *cp; 521 lgrp_id_t id; 522 int rc; 523 524 switch (event) { 525 /* 526 * The following (re)configuration events are common code 527 * initiated. lgrp_plat_config() is called here to inform the 528 * platform of the reconfiguration event. 529 */ 530 case LGRP_CONFIG_CPU_ADD: 531 cp = (cpu_t *)resource; 532 533 /* 534 * Initialize the new CPU's lgrp related next/prev 535 * links, and give it a bootstrap lpl so that it can 536 * survive should it need to enter the dispatcher. 
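 * The CPU keeps lpl_bootstrap as its lpl until it is onlined and
 * lgrp_cpu_init() gives it a real lpl (see LGRP_CONFIG_CPU_ONLINE below).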
537 */ 538 cp->cpu_next_lpl = cp; 539 cp->cpu_prev_lpl = cp; 540 cp->cpu_next_lgrp = cp; 541 cp->cpu_prev_lgrp = cp; 542 cp->cpu_lpl = lpl_bootstrap; 543 544 lgrp_plat_config(event, resource); 545 atomic_add_32(&lgrp_gen, 1); 546 547 break; 548 case LGRP_CONFIG_CPU_DEL: 549 lgrp_plat_config(event, resource); 550 atomic_add_32(&lgrp_gen, 1); 551 552 break; 553 case LGRP_CONFIG_CPU_ONLINE: 554 cp = (cpu_t *)resource; 555 lgrp_cpu_init(cp); 556 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 557 rc = lpl_topo_verify(cp->cpu_part); 558 if (rc != LPL_TOPO_CORRECT) { 559 panic("lpl_topo_verify failed: %d", rc); 560 } 561 lgrp_plat_config(event, resource); 562 atomic_add_32(&lgrp_gen, 1); 563 564 break; 565 case LGRP_CONFIG_CPU_OFFLINE: 566 cp = (cpu_t *)resource; 567 id = cp->cpu_lpl->lpl_lgrpid; 568 lgrp_part_del_cpu(cp); 569 lgrp_cpu_fini(cp, id); 570 rc = lpl_topo_verify(cp->cpu_part); 571 if (rc != LPL_TOPO_CORRECT) { 572 panic("lpl_topo_verify failed: %d", rc); 573 } 574 lgrp_plat_config(event, resource); 575 atomic_add_32(&lgrp_gen, 1); 576 577 break; 578 case LGRP_CONFIG_CPUPART_ADD: 579 cp = (cpu_t *)resource; 580 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 581 rc = lpl_topo_verify(cp->cpu_part); 582 if (rc != LPL_TOPO_CORRECT) { 583 panic("lpl_topo_verify failed: %d", rc); 584 } 585 lgrp_plat_config(event, resource); 586 587 break; 588 case LGRP_CONFIG_CPUPART_DEL: 589 cp = (cpu_t *)resource; 590 lgrp_part_del_cpu((cpu_t *)resource); 591 rc = lpl_topo_verify(cp->cpu_part); 592 if (rc != LPL_TOPO_CORRECT) { 593 panic("lpl_topo_verify failed: %d", rc); 594 } 595 lgrp_plat_config(event, resource); 596 597 break; 598 /* 599 * The following events are initiated by the memnode 600 * subsystem. 601 */ 602 case LGRP_CONFIG_MEM_ADD: 603 lgrp_mem_init((int)resource, where, B_FALSE); 604 atomic_add_32(&lgrp_gen, 1); 605 606 break; 607 case LGRP_CONFIG_MEM_DEL: 608 lgrp_mem_fini((int)resource, where, B_FALSE); 609 atomic_add_32(&lgrp_gen, 1); 610 611 break; 612 case LGRP_CONFIG_MEM_RENAME: { 613 lgrp_config_mem_rename_t *ren_arg = 614 (lgrp_config_mem_rename_t *)where; 615 616 lgrp_mem_rename((int)resource, 617 ren_arg->lmem_rename_from, 618 ren_arg->lmem_rename_to); 619 atomic_add_32(&lgrp_gen, 1); 620 621 break; 622 } 623 case LGRP_CONFIG_GEN_UPDATE: 624 atomic_add_32(&lgrp_gen, 1); 625 626 break; 627 case LGRP_CONFIG_FLATTEN: 628 if (where == 0) 629 lgrp_topo_levels = (int)resource; 630 else 631 (void) lgrp_topo_flatten(resource, 632 lgrp_table, lgrp_alloc_max, &changed); 633 634 break; 635 /* 636 * Update any lgroups with old latency to new latency 637 */ 638 case LGRP_CONFIG_LAT_CHANGE_ALL: 639 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 640 (u_longlong_t)where); 641 642 break; 643 /* 644 * Update lgroup with specified lgroup platform handle to have 645 * new latency 646 */ 647 case LGRP_CONFIG_LAT_CHANGE: 648 lgrp_latency_change((lgrp_handle_t)resource, 0, 649 (u_longlong_t)where); 650 651 break; 652 case LGRP_CONFIG_NOP: 653 654 break; 655 default: 656 break; 657 } 658 659 } 660 661 /* 662 * Called to add lgrp info into cpu structure from cpu_add_unit; 663 * do not assume cpu is in cpu[] yet! 664 * 665 * CPUs are brought online with all other CPUs paused so we can't 666 * allocate memory or we could deadlock the system, so we rely on 667 * the platform to statically allocate as much space as we need 668 * for the lgrp structs and stats. 
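 *
 * In outline: map the CPU's platform handle to a leaf lgroup (creating the
 * leaf and splicing it into the topology if it doesn't exist yet, or filling
 * in its latency and CPU resources if it already exists), point cp->cpu_lpl
 * at the matching lpl in the CPU's partition, and finally link the CPU into
 * the leaf lgroup's circular list of CPUs.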
669 */ 670 static void 671 lgrp_cpu_init(struct cpu *cp) 672 { 673 klgrpset_t changed; 674 int count; 675 lgrp_handle_t hand; 676 int first_cpu; 677 lgrp_t *my_lgrp; 678 lgrp_id_t lgrpid; 679 struct cpu *cptr; 680 681 /* 682 * This is the first time through if the resource set 683 * for the root lgroup is empty. After cpu0 has been 684 * initially added to an lgroup, the root's CPU resource 685 * set can never be empty, since the system's last CPU 686 * cannot be offlined. 687 */ 688 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 689 /* 690 * First time through. 691 */ 692 first_cpu = 1; 693 } else { 694 /* 695 * If cpu0 needs to move lgroups, we may come 696 * through here again, at which time cpu_lock won't 697 * be held, and lgrp_initialized will be false. 698 */ 699 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 700 ASSERT(cp->cpu_part != NULL); 701 first_cpu = 0; 702 } 703 704 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 705 my_lgrp = lgrp_hand_to_lgrp(hand); 706 707 if (my_lgrp == NULL) { 708 /* 709 * Create new lgrp and add it to lgroup topology 710 */ 711 my_lgrp = lgrp_create(); 712 my_lgrp->lgrp_plathand = hand; 713 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 714 lgrpid = my_lgrp->lgrp_id; 715 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 716 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 717 718 count = 0; 719 klgrpset_clear(changed); 720 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 721 &changed); 722 /* 723 * May have added new intermediate lgroups, so need to add 724 * resources other than CPUs which are added below 725 */ 726 (void) lgrp_mnode_update(changed, NULL); 727 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 728 > 0) { 729 /* 730 * Leaf lgroup was created, but latency wasn't available 731 * then. So, set latency for it and fill in rest of lgroup 732 * topology now that we know how far it is from other leaf 733 * lgroups. 734 */ 735 lgrpid = my_lgrp->lgrp_id; 736 klgrpset_clear(changed); 737 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 738 lgrpid)) 739 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 740 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 741 &changed); 742 743 /* 744 * May have added new intermediate lgroups, so need to add 745 * resources other than CPUs which are added below 746 */ 747 (void) lgrp_mnode_update(changed, NULL); 748 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 749 my_lgrp->lgrp_id)) { 750 int i; 751 752 /* 753 * Update existing lgroup and lgroups containing it with CPU 754 * resource 755 */ 756 lgrpid = my_lgrp->lgrp_id; 757 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 758 for (i = 0; i <= lgrp_alloc_max; i++) { 759 lgrp_t *lgrp; 760 761 lgrp = lgrp_table[i]; 762 if (!LGRP_EXISTS(lgrp) || 763 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 764 continue; 765 766 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 767 } 768 } 769 770 lgrpid = my_lgrp->lgrp_id; 771 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 772 773 /* 774 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 775 * end up in lpl for lgroup 0 whether it is supposed to be in there or 776 * not since none of lgroup IDs in the lpl's have been set yet. 
777 */ 778 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 779 cp->cpu_lpl->lpl_lgrpid = lgrpid; 780 781 /* 782 * link the CPU into the lgrp's CPU list 783 */ 784 if (my_lgrp->lgrp_cpucnt == 0) { 785 my_lgrp->lgrp_cpu = cp; 786 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 787 } else { 788 cptr = my_lgrp->lgrp_cpu; 789 cp->cpu_next_lgrp = cptr; 790 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 791 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 792 cptr->cpu_prev_lgrp = cp; 793 } 794 my_lgrp->lgrp_cpucnt++; 795 } 796 797 lgrp_t * 798 lgrp_create(void) 799 { 800 lgrp_t *my_lgrp; 801 lgrp_id_t lgrpid; 802 int i; 803 804 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 805 806 /* 807 * Find an open slot in the lgroup table and recycle unused lgroup 808 * left there if any 809 */ 810 my_lgrp = NULL; 811 if (lgrp_alloc_hint == -1) 812 /* 813 * Allocate from end when hint not set yet because no lgroups 814 * have been deleted yet 815 */ 816 lgrpid = nlgrps++; 817 else { 818 /* 819 * Start looking for next open slot from hint and leave hint 820 * at slot allocated 821 */ 822 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 823 my_lgrp = lgrp_table[i]; 824 if (!LGRP_EXISTS(my_lgrp)) { 825 lgrpid = i; 826 nlgrps++; 827 break; 828 } 829 } 830 lgrp_alloc_hint = lgrpid; 831 } 832 833 /* 834 * Keep track of max lgroup ID allocated so far to cut down on searches 835 */ 836 if (lgrpid > lgrp_alloc_max) 837 lgrp_alloc_max = lgrpid; 838 839 /* 840 * Need to allocate new lgroup if next open slot didn't have one 841 * for recycling 842 */ 843 if (my_lgrp == NULL) 844 my_lgrp = lgrp_plat_alloc(lgrpid); 845 846 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 847 panic("Too many lgrps for platform (%d)", nlgrps); 848 849 my_lgrp->lgrp_id = lgrpid; 850 my_lgrp->lgrp_latency = 0; 851 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 852 my_lgrp->lgrp_parent = NULL; 853 my_lgrp->lgrp_childcnt = 0; 854 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 855 my_lgrp->lgrp_nmnodes = 0; 856 klgrpset_clear(my_lgrp->lgrp_children); 857 klgrpset_clear(my_lgrp->lgrp_leaves); 858 for (i = 0; i < LGRP_RSRC_COUNT; i++) 859 klgrpset_clear(my_lgrp->lgrp_set[i]); 860 861 my_lgrp->lgrp_cpu = NULL; 862 my_lgrp->lgrp_cpucnt = 0; 863 864 if (my_lgrp->lgrp_kstat != NULL) 865 lgrp_kstat_reset(lgrpid); 866 867 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 868 869 return (my_lgrp); 870 } 871 872 void 873 lgrp_destroy(lgrp_t *lgrp) 874 { 875 int i; 876 877 /* 878 * Unless this lgroup is being destroyed on behalf of 879 * the boot CPU, cpu_lock must be held 880 */ 881 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 882 883 if (nlgrps == 1) 884 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 885 886 if (!LGRP_EXISTS(lgrp)) 887 return; 888 889 /* 890 * Set hint to lgroup being deleted and try to keep lower numbered 891 * hints to facilitate finding empty slots 892 */ 893 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 894 lgrp_alloc_hint = lgrp->lgrp_id; 895 896 /* 897 * Mark this lgroup to be recycled by setting its lgroup ID to 898 * LGRP_NONE and clear relevant fields 899 */ 900 lgrp->lgrp_id = LGRP_NONE; 901 lgrp->lgrp_latency = 0; 902 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 903 lgrp->lgrp_parent = NULL; 904 lgrp->lgrp_childcnt = 0; 905 906 klgrpset_clear(lgrp->lgrp_children); 907 klgrpset_clear(lgrp->lgrp_leaves); 908 for (i = 0; i < LGRP_RSRC_COUNT; i++) 909 klgrpset_clear(lgrp->lgrp_set[i]); 910 911 lgrp->lgrp_mnodes = (mnodeset_t)0; 912 lgrp->lgrp_nmnodes = 0; 913 914 lgrp->lgrp_cpu = NULL; 915 lgrp->lgrp_cpucnt = 0; 916 917 nlgrps--; 
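	/*
	 * The lgrp_t itself is left in place in lgrp_table[] (with its ID set
	 * to LGRP_NONE above) so that lgrp_create() can recycle it the next
	 * time a new lgroup is needed.
	 */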
918 } 919 920 /* 921 * Initialize kstat data. Called from lgrp intialization code. 922 */ 923 static void 924 lgrp_kstat_init(void) 925 { 926 lgrp_stat_t stat; 927 928 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 929 930 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 931 kstat_named_init(&lgrp_kstat_data[stat], 932 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 933 } 934 935 /* 936 * initialize an lgrp's kstats if needed 937 * called with cpu_lock held but not with cpus paused. 938 * we don't tear these down now because we don't know about 939 * memory leaving the lgrp yet... 940 */ 941 942 void 943 lgrp_kstat_create(cpu_t *cp) 944 { 945 kstat_t *lgrp_kstat; 946 lgrp_id_t lgrpid; 947 lgrp_t *my_lgrp; 948 949 ASSERT(MUTEX_HELD(&cpu_lock)); 950 951 lgrpid = cp->cpu_lpl->lpl_lgrpid; 952 my_lgrp = lgrp_table[lgrpid]; 953 954 if (my_lgrp->lgrp_kstat != NULL) 955 return; /* already initialized */ 956 957 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 958 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 959 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 960 961 if (lgrp_kstat != NULL) { 962 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 963 lgrp_kstat->ks_private = my_lgrp; 964 lgrp_kstat->ks_data = &lgrp_kstat_data; 965 lgrp_kstat->ks_update = lgrp_kstat_extract; 966 my_lgrp->lgrp_kstat = lgrp_kstat; 967 kstat_install(lgrp_kstat); 968 } 969 } 970 971 /* 972 * this will do something when we manage to remove now unused lgrps 973 */ 974 975 /* ARGSUSED */ 976 void 977 lgrp_kstat_destroy(cpu_t *cp) 978 { 979 ASSERT(MUTEX_HELD(&cpu_lock)); 980 } 981 982 /* 983 * Called when a CPU is off-lined. 984 */ 985 static void 986 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 987 { 988 lgrp_t *my_lgrp; 989 struct cpu *prev; 990 struct cpu *next; 991 992 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 993 994 prev = cp->cpu_prev_lgrp; 995 next = cp->cpu_next_lgrp; 996 997 prev->cpu_next_lgrp = next; 998 next->cpu_prev_lgrp = prev; 999 1000 /* 1001 * just because I'm paranoid doesn't mean... 
1002 */ 1003 1004 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1005 1006 my_lgrp = lgrp_table[lgrpid]; 1007 my_lgrp->lgrp_cpucnt--; 1008 1009 /* 1010 * Removing last CPU in lgroup, so update lgroup topology 1011 */ 1012 if (my_lgrp->lgrp_cpucnt == 0) { 1013 klgrpset_t changed; 1014 int count; 1015 int i; 1016 1017 my_lgrp->lgrp_cpu = NULL; 1018 1019 /* 1020 * Remove this lgroup from its lgroup CPU resources and remove 1021 * lgroup from lgroup topology if it doesn't have any more 1022 * resources in it now 1023 */ 1024 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1025 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1026 count = 0; 1027 klgrpset_clear(changed); 1028 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1029 lgrp_alloc_max + 1, &changed); 1030 return; 1031 } 1032 1033 /* 1034 * This lgroup isn't empty, so just remove it from CPU 1035 * resources of any lgroups that contain it as such 1036 */ 1037 for (i = 0; i <= lgrp_alloc_max; i++) { 1038 lgrp_t *lgrp; 1039 1040 lgrp = lgrp_table[i]; 1041 if (!LGRP_EXISTS(lgrp) || 1042 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1043 lgrpid)) 1044 continue; 1045 1046 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1047 } 1048 return; 1049 } 1050 1051 if (my_lgrp->lgrp_cpu == cp) 1052 my_lgrp->lgrp_cpu = next; 1053 1054 } 1055 1056 /* 1057 * Update memory nodes in target lgroups and return ones that get changed 1058 */ 1059 int 1060 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1061 { 1062 int count; 1063 int i; 1064 int j; 1065 lgrp_t *lgrp; 1066 lgrp_t *lgrp_rsrc; 1067 1068 count = 0; 1069 if (changed) 1070 klgrpset_clear(*changed); 1071 1072 if (klgrpset_isempty(target)) 1073 return (0); 1074 1075 /* 1076 * Find each lgroup in target lgroups 1077 */ 1078 for (i = 0; i <= lgrp_alloc_max; i++) { 1079 /* 1080 * Skip any lgroups that don't exist or aren't in target group 1081 */ 1082 lgrp = lgrp_table[i]; 1083 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1084 continue; 1085 } 1086 1087 /* 1088 * Initialize memnodes for intermediate lgroups to 0 1089 * and update them from scratch since they may have completely 1090 * changed 1091 */ 1092 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1093 lgrp->lgrp_mnodes = (mnodeset_t)0; 1094 lgrp->lgrp_nmnodes = 0; 1095 } 1096 1097 /* 1098 * Update memory nodes of of target lgroup with memory nodes 1099 * from each lgroup in its lgroup memory resource set 1100 */ 1101 for (j = 0; j <= lgrp_alloc_max; j++) { 1102 int k; 1103 1104 /* 1105 * Skip any lgroups that don't exist or aren't in 1106 * memory resources of target lgroup 1107 */ 1108 lgrp_rsrc = lgrp_table[j]; 1109 if (!LGRP_EXISTS(lgrp_rsrc) || 1110 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1111 j)) 1112 continue; 1113 1114 /* 1115 * Update target lgroup's memnodes to include memnodes 1116 * of this lgroup 1117 */ 1118 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1119 mnodeset_t mnode_mask; 1120 1121 mnode_mask = (mnodeset_t)1 << k; 1122 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1123 !(lgrp->lgrp_mnodes & mnode_mask)) { 1124 lgrp->lgrp_mnodes |= mnode_mask; 1125 lgrp->lgrp_nmnodes++; 1126 } 1127 } 1128 count++; 1129 if (changed) 1130 klgrpset_add(*changed, lgrp->lgrp_id); 1131 } 1132 } 1133 1134 return (count); 1135 } 1136 1137 /* 1138 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1139 * is moved from one board to another. The "from" and "to" arguments specify the 1140 * source and the destination of the move. 
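 *
 * In rough outline (a sketch of the call path as implemented in this file,
 * not of any platform's DR driver; rename_arg is an illustrative local name),
 * the rename request arrives through
 *
 *	lgrp_config(LGRP_CONFIG_MEM_RENAME, mnode, (uintptr_t)&rename_arg);
 *
 * where rename_arg is an lgrp_config_mem_rename_t carrying the old and new
 * handles; that lands in lgrp_mem_rename() below, which simply does
 * lgrp_mem_fini(mnode, from, B_TRUE) followed by lgrp_mem_init(mnode, to,
 * B_TRUE).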
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() function is called by the platform copy-rename code
 * to update the lgroup topology which is changing as memory moves from one
 * lgroup to another.  It removes the mnode from the source lgroup and
 * re-inserts it in the target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling.  If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy.  This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy.  If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held which prevents lgrp_mem_init() from
 * re-inserting the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes.  To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed.  The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
1227 * 1228 * NOTE: in special case of copy-rename of the only remaining mnode, 1229 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1230 * recognize this case and continue as usual, but skip the update to 1231 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1232 * in topology, temporarily introduced by lgrp_mem_fini(). 1233 */ 1234 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1235 lgrp_root->lgrp_mnodes & mnodes_mask) { 1236 if (drop_lock) 1237 mutex_exit(&cpu_lock); 1238 return; 1239 } 1240 1241 /* 1242 * Update lgroup topology with new memory resources, keeping track of 1243 * which lgroups change 1244 */ 1245 count = 0; 1246 klgrpset_clear(changed); 1247 my_lgrp = lgrp_hand_to_lgrp(hand); 1248 if (my_lgrp == NULL) { 1249 /* new lgrp */ 1250 my_lgrp = lgrp_create(); 1251 lgrpid = my_lgrp->lgrp_id; 1252 my_lgrp->lgrp_plathand = hand; 1253 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1254 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1255 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1256 1257 if (need_synch) 1258 pause_cpus(NULL); 1259 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1260 &changed); 1261 if (need_synch) 1262 start_cpus(); 1263 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1264 > 0) { 1265 /* 1266 * Leaf lgroup was created, but latency wasn't available 1267 * then. So, set latency for it and fill in rest of lgroup 1268 * topology now that we know how far it is from other leaf 1269 * lgroups. 1270 */ 1271 klgrpset_clear(changed); 1272 lgrpid = my_lgrp->lgrp_id; 1273 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1274 lgrpid)) 1275 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1276 if (need_synch) 1277 pause_cpus(NULL); 1278 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1279 &changed); 1280 if (need_synch) 1281 start_cpus(); 1282 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1283 my_lgrp->lgrp_id)) { 1284 /* 1285 * Add new lgroup memory resource to existing lgroup 1286 */ 1287 lgrpid = my_lgrp->lgrp_id; 1288 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1289 klgrpset_add(changed, lgrpid); 1290 count++; 1291 for (i = 0; i <= lgrp_alloc_max; i++) { 1292 lgrp_t *lgrp; 1293 1294 lgrp = lgrp_table[i]; 1295 if (!LGRP_EXISTS(lgrp) || 1296 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1297 continue; 1298 1299 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1300 klgrpset_add(changed, lgrp->lgrp_id); 1301 count++; 1302 } 1303 } 1304 1305 /* 1306 * Add memory node to lgroup and remove lgroup from ones that need 1307 * to be updated 1308 */ 1309 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1310 my_lgrp->lgrp_mnodes |= mnodes_mask; 1311 my_lgrp->lgrp_nmnodes++; 1312 } 1313 klgrpset_del(changed, lgrpid); 1314 1315 /* 1316 * Update memory node information for all lgroups that changed and 1317 * contain new memory node as a resource 1318 */ 1319 if (count) 1320 (void) lgrp_mnode_update(changed, NULL); 1321 1322 if (drop_lock) 1323 mutex_exit(&cpu_lock); 1324 } 1325 1326 /* 1327 * Called to indicate that the lgroup associated with the platform 1328 * handle "hand" no longer contains given memory node 1329 * 1330 * LOCKING for this routine is a bit tricky. Usually it is called without 1331 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1332 * callers. During DR of the board containing the caged memory it may be called 1333 * with cpu_lock already held and CPUs paused. 
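 * The drop_lock and need_synch locals below record which of those two
 * situations applies, so that cpu_lock is only dropped here if it was
 * acquired here, and CPUs are only paused around the topology update if
 * they were not already paused by the caller.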
1334 * 1335 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1336 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1337 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1338 * the same mnode back into the topology. See lgrp_mem_rename() and 1339 * lgrp_mem_init() for additional details. 1340 */ 1341 void 1342 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1343 { 1344 klgrpset_t changed; 1345 int count; 1346 int i; 1347 lgrp_t *my_lgrp; 1348 lgrp_id_t lgrpid; 1349 mnodeset_t mnodes_mask; 1350 boolean_t drop_lock = B_FALSE; 1351 boolean_t need_synch = B_FALSE; 1352 1353 /* 1354 * Grab CPU lock (if we haven't already) 1355 */ 1356 if (!MUTEX_HELD(&cpu_lock)) { 1357 mutex_enter(&cpu_lock); 1358 drop_lock = B_TRUE; 1359 } 1360 1361 /* 1362 * This routine may be called from a context where we already 1363 * hold cpu_lock and have already paused cpus. 1364 */ 1365 if (!cpus_paused()) 1366 need_synch = B_TRUE; 1367 1368 my_lgrp = lgrp_hand_to_lgrp(hand); 1369 1370 /* 1371 * The lgrp *must* be pre-existing 1372 */ 1373 ASSERT(my_lgrp != NULL); 1374 1375 /* 1376 * Delete memory node from lgroups which contain it 1377 */ 1378 mnodes_mask = ((mnodeset_t)1 << mnode); 1379 for (i = 0; i <= lgrp_alloc_max; i++) { 1380 lgrp_t *lgrp = lgrp_table[i]; 1381 /* 1382 * Skip any non-existent lgroups and any lgroups that don't 1383 * contain leaf lgroup of memory as a memory resource 1384 */ 1385 if (!LGRP_EXISTS(lgrp) || 1386 !(lgrp->lgrp_mnodes & mnodes_mask)) 1387 continue; 1388 1389 /* 1390 * Avoid removing the last mnode from the root in the DR 1391 * copy-rename case. See lgrp_mem_rename() for details. 1392 */ 1393 if (is_copy_rename && 1394 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1395 continue; 1396 1397 /* 1398 * Remove memory node from lgroup. 1399 */ 1400 lgrp->lgrp_mnodes &= ~mnodes_mask; 1401 lgrp->lgrp_nmnodes--; 1402 ASSERT(lgrp->lgrp_nmnodes >= 0); 1403 } 1404 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1405 1406 /* 1407 * Don't need to update lgroup topology if this lgroup still has memory. 1408 * 1409 * In the special case of DR copy-rename with the only mnode being 1410 * removed, the lgrp_mnodes for the root is always non-zero, but we 1411 * still need to update the lgroup topology. 
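 * The check below therefore returns early only when this lgroup still has
 * memory and we are not in that special root copy-rename case.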
1412 */ 1413 if ((my_lgrp->lgrp_nmnodes > 0) && 1414 !(is_copy_rename && 1415 (my_lgrp == lgrp_root) && 1416 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1417 if (drop_lock) 1418 mutex_exit(&cpu_lock); 1419 return; 1420 } 1421 1422 /* 1423 * This lgroup does not contain any memory now 1424 */ 1425 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1426 1427 /* 1428 * Remove this lgroup from lgroup topology if it does not contain any 1429 * resources now 1430 */ 1431 lgrpid = my_lgrp->lgrp_id; 1432 count = 0; 1433 klgrpset_clear(changed); 1434 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1435 /* 1436 * Delete lgroup when no more resources 1437 */ 1438 if (need_synch) 1439 pause_cpus(NULL); 1440 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1441 lgrp_alloc_max + 1, &changed); 1442 ASSERT(count > 0); 1443 if (need_synch) 1444 start_cpus(); 1445 } else { 1446 /* 1447 * Remove lgroup from memory resources of any lgroups that 1448 * contain it as such 1449 */ 1450 for (i = 0; i <= lgrp_alloc_max; i++) { 1451 lgrp_t *lgrp; 1452 1453 lgrp = lgrp_table[i]; 1454 if (!LGRP_EXISTS(lgrp) || 1455 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1456 lgrpid)) 1457 continue; 1458 1459 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1460 } 1461 } 1462 if (drop_lock) 1463 mutex_exit(&cpu_lock); 1464 } 1465 1466 /* 1467 * Return lgroup with given platform handle 1468 */ 1469 lgrp_t * 1470 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1471 { 1472 int i; 1473 lgrp_t *lgrp; 1474 1475 if (hand == LGRP_NULL_HANDLE) 1476 return (NULL); 1477 1478 for (i = 0; i <= lgrp_alloc_max; i++) { 1479 lgrp = lgrp_table[i]; 1480 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1481 return (lgrp); 1482 } 1483 return (NULL); 1484 } 1485 1486 /* 1487 * Return the home lgroup of the current thread. 1488 * We must do this with kernel preemption disabled, since we don't want our 1489 * thread to be re-homed while we're poking around with its lpl, and the lpl 1490 * should never be NULL. 1491 * 1492 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1493 * is enabled because of DR. Callers can use disable kernel preemption 1494 * around this call to guarantee that the lgroup will be valid beyond this 1495 * routine, since kernel preemption can be recursive. 1496 */ 1497 lgrp_t * 1498 lgrp_home_lgrp(void) 1499 { 1500 lgrp_t *lgrp; 1501 lpl_t *lpl; 1502 1503 kpreempt_disable(); 1504 1505 lpl = curthread->t_lpl; 1506 ASSERT(lpl != NULL); 1507 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1508 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1509 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1510 1511 kpreempt_enable(); 1512 1513 return (lgrp); 1514 } 1515 1516 /* 1517 * Return ID of home lgroup for given thread 1518 * (See comments for lgrp_home_lgrp() for special care and handling 1519 * instructions) 1520 */ 1521 lgrp_id_t 1522 lgrp_home_id(kthread_t *t) 1523 { 1524 lgrp_id_t lgrp; 1525 lpl_t *lpl; 1526 1527 ASSERT(t != NULL); 1528 /* 1529 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1530 * cannot since the HAT layer can call into this routine to 1531 * determine the locality for its data structures in the context 1532 * of a page fault. 
1533 */ 1534 1535 kpreempt_disable(); 1536 1537 lpl = t->t_lpl; 1538 ASSERT(lpl != NULL); 1539 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1540 lgrp = lpl->lpl_lgrpid; 1541 1542 kpreempt_enable(); 1543 1544 return (lgrp); 1545 } 1546 1547 /* 1548 * Return lgroup containing the physical memory for the given page frame number 1549 */ 1550 lgrp_t * 1551 lgrp_pfn_to_lgrp(pfn_t pfn) 1552 { 1553 lgrp_handle_t hand; 1554 int i; 1555 lgrp_t *lgrp; 1556 1557 hand = lgrp_plat_pfn_to_hand(pfn); 1558 if (hand != LGRP_NULL_HANDLE) 1559 for (i = 0; i <= lgrp_alloc_max; i++) { 1560 lgrp = lgrp_table[i]; 1561 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1562 return (lgrp); 1563 } 1564 return (NULL); 1565 } 1566 1567 /* 1568 * Return lgroup containing the physical memory for the given page frame number 1569 */ 1570 lgrp_t * 1571 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1572 { 1573 lgrp_handle_t hand; 1574 int i; 1575 lgrp_t *lgrp; 1576 pfn_t pfn; 1577 1578 pfn = btop(physaddr); 1579 hand = lgrp_plat_pfn_to_hand(pfn); 1580 if (hand != LGRP_NULL_HANDLE) 1581 for (i = 0; i <= lgrp_alloc_max; i++) { 1582 lgrp = lgrp_table[i]; 1583 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1584 return (lgrp); 1585 } 1586 return (NULL); 1587 } 1588 1589 /* 1590 * Return the leaf lgroup containing the given CPU 1591 * 1592 * The caller needs to take precautions necessary to prevent 1593 * "cpu", and it's lpl from going away across a call to this function. 1594 * hint: kpreempt_disable()/kpreempt_enable() 1595 */ 1596 static lgrp_t * 1597 lgrp_cpu_to_lgrp(cpu_t *cpu) 1598 { 1599 return (cpu->cpu_lpl->lpl_lgrp); 1600 } 1601 1602 /* 1603 * Return the sum of the partition loads in an lgrp divided by 1604 * the number of CPUs in the lgrp. This is our best approximation 1605 * of an 'lgroup load average' for a useful per-lgroup kstat. 1606 */ 1607 static uint64_t 1608 lgrp_sum_loadavgs(lgrp_t *lgrp) 1609 { 1610 cpu_t *cpu; 1611 int ncpu; 1612 uint64_t loads = 0; 1613 1614 mutex_enter(&cpu_lock); 1615 1616 cpu = lgrp->lgrp_cpu; 1617 ncpu = lgrp->lgrp_cpucnt; 1618 1619 if (cpu == NULL || ncpu == 0) { 1620 mutex_exit(&cpu_lock); 1621 return (0ull); 1622 } 1623 1624 do { 1625 loads += cpu->cpu_lpl->lpl_loadavg; 1626 cpu = cpu->cpu_next_lgrp; 1627 } while (cpu != lgrp->lgrp_cpu); 1628 1629 mutex_exit(&cpu_lock); 1630 1631 return (loads / ncpu); 1632 } 1633 1634 void 1635 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1636 { 1637 struct lgrp_stats *pstats; 1638 1639 /* 1640 * Verify that the caller isn't trying to add to 1641 * a statistic for an lgroup that has gone away 1642 */ 1643 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1644 return; 1645 1646 pstats = &lgrp_stats[lgrpid]; 1647 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1648 } 1649 1650 int64_t 1651 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1652 { 1653 uint64_t val; 1654 struct lgrp_stats *pstats; 1655 1656 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1657 return ((int64_t)0); 1658 1659 pstats = &lgrp_stats[lgrpid]; 1660 LGRP_STAT_READ(pstats, stat, val); 1661 return (val); 1662 } 1663 1664 /* 1665 * Reset all kstats for lgrp specified by its lgrpid. 
1666 */ 1667 static void 1668 lgrp_kstat_reset(lgrp_id_t lgrpid) 1669 { 1670 lgrp_stat_t stat; 1671 1672 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1673 return; 1674 1675 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1676 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1677 } 1678 } 1679 1680 /* 1681 * Collect all per-lgrp statistics for the lgrp associated with this 1682 * kstat, and store them in the ks_data array. 1683 * 1684 * The superuser can reset all the running counter statistics for an 1685 * lgrp by writing to any of the lgrp's stats. 1686 */ 1687 static int 1688 lgrp_kstat_extract(kstat_t *ksp, int rw) 1689 { 1690 lgrp_stat_t stat; 1691 struct kstat_named *ksd; 1692 lgrp_t *lgrp; 1693 lgrp_id_t lgrpid; 1694 1695 lgrp = (lgrp_t *)ksp->ks_private; 1696 1697 ksd = (struct kstat_named *)ksp->ks_data; 1698 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1699 1700 lgrpid = lgrp->lgrp_id; 1701 1702 if (lgrpid == LGRP_NONE) { 1703 /* 1704 * Return all zeroes as stats for freed lgrp. 1705 */ 1706 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1707 ksd[stat].value.i64 = 0; 1708 } 1709 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1710 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1711 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1712 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1713 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1714 } else if (rw != KSTAT_WRITE) { 1715 /* 1716 * Handle counter stats 1717 */ 1718 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1719 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1720 } 1721 1722 /* 1723 * Handle kernel data snapshot stats 1724 */ 1725 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1726 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1727 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1728 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1729 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1730 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1731 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1732 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1733 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1734 lgrp_loadavg_max_effect; 1735 } else { 1736 lgrp_kstat_reset(lgrpid); 1737 } 1738 1739 return (0); 1740 } 1741 1742 int 1743 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1744 { 1745 cpu_t *cp; 1746 1747 mutex_enter(&cpu_lock); 1748 1749 if ((cp = cpu_get(id)) == NULL) { 1750 mutex_exit(&cpu_lock); 1751 return (EINVAL); 1752 } 1753 1754 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1755 mutex_exit(&cpu_lock); 1756 return (EINVAL); 1757 } 1758 1759 ASSERT(cp->cpu_lpl != NULL); 1760 1761 *lp = cp->cpu_lpl->lpl_lgrpid; 1762 1763 mutex_exit(&cpu_lock); 1764 1765 return (0); 1766 } 1767 1768 int 1769 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1770 { 1771 cpu_t *cp; 1772 1773 mutex_enter(&cpu_lock); 1774 1775 if ((cp = cpu_get(id)) == NULL) { 1776 mutex_exit(&cpu_lock); 1777 return (EINVAL); 1778 } 1779 1780 ASSERT(cp->cpu_lpl != NULL); 1781 1782 *lp = cp->cpu_lpl->lpl_loadavg; 1783 1784 mutex_exit(&cpu_lock); 1785 1786 return (0); 1787 } 1788 1789 /* 1790 * Add a resource named by lpl_leaf to rset of lpl_target 1791 * 1792 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1793 * resource. It is adjusted here, as this is presently the only place that we 1794 * can be certain a resource addition has succeeded. 1795 * 1796 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1797 * list in order until it reaches a NULL. 
(This list is required to be NULL 1798 * terminated, too). This is done so that we can mark start pos + 1, so that 1799 * each lpl is traversed sequentially, but in a different order. We hope this 1800 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 1801 */ 1802 1803 void 1804 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1805 { 1806 int i; 1807 int entry_slot = 0; 1808 1809 /* return if leaf is already present */ 1810 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1811 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1812 return; 1813 } 1814 1815 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1816 lpl_leaf->lpl_lgrpid) { 1817 break; 1818 } 1819 } 1820 1821 /* insert leaf, update counts */ 1822 entry_slot = i; 1823 i = lpl_target->lpl_nrset++; 1824 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1825 panic("More leaf lgrps in system than are supported!\n"); 1826 } 1827 1828 /* 1829 * Start at the end of the rset array and work backwards towards the 1830 * slot into which the new lpl will be inserted. This effectively 1831 * preserves the current ordering by scooting everybody over one entry, 1832 * and placing the new entry into the space created. 1833 */ 1834 1835 while (i-- > entry_slot) { 1836 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1837 } 1838 1839 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1840 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1841 } 1842 1843 /* 1844 * Update each of lpl_parent's children with a proper hint and 1845 * a reference to their parent. 1846 * The lgrp topology is used as the reference since it is fully 1847 * consistent and correct at this point. 1848 * 1849 * Each child's hint will reference an element in lpl_parent's 1850 * rset that designates where the child should start searching 1851 * for CPU resources. The hint selected is the highest order leaf present 1852 * in the child's lineage. 1853 * 1854 * This should be called after any potential change in lpl_parent's 1855 * rset. 1856 */ 1857 static void 1858 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1859 { 1860 klgrpset_t children, leaves; 1861 lpl_t *lpl; 1862 int hint; 1863 int i, j; 1864 1865 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1866 if (klgrpset_isempty(children)) 1867 return; /* nothing to do */ 1868 1869 for (i = 0; i <= lgrp_alloc_max; i++) { 1870 if (klgrpset_ismember(children, i)) { 1871 1872 /* 1873 * Given the set of leaves in this child's lineage, 1874 * find the highest order leaf present in the parent's 1875 * rset. Select this as the hint for the child. 1876 */ 1877 leaves = lgrp_table[i]->lgrp_leaves; 1878 hint = 0; 1879 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1880 lpl = lpl_parent->lpl_rset[j]; 1881 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1882 hint = j; 1883 } 1884 cp->cp_lgrploads[i].lpl_hint = hint; 1885 1886 /* 1887 * (Re)set the parent. It may be incorrect if 1888 * lpl_parent is new in the topology. 1889 */ 1890 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1891 } 1892 } 1893 } 1894 1895 /* 1896 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1897 * 1898 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1899 * resource. The values are adjusted here, as this is the only place that we can 1900 * be certain a resource was successfully deleted. 
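 *
 * As a small worked example (the lgroup IDs are illustrative only): if the
 * target's rset currently holds the leaf lpls for lgroups { 1, 3, 5 }
 * followed by a NULL terminator, deleting the leaf for lgroup 3 shifts the
 * later entries down so that the rset becomes { 1, 5 }, still NULL
 * terminated, and lpl_nrset drops from 3 to 2.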
1901 */ 1902 void 1903 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1904 { 1905 int i; 1906 1907 /* find leaf in intermediate node */ 1908 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1909 if (lpl_target->lpl_rset[i] == lpl_leaf) 1910 break; 1911 } 1912 1913 /* return if leaf not found */ 1914 if (lpl_target->lpl_rset[i] != lpl_leaf) 1915 return; 1916 1917 /* prune leaf, compress array */ 1918 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1919 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1920 lpl_target->lpl_ncpu--; 1921 do { 1922 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1923 } while (i++ < lpl_target->lpl_nrset); 1924 } 1925 1926 /* 1927 * Check to see if the resource set of the target lpl contains the 1928 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1929 */ 1930 1931 int 1932 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1933 { 1934 int i; 1935 1936 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1937 if (lpl_target->lpl_rset[i] == lpl_leaf) 1938 return (1); 1939 } 1940 1941 return (0); 1942 } 1943 1944 /* 1945 * Called when we change cpu lpl membership. This increments or decrements the 1946 * per-cpu counter in every lpl in which our leaf appears. 1947 */ 1948 void 1949 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1950 { 1951 cpupart_t *cpupart; 1952 lgrp_t *lgrp_leaf; 1953 lgrp_t *lgrp_cur; 1954 lpl_t *lpl_leaf; 1955 lpl_t *lpl_cur; 1956 int i; 1957 1958 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1959 1960 cpupart = cp->cpu_part; 1961 lpl_leaf = cp->cpu_lpl; 1962 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1963 1964 for (i = 0; i <= lgrp_alloc_max; i++) { 1965 lgrp_cur = lgrp_table[i]; 1966 1967 /* 1968 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 1969 * for the cpu in question, or if the current lgrp and leaf 1970 * don't share the same resources. 1971 */ 1972 1973 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 1974 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 1975 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 1976 continue; 1977 1978 1979 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 1980 1981 if (lpl_cur->lpl_nrset > 0) { 1982 if (act == LPL_INCREMENT) { 1983 lpl_cur->lpl_ncpu++; 1984 } else if (act == LPL_DECREMENT) { 1985 lpl_cur->lpl_ncpu--; 1986 } 1987 } 1988 } 1989 } 1990 1991 /* 1992 * Initialize lpl with given resources and specified lgrp 1993 */ 1994 1995 void 1996 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 1997 { 1998 lpl->lpl_lgrpid = lgrp->lgrp_id; 1999 lpl->lpl_loadavg = 0; 2000 if (lpl == lpl_leaf) 2001 lpl->lpl_ncpu = 1; 2002 else 2003 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2004 lpl->lpl_nrset = 1; 2005 lpl->lpl_rset[0] = lpl_leaf; 2006 lpl->lpl_lgrp = lgrp; 2007 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2008 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2009 } 2010 2011 /* 2012 * Clear an unused lpl 2013 */ 2014 2015 void 2016 lpl_clear(lpl_t *lpl) 2017 { 2018 lgrp_id_t lid; 2019 2020 /* save lid for debugging purposes */ 2021 lid = lpl->lpl_lgrpid; 2022 bzero(lpl, sizeof (lpl_t)); 2023 lpl->lpl_lgrpid = lid; 2024 } 2025 2026 /* 2027 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2028 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2029 * make full use of all of the lgroup topology, but this checks to make sure 2030 * that for the parts that it does use, it has correctly understood the 2031 * relationships that exist. 
This function returns 2032 * 0 if the topology is correct, and a non-zero error code, for non-debug 2033 * kernels if incorrect. Asserts are spread throughout the code to aid in 2034 * debugging on a DEBUG kernel. 2035 */ 2036 int 2037 lpl_topo_verify(cpupart_t *cpupart) 2038 { 2039 lgrp_t *lgrp; 2040 lpl_t *lpl; 2041 klgrpset_t rset; 2042 klgrpset_t cset; 2043 cpu_t *cpu; 2044 cpu_t *cp_start; 2045 int i; 2046 int j; 2047 int sum; 2048 2049 /* topology can't be incorrect if it doesn't exist */ 2050 if (!lgrp_topo_initialized || !lgrp_initialized) 2051 return (LPL_TOPO_CORRECT); 2052 2053 ASSERT(cpupart != NULL); 2054 2055 for (i = 0; i <= lgrp_alloc_max; i++) { 2056 lgrp = lgrp_table[i]; 2057 lpl = NULL; 2058 /* make sure lpls are allocated */ 2059 ASSERT(cpupart->cp_lgrploads); 2060 if (!cpupart->cp_lgrploads) 2061 return (LPL_TOPO_PART_HAS_NO_LPL); 2062 2063 lpl = &cpupart->cp_lgrploads[i]; 2064 /* make sure our index is good */ 2065 ASSERT(i < cpupart->cp_nlgrploads); 2066 2067 /* if lgroup doesn't exist, make sure lpl is empty */ 2068 if (!LGRP_EXISTS(lgrp)) { 2069 ASSERT(lpl->lpl_ncpu == 0); 2070 if (lpl->lpl_ncpu > 0) { 2071 return (LPL_TOPO_CPUS_NOT_EMPTY); 2072 } else { 2073 continue; 2074 } 2075 } 2076 2077 /* verify that lgroup and lpl are identically numbered */ 2078 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2079 2080 /* if lgroup isn't in our partition, make sure lpl is empty */ 2081 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2082 cpupart->cp_lgrpset)) { 2083 ASSERT(lpl->lpl_ncpu == 0); 2084 if (lpl->lpl_ncpu > 0) { 2085 return (LPL_TOPO_CPUS_NOT_EMPTY); 2086 } 2087 /* 2088 * lpl is empty, and lgroup isn't in partition. verify 2089 * that lpl doesn't show up in anyone else's rsets (in 2090 * this partition, anyway) 2091 */ 2092 2093 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2094 lpl_t *i_lpl; /* lpl we're iterating over */ 2095 2096 i_lpl = &cpupart->cp_lgrploads[j]; 2097 2098 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2099 if (lpl_rset_contains(i_lpl, lpl)) { 2100 return (LPL_TOPO_LPL_ORPHANED); 2101 } 2102 } 2103 /* lgroup is empty, and everything is ok. continue */ 2104 continue; 2105 } 2106 2107 2108 /* lgroup is in this partition, now check it against lpl */ 2109 2110 /* do both have matching lgrps? */ 2111 ASSERT(lgrp == lpl->lpl_lgrp); 2112 if (lgrp != lpl->lpl_lgrp) { 2113 return (LPL_TOPO_LGRP_MISMATCH); 2114 } 2115 2116 /* do the parent lgroups exist and do they match? */ 2117 if (lgrp->lgrp_parent) { 2118 ASSERT(lpl->lpl_parent); 2119 ASSERT(lgrp->lgrp_parent->lgrp_id == 2120 lpl->lpl_parent->lpl_lgrpid); 2121 2122 if (!lpl->lpl_parent) { 2123 return (LPL_TOPO_MISSING_PARENT); 2124 } else if (lgrp->lgrp_parent->lgrp_id != 2125 lpl->lpl_parent->lpl_lgrpid) { 2126 return (LPL_TOPO_PARENT_MISMATCH); 2127 } 2128 } 2129 2130 /* only leaf lgroups keep a cpucnt, only check leaves */ 2131 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2132 2133 /* verify that lgrp is also a leaf */ 2134 ASSERT((lgrp->lgrp_childcnt == 0) && 2135 (klgrpset_ismember(lgrp->lgrp_leaves, 2136 lpl->lpl_lgrpid))); 2137 2138 if ((lgrp->lgrp_childcnt > 0) || 2139 (!klgrpset_ismember(lgrp->lgrp_leaves, 2140 lpl->lpl_lgrpid))) { 2141 return (LPL_TOPO_LGRP_NOT_LEAF); 2142 } 2143 2144 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2145 (lpl->lpl_ncpu > 0)); 2146 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2147 (lpl->lpl_ncpu <= 0)) { 2148 return (LPL_TOPO_BAD_CPUCNT); 2149 } 2150 2151 /* 2152 * Check that lpl_ncpu also matches the number of 2153 * cpus in the lpl's linked list. 
This only exists in 2154 * leaves, but they should always match. 2155 */ 2156 j = 0; 2157 cpu = cp_start = lpl->lpl_cpus; 2158 while (cpu != NULL) { 2159 j++; 2160 2161 /* check to make sure cpu's lpl is leaf lpl */ 2162 ASSERT(cpu->cpu_lpl == lpl); 2163 if (cpu->cpu_lpl != lpl) { 2164 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2165 } 2166 2167 /* check next cpu */ 2168 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2169 continue; 2170 } else { 2171 cpu = NULL; 2172 } 2173 } 2174 2175 ASSERT(j == lpl->lpl_ncpu); 2176 if (j != lpl->lpl_ncpu) { 2177 return (LPL_TOPO_LPL_BAD_NCPU); 2178 } 2179 2180 /* 2181 * Also, check that leaf lpl is contained in all 2182 * intermediate lpls that name the leaf as a descendant 2183 */ 2184 2185 for (j = 0; j <= lgrp_alloc_max; j++) { 2186 klgrpset_t intersect; 2187 lgrp_t *lgrp_cand; 2188 lpl_t *lpl_cand; 2189 2190 lgrp_cand = lgrp_table[j]; 2191 intersect = klgrpset_intersects( 2192 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2193 cpupart->cp_lgrpset); 2194 2195 if (!LGRP_EXISTS(lgrp_cand) || 2196 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2197 cpupart->cp_lgrpset) || 2198 (intersect == 0)) 2199 continue; 2200 2201 lpl_cand = 2202 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2203 2204 if (klgrpset_ismember(intersect, 2205 lgrp->lgrp_id)) { 2206 ASSERT(lpl_rset_contains(lpl_cand, 2207 lpl)); 2208 2209 if (!lpl_rset_contains(lpl_cand, lpl)) { 2210 return (LPL_TOPO_RSET_MSSNG_LF); 2211 } 2212 } 2213 } 2214 2215 } else { /* non-leaf specific checks */ 2216 2217 /* 2218 * Non-leaf lpls should have lpl_cpus == NULL 2219 * verify that this is so 2220 */ 2221 ASSERT(lpl->lpl_cpus == NULL); 2222 if (lpl->lpl_cpus != NULL) { 2223 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2224 } 2225 2226 /* 2227 * verify that the sum of the cpus in the leaf resources 2228 * is equal to the total ncpu in the intermediate 2229 */ 2230 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2231 sum += lpl->lpl_rset[j]->lpl_ncpu; 2232 } 2233 2234 ASSERT(sum == lpl->lpl_ncpu); 2235 if (sum != lpl->lpl_ncpu) { 2236 return (LPL_TOPO_LPL_BAD_NCPU); 2237 } 2238 } 2239 2240 /* 2241 * check on lpl_hint. Don't check root, since it has no parent. 2242 */ 2243 if (lpl->lpl_parent != NULL) { 2244 int hint; 2245 lpl_t *hint_lpl; 2246 2247 /* make sure hint is within limits of nrset */ 2248 hint = lpl->lpl_hint; 2249 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2250 if (lpl->lpl_parent->lpl_nrset < hint) { 2251 return (LPL_TOPO_BOGUS_HINT); 2252 } 2253 2254 /* make sure hint points to valid lpl */ 2255 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2256 ASSERT(hint_lpl->lpl_ncpu > 0); 2257 if (hint_lpl->lpl_ncpu <= 0) { 2258 return (LPL_TOPO_BOGUS_HINT); 2259 } 2260 } 2261 2262 /* 2263 * Check the rset of the lpl in question. Make sure that each 2264 * rset contains a subset of the resources in 2265 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2266 * sure that each rset doesn't include resources that are 2267 * outside of that set. (Which would be resources somehow not 2268 * accounted for). 
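 *
 * A small worked example (lgroup ids are hypothetical): if the rset built
 * below names lgroups { 1, 3 } while lgrp_set[LGRP_RSRC_CPU] is { 1, 3, 5 }
 * and cp_lgrpset is { 1, 3 }, both klgrpset_diff() calls leave empty sets
 * and the check passes; a stray rset entry for, say, lgroup 7 would survive
 * either diff and the routine would return LPL_TOPO_RSET_MISMATCH.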
2269 */ 2270 2271 klgrpset_clear(rset); 2272 for (j = 0; j < lpl->lpl_nrset; j++) { 2273 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2274 } 2275 klgrpset_copy(cset, rset); 2276 /* make sure lpl rset matches lgrp rset */ 2277 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2278 /* make sure rset is contained with in partition, too */ 2279 klgrpset_diff(cset, cpupart->cp_lgrpset); 2280 2281 ASSERT(klgrpset_isempty(rset) && 2282 klgrpset_isempty(cset)); 2283 if (!klgrpset_isempty(rset) || 2284 !klgrpset_isempty(cset)) { 2285 return (LPL_TOPO_RSET_MISMATCH); 2286 } 2287 2288 /* 2289 * check to make sure lpl_nrset matches the number of rsets 2290 * contained in the lpl 2291 */ 2292 2293 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2294 j++); 2295 2296 ASSERT(j == lpl->lpl_nrset); 2297 if (j != lpl->lpl_nrset) { 2298 return (LPL_TOPO_BAD_RSETCNT); 2299 } 2300 2301 } 2302 return (LPL_TOPO_CORRECT); 2303 } 2304 2305 /* 2306 * Flatten lpl topology to given number of levels. This is presently only 2307 * implemented for a flatten to 2 levels, which will prune out the intermediates 2308 * and home the leaf lpls to the root lpl. 2309 */ 2310 int 2311 lpl_topo_flatten(int levels) 2312 { 2313 int i; 2314 uint_t sum; 2315 lgrp_t *lgrp_cur; 2316 lpl_t *lpl_cur; 2317 lpl_t *lpl_root; 2318 cpupart_t *cp; 2319 2320 if (levels != 2) 2321 return (0); 2322 2323 /* called w/ cpus paused - grab no locks! */ 2324 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2325 !lgrp_initialized); 2326 2327 cp = cp_list_head; 2328 do { 2329 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2330 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2331 2332 for (i = 0; i <= lgrp_alloc_max; i++) { 2333 lgrp_cur = lgrp_table[i]; 2334 lpl_cur = &cp->cp_lgrploads[i]; 2335 2336 if ((lgrp_cur == lgrp_root) || 2337 (!LGRP_EXISTS(lgrp_cur) && 2338 (lpl_cur->lpl_ncpu == 0))) 2339 continue; 2340 2341 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2342 /* 2343 * this should be a deleted intermediate, so 2344 * clear it 2345 */ 2346 lpl_clear(lpl_cur); 2347 } else if ((lpl_cur->lpl_nrset == 1) && 2348 (lpl_cur->lpl_rset[0] == lpl_cur) && 2349 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2350 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2351 /* 2352 * this is a leaf whose parent was deleted, or 2353 * whose parent had their lgrp deleted. (And 2354 * whose parent will soon be deleted). Point 2355 * this guy back to the root lpl. 2356 */ 2357 lpl_cur->lpl_parent = lpl_root; 2358 lpl_rset_add(lpl_root, lpl_cur); 2359 } 2360 2361 } 2362 2363 /* 2364 * Now that we're done, make sure the count on the root lpl is 2365 * correct, and update the hints of the children for the sake of 2366 * thoroughness 2367 */ 2368 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2369 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2370 } 2371 lpl_root->lpl_ncpu = sum; 2372 lpl_child_update(lpl_root, cp); 2373 2374 cp = cp->cp_next; 2375 } while (cp != cp_list_head); 2376 2377 return (levels); 2378 } 2379 2380 /* 2381 * Insert a lpl into the resource hierarchy and create any additional lpls that 2382 * are necessary to represent the varying states of locality for the cpu 2383 * resoruces newly added to the partition. 2384 * 2385 * This routine is clever enough that it can correctly add resources from the 2386 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2387 * those for which the lpl is a leaf as opposed to simply a named equally local 2388 * resource). 
The one special case that needs additional processing is when a 2389 * new intermediate lpl is introduced. Since the main loop only traverses 2390 * looking to add the leaf resource where it does not yet exist, additional work 2391 * is necessary to add other leaf resources that may need to exist in the newly 2392 * created intermediate. This is performed by the second inner loop, and is 2393 * only done when the check for more than one overlapping resource succeeds. 2394 */ 2395 2396 void 2397 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2398 { 2399 int i; 2400 int j; 2401 int hint; 2402 int rset_num_intersect; 2403 lgrp_t *lgrp_cur; 2404 lpl_t *lpl_cur; 2405 lpl_t *lpl_parent; 2406 lgrp_id_t parent_id; 2407 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2408 2409 for (i = 0; i <= lgrp_alloc_max; i++) { 2410 lgrp_cur = lgrp_table[i]; 2411 2412 /* 2413 * Don't insert if the lgrp isn't there, if the leaf isn't 2414 * contained within the current lgrp, or if the current lgrp has 2415 * no leaves in this partition 2416 */ 2417 2418 if (!LGRP_EXISTS(lgrp_cur) || 2419 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2420 lpl_leaf->lpl_lgrpid) || 2421 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2422 cpupart->cp_lgrpset)) 2423 continue; 2424 2425 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2426 if (lgrp_cur->lgrp_parent != NULL) { 2427 /* if lgrp has a parent, assign it properly */ 2428 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2429 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2430 } else { 2431 /* if not, make sure parent ptr gets set to null */ 2432 lpl_parent = NULL; 2433 } 2434 2435 if (lpl_cur == lpl_leaf) { 2436 /* 2437 * Almost all leaf state was initialized elsewhere. The 2438 * only thing left to do is to set the parent. 2439 */ 2440 lpl_cur->lpl_parent = lpl_parent; 2441 continue; 2442 } 2443 2444 /* 2445 * Initialize intermediate lpl 2446 * Save this lpl's hint though. Since we're changing this 2447 * lpl's resources, we need to update the hint in this lpl's 2448 * children, but the hint in this lpl is unaffected and 2449 * should be preserved. 2450 */ 2451 hint = lpl_cur->lpl_hint; 2452 2453 lpl_clear(lpl_cur); 2454 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2455 2456 lpl_cur->lpl_hint = hint; 2457 lpl_cur->lpl_parent = lpl_parent; 2458 2459 /* does new lpl need to be populated with other resources? */ 2460 rset_intersect = 2461 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2462 cpupart->cp_lgrpset); 2463 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2464 2465 if (rset_num_intersect > 1) { 2466 /* 2467 * If so, figure out what lpls have resources that 2468 * intersect this one, and add them. 2469 */ 2470 for (j = 0; j <= lgrp_alloc_max; j++) { 2471 lgrp_t *lgrp_cand; /* candidate lgrp */ 2472 lpl_t *lpl_cand; /* candidate lpl */ 2473 2474 lgrp_cand = lgrp_table[j]; 2475 if (!LGRP_EXISTS(lgrp_cand) || 2476 !klgrpset_ismember(rset_intersect, 2477 lgrp_cand->lgrp_id)) 2478 continue; 2479 lpl_cand = 2480 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2481 lpl_rset_add(lpl_cur, lpl_cand); 2482 } 2483 } 2484 /* 2485 * This lpl's rset has changed. Update the hint in it's 2486 * children. 2487 */ 2488 lpl_child_update(lpl_cur, cpupart); 2489 } 2490 } 2491 2492 /* 2493 * remove a lpl from the hierarchy of resources, clearing its state when 2494 * finished. If the lpls at the intermediate levels of the hierarchy have no 2495 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2496 * delete them as well. 
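 *
 * Worked example (hypothetical ids): when leaf 3 is removed from an
 * intermediate whose rset was { 1, 3 }, the rset shrinks to { 1 } and only
 * the children's hints are refreshed via lpl_child_update(); had the rset
 * held { 3 } alone, or had the intermediate's lgroup no longer named any
 * leaf in cp_lgrpset, the intermediate itself would be lpl_clear()ed.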
2497 */ 2498 2499 void 2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2501 { 2502 int i; 2503 lgrp_t *lgrp_cur; 2504 lpl_t *lpl_cur; 2505 klgrpset_t leaf_intersect; /* intersection of leaves */ 2506 2507 for (i = 0; i <= lgrp_alloc_max; i++) { 2508 lgrp_cur = lgrp_table[i]; 2509 2510 /* 2511 * Don't attempt to remove from lgrps that aren't there, that 2512 * don't contain our leaf, or from the leaf itself. (We do that 2513 * later) 2514 */ 2515 2516 if (!LGRP_EXISTS(lgrp_cur)) 2517 continue; 2518 2519 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2520 2521 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2522 lpl_leaf->lpl_lgrpid) || 2523 (lpl_cur == lpl_leaf)) { 2524 continue; 2525 } 2526 2527 /* 2528 * This is a slightly sleazy simplification in that we have 2529 * already marked the cp_lgrpset as no longer containing the 2530 * leaf we've deleted. Any lpls that pass the above checks 2531 * based upon lgrp membership but not necessarily cpu-part 2532 * membership also get cleared by the checks below. Currently 2533 * this is harmless, as the lpls should be empty anyway. 2534 * 2535 * In particular, we want to preserve lpls that have additional 2536 * leaf resources, even though we don't yet have a processor 2537 * architecture that represents resources this way. 2538 */ 2539 2540 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2541 cpupart->cp_lgrpset); 2542 2543 lpl_rset_del(lpl_cur, lpl_leaf); 2544 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2545 lpl_clear(lpl_cur); 2546 } else { 2547 /* 2548 * Update this lpl's children 2549 */ 2550 lpl_child_update(lpl_cur, cpupart); 2551 } 2552 } 2553 lpl_clear(lpl_leaf); 2554 } 2555 2556 /* 2557 * add a cpu to a partition in terms of lgrp load avg bookeeping 2558 * 2559 * The lpl (cpu partition load average information) is now arranged in a 2560 * hierarchical fashion whereby resources that are closest, ie. most local, to 2561 * the cpu in question are considered to be leaves in a tree of resources. 2562 * There are two general cases for cpu additon: 2563 * 2564 * 1. A lpl structure that contains resources already in the hierarchy tree. 2565 * In this case, all of the associated lpl relationships have been defined, and 2566 * all that is necessary is that we link the new cpu into the per-lpl list of 2567 * cpus, and increment the ncpu count of all places where this cpu resource will 2568 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2569 * pushing is accomplished by this routine. 2570 * 2571 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2572 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2573 * construct the hierarchy of state necessary to name it's more distant 2574 * resources, if they should exist. The leaf structure is initialized by this 2575 * routine, as is the cpu-partition state for the lgrp membership. This routine 2576 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2577 * and builds all of the "ancestoral" state necessary to identify resources at 2578 * differing levels of locality. 
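 *
 * In outline, the two cases map onto the ncpu test below; this is a
 * condensed excerpt of the code that follows, shown only to tie each case
 * to the branch taken:
 *
 *	if (lpl_leaf->lpl_ncpu++ == 0) {
 *		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);	(case 2)
 *		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
 *		lpl_leaf_insert(lpl_leaf, cpupart);
 *	} else {
 *		lpl_cpu_adjcnt(LPL_INCREMENT, cp);		(case 1)
 *	}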
2579 */ 2580 void 2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2582 { 2583 cpupart_t *cpupart; 2584 lgrp_t *lgrp_leaf; 2585 lpl_t *lpl_leaf; 2586 2587 /* called sometimes w/ cpus paused - grab no locks */ 2588 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2589 2590 cpupart = cp->cpu_part; 2591 lgrp_leaf = lgrp_table[lgrpid]; 2592 2593 /* don't add non-existent lgrp */ 2594 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2595 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2596 cp->cpu_lpl = lpl_leaf; 2597 2598 /* only leaf lpls contain cpus */ 2599 2600 if (lpl_leaf->lpl_ncpu++ == 0) { 2601 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2602 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2603 lpl_leaf_insert(lpl_leaf, cpupart); 2604 } else { 2605 /* 2606 * the lpl should already exist in the parent, so just update 2607 * the count of available CPUs 2608 */ 2609 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2610 } 2611 2612 /* link cpu into list of cpus in lpl */ 2613 2614 if (lpl_leaf->lpl_cpus) { 2615 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2616 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2617 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2618 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2619 } else { 2620 /* 2621 * We increment ncpu immediately after we create a new leaf 2622 * lpl, so assert that ncpu == 1 for the case where we don't 2623 * have any cpu pointers yet. 2624 */ 2625 ASSERT(lpl_leaf->lpl_ncpu == 1); 2626 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2627 } 2628 2629 } 2630 2631 2632 /* 2633 * remove a cpu from a partition in terms of lgrp load avg bookeeping 2634 * 2635 * The lpl (cpu partition load average information) is now arranged in a 2636 * hierarchical fashion whereby resources that are closest, ie. most local, to 2637 * the cpu in question are considered to be leaves in a tree of resources. 2638 * There are two removal cases in question: 2639 * 2640 * 1. Removal of the resource in the leaf leaves other resources remaining in 2641 * that leaf. (Another cpu still exists at this level of locality). In this 2642 * case, the count of available cpus is decremented in all assocated lpls by 2643 * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned 2644 * from the per-cpu lpl list. 2645 * 2646 * 2. Removal of the resource results in the lpl containing no resources. (It's 2647 * empty) In this case, all of what has occurred for the first step must take 2648 * place; however, additionally we must remove the lpl structure itself, prune 2649 * out any stranded lpls that do not directly name a leaf resource, and mark the 2650 * cpu partition in question as no longer containing resources from the lgrp of 2651 * the lpl that has been delted. Cpu-partition changes are handled by this 2652 * method, but the lpl_leaf_remove function deals with the details of pruning 2653 * out the empty lpl and any of its orphaned direct ancestors. 
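 *
 * For the first case, note that lpl_cpus is a circular, doubly-linked list
 * threaded through cpu_next_lpl and cpu_prev_lpl. As an illustration
 * (hypothetical cpus a, b and c, with lpl_cpus pointing at a): deleting b
 * simply relinks a and c, while deleting a also advances lpl_cpus to a's
 * cpu_next_lpl so the list head stays valid.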
2654 */ 2655 void 2656 lgrp_part_del_cpu(cpu_t *cp) 2657 { 2658 lpl_t *lpl; 2659 lpl_t *leaf_lpl; 2660 lgrp_t *lgrp_leaf; 2661 2662 /* called sometimes w/ cpus paused - grab no locks */ 2663 2664 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2665 2666 lpl = leaf_lpl = cp->cpu_lpl; 2667 lgrp_leaf = leaf_lpl->lpl_lgrp; 2668 2669 /* don't delete a leaf that isn't there */ 2670 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2671 2672 /* no double-deletes */ 2673 ASSERT(lpl->lpl_ncpu); 2674 if (--lpl->lpl_ncpu == 0) { 2675 /* 2676 * This was the last cpu in this lgroup for this partition, 2677 * clear its bit in the partition's lgroup bitmask 2678 */ 2679 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2680 2681 /* eliminate remaning lpl link pointers in cpu, lpl */ 2682 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2683 2684 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2685 } else { 2686 2687 /* unlink cpu from lists of cpus in lpl */ 2688 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2689 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2690 if (lpl->lpl_cpus == cp) { 2691 lpl->lpl_cpus = cp->cpu_next_lpl; 2692 } 2693 2694 /* 2695 * Update the cpu count in the lpls associated with parent 2696 * lgroups. 2697 */ 2698 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2699 2700 } 2701 /* clear cpu's lpl ptr when we're all done */ 2702 cp->cpu_lpl = NULL; 2703 } 2704 2705 /* 2706 * Recompute load average for the specified partition/lgrp fragment. 2707 * 2708 * We rely on the fact that this routine is called from the clock thread 2709 * at a point before the clock thread can block (i.e. before its first 2710 * lock request). Since the clock thread can not be preempted (since it 2711 * runs at highest priority), we know that cpu partitions can not change 2712 * (since doing so would require either the repartition requester or the 2713 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2714 * without grabbing cpu_lock. 2715 */ 2716 void 2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2718 { 2719 uint_t ncpu; 2720 int64_t old, new, f; 2721 2722 /* 2723 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2724 */ 2725 static short expval[] = { 2726 0, 3196, 1618, 1083, 2727 814, 652, 543, 466, 2728 408, 363, 326, 297, 2729 272, 251, 233, 218, 2730 204, 192, 181, 172, 2731 163, 155, 148, 142, 2732 136, 130, 125, 121, 2733 116, 112, 109, 105 2734 }; 2735 2736 /* ASSERT (called from clock level) */ 2737 2738 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2739 ((ncpu = lpl->lpl_ncpu) == 0)) { 2740 return; 2741 } 2742 2743 for (;;) { 2744 2745 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2746 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2747 else 2748 f = expval[ncpu]; 2749 2750 /* 2751 * Modify the load average atomically to avoid losing 2752 * anticipatory load updates (see lgrp_move_thread()). 2753 */ 2754 if (ageflag) { 2755 /* 2756 * We're supposed to both update and age the load. 2757 * This happens 10 times/sec. per cpu. We do a 2758 * little hoop-jumping to avoid integer overflow. 
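 *
 * For reference, a sketch of what the fixed-point arithmetic below works
 * out to, assuming expval[n] approximates (1 - exp(-1/(20*n))) scaled by
 * 2^16 (e.g. expval[1] == 3196, roughly 0.0488 scaled by 65536):
 *
 *	new ~= old times exp(-1/(20*ncpu)) +
 *	    (nrcpus << 9) times (1 - exp(-1/(20*ncpu)))
 *
 * i.e. an exponential decay of the old load toward a level proportional
 * to nrcpus. The q/r split of the old value merely keeps the intermediate
 * products from overflowing.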
2759 */ 2760 int64_t q, r; 2761 2762 do { 2763 old = new = lpl->lpl_loadavg; 2764 q = (old >> 16) << 7; 2765 r = (old & 0xffff) << 7; 2766 new += ((long long)(nrcpus - q) * f - 2767 ((r * f) >> 16)) >> 7; 2768 2769 /* 2770 * Check for overflow 2771 */ 2772 if (new > LGRP_LOADAVG_MAX) 2773 new = LGRP_LOADAVG_MAX; 2774 else if (new < 0) 2775 new = 0; 2776 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2777 new) != old); 2778 } else { 2779 /* 2780 * We're supposed to update the load, but not age it. 2781 * This option is used to update the load (which either 2782 * has already been aged in this 1/10 sec. interval or 2783 * soon will be) to account for a remotely executing 2784 * thread. 2785 */ 2786 do { 2787 old = new = lpl->lpl_loadavg; 2788 new += f; 2789 /* 2790 * Check for overflow 2791 * Underflow not possible here 2792 */ 2793 if (new < old) 2794 new = LGRP_LOADAVG_MAX; 2795 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2796 new) != old); 2797 } 2798 2799 /* 2800 * Do the same for this lpl's parent 2801 */ 2802 if ((lpl = lpl->lpl_parent) == NULL) 2803 break; 2804 ncpu = lpl->lpl_ncpu; 2805 } 2806 } 2807 2808 /* 2809 * Initialize lpl topology in the target based on topology currently present in 2810 * lpl_bootstrap. 2811 * 2812 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2813 * initialize cp_default list of lpls. Up to this point all topology operations 2814 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2815 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2816 * `target' points to the list of lpls in cp_default and `size' is the size of 2817 * this list. 2818 * 2819 * This function walks the lpl topology in lpl_bootstrap and does for things: 2820 * 2821 * 1) Copies all fields from lpl_bootstrap to the target. 2822 * 2823 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2824 * 2825 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2826 * instead of lpl_bootstrap. 2827 * 2828 * 4) Updates pointers in the resource list of the target to point to the lpls 2829 * in the target list instead of lpl_bootstrap. 2830 * 2831 * After lpl_topo_bootstrap() completes, target contains the same information 2832 * that would be present there if it were used during boot instead of 2833 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2834 * and it is bzeroed. 2835 */ 2836 void 2837 lpl_topo_bootstrap(lpl_t *target, int size) 2838 { 2839 lpl_t *lpl = lpl_bootstrap; 2840 lpl_t *target_lpl = target; 2841 int howmany; 2842 int id; 2843 int i; 2844 2845 /* 2846 * The only target that should be passed here is cp_default lpl list. 2847 */ 2848 ASSERT(target == cp_default.cp_lgrploads); 2849 ASSERT(size == cp_default.cp_nlgrploads); 2850 ASSERT(!lgrp_topo_initialized); 2851 ASSERT(ncpus == 1); 2852 2853 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2854 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2855 /* 2856 * Copy all fields from lpl. 2857 */ 2858 2859 *target_lpl = *lpl; 2860 2861 /* 2862 * Substitute CPU0 lpl pointer with one relative to target. 2863 */ 2864 if (lpl->lpl_cpus == CPU) { 2865 ASSERT(CPU->cpu_lpl == lpl); 2866 CPU->cpu_lpl = target_lpl; 2867 } 2868 2869 /* 2870 * Substitute parent information with parent relative to target. 
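 * In effect the pointer is rebased from one array to the other, i.e.
 * new_parent = target + (old_parent - lpl_bootstrap), so the parent keeps
 * the same index it had in lpl_bootstrap.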
2871 */ 2872 if (lpl->lpl_parent != NULL) 2873 target_lpl->lpl_parent = (lpl_t *) 2874 (((uintptr_t)lpl->lpl_parent - 2875 (uintptr_t)lpl_bootstrap) + 2876 (uintptr_t)target); 2877 2878 /* 2879 * Walk over resource set substituting pointers relative to 2880 * lpl_bootstrap to pointers relative to target. 2881 */ 2882 ASSERT(lpl->lpl_nrset <= 1); 2883 2884 for (id = 0; id < lpl->lpl_nrset; id++) { 2885 if (lpl->lpl_rset[id] != NULL) { 2886 target_lpl->lpl_rset[id] = 2887 (lpl_t *) 2888 (((uintptr_t)lpl->lpl_rset[id] - 2889 (uintptr_t)lpl_bootstrap) + 2890 (uintptr_t)target); 2891 } 2892 } 2893 } 2894 2895 /* 2896 * Topology information in lpl_bootstrap is no longer needed. 2897 */ 2898 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2899 } 2900 2901 /* 2902 * If the lowest load among the lgroups a process' threads are currently 2903 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2904 * expanding the process to a new lgroup. 2905 */ 2906 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2907 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2908 2909 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2910 ((lgrp_expand_proc_thresh) / (ncpu)) 2911 2912 /* 2913 * A process will be expanded to a new lgroup only if the difference between 2914 * the lowest load on the lgroups the process' thread's are currently spread 2915 * across and the lowest load on the other lgroups in the process' partition 2916 * is greater than lgrp_expand_proc_diff. 2917 */ 2918 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2919 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2920 2921 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2922 ((lgrp_expand_proc_diff) / (ncpu)) 2923 2924 /* 2925 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2926 * be present due to impreciseness of the load average decay algorithm. 2927 * 2928 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2929 * tolerance is scaled by the number of cpus in the lgroup just like 2930 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2931 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2932 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2933 */ 2934 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2935 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2936 ((lgrp_loadavg_tolerance) / ncpu) 2937 2938 /* 2939 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2940 * average is above this threshold 2941 */ 2942 uint32_t lgrp_load_thresh = UINT32_MAX; 2943 2944 /* 2945 * lgrp_choose() will try to skip any lgroups with less memory 2946 * than this free when choosing a home lgroup 2947 */ 2948 pgcnt_t lgrp_mem_free_thresh = 0; 2949 2950 /* 2951 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2952 * one based on one of the following policies: 2953 * - Random selection 2954 * - Pseudo round robin placement 2955 * - Longest time since a thread was last placed 2956 */ 2957 #define LGRP_CHOOSE_RANDOM 1 2958 #define LGRP_CHOOSE_RR 2 2959 #define LGRP_CHOOSE_TIME 3 2960 2961 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2962 2963 /* 2964 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 2965 * be bound to a CPU or processor set. 2966 * 2967 * Arguments: 2968 * t The thread 2969 * cpupart The partition the thread belongs to. 
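 *
 * Return value:
 *	A pointer to an lpl in cpupart's cp_lgrploads[] array on which to
 *	home the thread; never NULL. Note that the root lpl (not a leaf)
 *	is returned when lgrp_load_thresh has been tuned below its default
 *	and the lowest candidate loads are at or above it.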
2970 * 2971 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 2972 * disabled, or thread_lock held (at splhigh) to protect against the CPU 2973 * partitions changing out from under us and assumes that given thread is 2974 * protected. Also, called sometimes w/ cpus paused or kernel preemption 2975 * disabled, so don't grab any locks because we should never block under 2976 * those conditions. 2977 */ 2978 lpl_t * 2979 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 2980 { 2981 lgrp_load_t bestload, bestrload; 2982 int lgrpid_offset, lgrp_count; 2983 lgrp_id_t lgrpid, lgrpid_start; 2984 lpl_t *lpl, *bestlpl, *bestrlpl; 2985 klgrpset_t lgrpset; 2986 proc_t *p; 2987 2988 ASSERT(t != NULL); 2989 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2990 THREAD_LOCK_HELD(t)); 2991 ASSERT(cpupart != NULL); 2992 2993 p = t->t_procp; 2994 2995 /* A process should always be in an active partition */ 2996 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 2997 2998 bestlpl = bestrlpl = NULL; 2999 bestload = bestrload = LGRP_LOADAVG_MAX; 3000 lgrpset = cpupart->cp_lgrpset; 3001 3002 switch (lgrp_choose_policy) { 3003 case LGRP_CHOOSE_RR: 3004 lgrpid = cpupart->cp_lgrp_hint; 3005 do { 3006 if (++lgrpid > lgrp_alloc_max) 3007 lgrpid = 0; 3008 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3009 3010 break; 3011 default: 3012 case LGRP_CHOOSE_TIME: 3013 case LGRP_CHOOSE_RANDOM: 3014 klgrpset_nlgrps(lgrpset, lgrp_count); 3015 lgrpid_offset = 3016 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3017 for (lgrpid = 0; ; lgrpid++) { 3018 if (klgrpset_ismember(lgrpset, lgrpid)) { 3019 if (--lgrpid_offset == 0) 3020 break; 3021 } 3022 } 3023 break; 3024 } 3025 3026 lgrpid_start = lgrpid; 3027 3028 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3029 lgrp_id_t, cpupart->cp_lgrp_hint); 3030 3031 /* 3032 * Use lgroup affinities (if any) to choose best lgroup 3033 * 3034 * NOTE: Assumes that thread is protected from going away and its 3035 * lgroup affinities won't change (ie. p_lock, or 3036 * thread_lock() being held and/or CPUs paused) 3037 */ 3038 if (t->t_lgrp_affinity) { 3039 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 3040 if (lpl != NULL) 3041 return (lpl); 3042 } 3043 3044 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3045 3046 do { 3047 pgcnt_t npgs; 3048 3049 /* 3050 * Skip any lgroups outside of thread's pset 3051 */ 3052 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3053 if (++lgrpid > lgrp_alloc_max) 3054 lgrpid = 0; /* wrap the search */ 3055 continue; 3056 } 3057 3058 /* 3059 * Skip any non-leaf lgroups 3060 */ 3061 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3062 continue; 3063 3064 /* 3065 * Skip any lgroups without enough free memory 3066 * (when threshold set to nonzero positive value) 3067 */ 3068 if (lgrp_mem_free_thresh > 0) { 3069 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3070 if (npgs < lgrp_mem_free_thresh) { 3071 if (++lgrpid > lgrp_alloc_max) 3072 lgrpid = 0; /* wrap the search */ 3073 continue; 3074 } 3075 } 3076 3077 lpl = &cpupart->cp_lgrploads[lgrpid]; 3078 if (klgrpset_isempty(p->p_lgrpset) || 3079 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3080 /* 3081 * Either this is a new process or the process already 3082 * has threads on this lgrp, so this is a preferred 3083 * lgroup for the thread. 
3084 */ 3085 if (bestlpl == NULL || 3086 lpl_pick(lpl, bestlpl)) { 3087 bestload = lpl->lpl_loadavg; 3088 bestlpl = lpl; 3089 } 3090 } else { 3091 /* 3092 * The process doesn't have any threads on this lgrp, 3093 * but we're willing to consider this lgrp if the load 3094 * difference is big enough to justify splitting up 3095 * the process' threads. 3096 */ 3097 if (bestrlpl == NULL || 3098 lpl_pick(lpl, bestrlpl)) { 3099 bestrload = lpl->lpl_loadavg; 3100 bestrlpl = lpl; 3101 } 3102 } 3103 if (++lgrpid > lgrp_alloc_max) 3104 lgrpid = 0; /* wrap the search */ 3105 } while (lgrpid != lgrpid_start); 3106 3107 /* 3108 * Return root lgroup if threshold isn't set to maximum value and 3109 * lowest lgroup load average more than a certain threshold 3110 */ 3111 if (lgrp_load_thresh != UINT32_MAX && 3112 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3113 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3114 3115 /* 3116 * If all the lgroups over which the thread's process is spread are 3117 * heavily loaded, or otherwise undesirable, we'll consider placing 3118 * the thread on one of the other leaf lgroups in the thread's 3119 * partition. 3120 */ 3121 if ((bestlpl == NULL) || 3122 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3123 (bestrload < bestload) && /* paranoid about wraparound */ 3124 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3125 bestload))) { 3126 bestlpl = bestrlpl; 3127 } 3128 3129 if (bestlpl == NULL) { 3130 /* 3131 * No lgroup looked particularly good, but we still 3132 * have to pick something. Go with the randomly selected 3133 * legal lgroup we started with above. 3134 */ 3135 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3136 } 3137 3138 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3139 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3140 3141 ASSERT(bestlpl->lpl_ncpu > 0); 3142 return (bestlpl); 3143 } 3144 3145 /* 3146 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3147 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3148 */ 3149 static int 3150 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3151 { 3152 lgrp_load_t l1, l2; 3153 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3154 3155 l1 = lpl1->lpl_loadavg; 3156 l2 = lpl2->lpl_loadavg; 3157 3158 if ((l1 + tolerance < l2) && (l1 < l2)) { 3159 /* lpl1 is significantly less loaded than lpl2 */ 3160 return (1); 3161 } 3162 3163 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3164 l1 + tolerance >= l2 && l1 < l2 && 3165 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3166 /* 3167 * lpl1's load is within the tolerance of lpl2. We're 3168 * willing to consider it be to better however if 3169 * it has been longer since we last homed a thread there 3170 */ 3171 return (1); 3172 } 3173 3174 return (0); 3175 } 3176 3177 /* 3178 * An LWP is expected to be assigned to an lgroup for at least this long 3179 * for its anticipatory load to be justified. NOTE that this value should 3180 * not be set extremely huge (say, larger than 100 years), to avoid problems 3181 * with overflow in the calculation that uses it. 3182 */ 3183 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3184 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3185 3186 /* 3187 * Routine to change a thread's lgroup affiliation. This routine updates 3188 * the thread's kthread_t struct and its process' proc_t struct to note the 3189 * thread's new lgroup affiliation, and its lgroup affinities. 
3190 * 3191 * Note that this is the only routine that modifies a thread's t_lpl field, 3192 * and that adds in or removes anticipatory load. 3193 * 3194 * If the thread is exiting, newlpl is NULL. 3195 * 3196 * Locking: 3197 * The following lock must be held on entry: 3198 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3199 * doesn't get removed from t's partition 3200 * 3201 * This routine is not allowed to grab any locks, since it may be called 3202 * with cpus paused (such as from cpu_offline). 3203 */ 3204 void 3205 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3206 { 3207 proc_t *p; 3208 lpl_t *lpl, *oldlpl; 3209 lgrp_id_t oldid; 3210 kthread_t *tp; 3211 uint_t ncpu; 3212 lgrp_load_t old, new; 3213 3214 ASSERT(t); 3215 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3216 THREAD_LOCK_HELD(t)); 3217 3218 /* 3219 * If not changing lpls, just return 3220 */ 3221 if ((oldlpl = t->t_lpl) == newlpl) 3222 return; 3223 3224 /* 3225 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3226 * associated with process 0 rather than with its original process). 3227 */ 3228 if (t->t_proc_flag & TP_LWPEXIT) { 3229 if (newlpl != NULL) { 3230 t->t_lpl = newlpl; 3231 } 3232 return; 3233 } 3234 3235 p = ttoproc(t); 3236 3237 /* 3238 * If the thread had a previous lgroup, update its process' p_lgrpset 3239 * to account for it being moved from its old lgroup. 3240 */ 3241 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3242 (p->p_tlist != NULL)) { 3243 oldid = oldlpl->lpl_lgrpid; 3244 3245 if (newlpl != NULL) 3246 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3247 3248 if ((do_lgrpset_delete) && 3249 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3250 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3251 /* 3252 * Check if a thread other than the thread 3253 * that's moving is assigned to the same 3254 * lgroup as the thread that's moving. Note 3255 * that we have to compare lgroup IDs, rather 3256 * than simply comparing t_lpl's, since the 3257 * threads may belong to different partitions 3258 * but be assigned to the same lgroup. 3259 */ 3260 ASSERT(tp->t_lpl != NULL); 3261 3262 if ((tp != t) && 3263 (tp->t_lpl->lpl_lgrpid == oldid)) { 3264 /* 3265 * Another thread is assigned to the 3266 * same lgroup as the thread that's 3267 * moving, p_lgrpset doesn't change. 3268 */ 3269 break; 3270 } else if (tp == p->p_tlist) { 3271 /* 3272 * No other thread is assigned to the 3273 * same lgroup as the exiting thread, 3274 * clear the lgroup's bit in p_lgrpset. 3275 */ 3276 klgrpset_del(p->p_lgrpset, oldid); 3277 break; 3278 } 3279 } 3280 } 3281 3282 /* 3283 * If this thread was assigned to its old lgroup for such a 3284 * short amount of time that the anticipatory load that was 3285 * added on its behalf has aged very little, remove that 3286 * anticipatory load. 3287 */ 3288 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3289 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3290 lpl = oldlpl; 3291 for (;;) { 3292 do { 3293 old = new = lpl->lpl_loadavg; 3294 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3295 if (new > old) { 3296 /* 3297 * this can happen if the load 3298 * average was aged since we 3299 * added in the anticipatory 3300 * load 3301 */ 3302 new = 0; 3303 } 3304 } while (cas32( 3305 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3306 new) != old); 3307 3308 lpl = lpl->lpl_parent; 3309 if (lpl == NULL) 3310 break; 3311 3312 ncpu = lpl->lpl_ncpu; 3313 ASSERT(ncpu > 0); 3314 } 3315 } 3316 } 3317 /* 3318 * If the thread has a new lgroup (i.e. 
it's not exiting), update its 3319 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3320 * to its new lgroup to account for its move to its new lgroup. 3321 */ 3322 if (newlpl != NULL) { 3323 /* 3324 * This thread is moving to a new lgroup 3325 */ 3326 t->t_lpl = newlpl; 3327 3328 /* 3329 * Reflect move in load average of new lgroup 3330 * unless it is root lgroup 3331 */ 3332 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3333 return; 3334 3335 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3336 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3337 } 3338 3339 /* 3340 * It'll take some time for the load on the new lgroup 3341 * to reflect this thread's placement on it. We'd 3342 * like not, however, to have all threads between now 3343 * and then also piling on to this lgroup. To avoid 3344 * this pileup, we anticipate the load this thread 3345 * will generate on its new lgroup. The goal is to 3346 * make the lgroup's load appear as though the thread 3347 * had been there all along. We're very conservative 3348 * in calculating this anticipatory load, we assume 3349 * the worst case case (100% CPU-bound thread). This 3350 * may be modified in the future to be more accurate. 3351 */ 3352 lpl = newlpl; 3353 for (;;) { 3354 ncpu = lpl->lpl_ncpu; 3355 ASSERT(ncpu > 0); 3356 do { 3357 old = new = lpl->lpl_loadavg; 3358 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3359 /* 3360 * Check for overflow 3361 * Underflow not possible here 3362 */ 3363 if (new < old) 3364 new = UINT32_MAX; 3365 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3366 new) != old); 3367 3368 lpl = lpl->lpl_parent; 3369 if (lpl == NULL) 3370 break; 3371 } 3372 t->t_anttime = gethrtime(); 3373 } 3374 } 3375 3376 /* 3377 * Return lgroup memory allocation policy given advice from madvise(3C) 3378 */ 3379 lgrp_mem_policy_t 3380 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3381 { 3382 switch (advice) { 3383 case MADV_ACCESS_LWP: 3384 return (LGRP_MEM_POLICY_NEXT); 3385 case MADV_ACCESS_MANY: 3386 return (LGRP_MEM_POLICY_RANDOM); 3387 default: 3388 return (lgrp_mem_policy_default(size, type)); 3389 } 3390 } 3391 3392 /* 3393 * Figure out default policy 3394 */ 3395 lgrp_mem_policy_t 3396 lgrp_mem_policy_default(size_t size, int type) 3397 { 3398 cpupart_t *cp; 3399 lgrp_mem_policy_t policy; 3400 size_t pset_mem_size; 3401 3402 /* 3403 * Randomly allocate memory across lgroups for shared memory 3404 * beyond a certain threshold 3405 */ 3406 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3407 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3408 /* 3409 * Get total memory size of current thread's pset 3410 */ 3411 kpreempt_disable(); 3412 cp = curthread->t_cpupart; 3413 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3414 kpreempt_enable(); 3415 3416 /* 3417 * Choose policy to randomly allocate memory across 3418 * lgroups in pset if it will fit and is not default 3419 * partition. Otherwise, allocate memory randomly 3420 * across machine. 3421 */ 3422 if (lgrp_mem_pset_aware && size < pset_mem_size) 3423 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3424 else 3425 policy = LGRP_MEM_POLICY_RANDOM; 3426 } else 3427 /* 3428 * Apply default policy for private memory and 3429 * shared memory under the respective random 3430 * threshold. 
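 *
 * Putting the two branches together: a MAP_SHARED object larger than
 * lgrp_shm_random_thresh is spread randomly across the pset (when
 * lgrp_mem_pset_aware is set and the object fits in the pset's memory)
 * or across the whole machine, while anything under the threshold simply
 * gets lgrp_mem_default_policy.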
3431 */ 3432 policy = lgrp_mem_default_policy; 3433 3434 return (policy); 3435 } 3436 3437 /* 3438 * Get memory allocation policy for this segment 3439 */ 3440 lgrp_mem_policy_info_t * 3441 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3442 { 3443 lgrp_mem_policy_info_t *policy_info; 3444 extern struct seg_ops segspt_ops; 3445 extern struct seg_ops segspt_shmops; 3446 3447 /* 3448 * This is for binary compatibility to protect against third party 3449 * segment drivers which haven't recompiled to allow for 3450 * SEGOP_GETPOLICY() 3451 */ 3452 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3453 seg->s_ops != &segspt_shmops) 3454 return (NULL); 3455 3456 policy_info = NULL; 3457 if (seg->s_ops->getpolicy != NULL) 3458 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3459 3460 return (policy_info); 3461 } 3462 3463 /* 3464 * Set policy for allocating private memory given desired policy, policy info, 3465 * size in bytes of memory that policy is being applied. 3466 * Return 0 if policy wasn't set already and 1 if policy was set already 3467 */ 3468 int 3469 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3470 lgrp_mem_policy_info_t *policy_info, size_t size) 3471 { 3472 3473 ASSERT(policy_info != NULL); 3474 3475 if (policy == LGRP_MEM_POLICY_DEFAULT) 3476 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3477 3478 /* 3479 * Policy set already? 3480 */ 3481 if (policy == policy_info->mem_policy) 3482 return (1); 3483 3484 /* 3485 * Set policy 3486 */ 3487 policy_info->mem_policy = policy; 3488 policy_info->mem_reserved = 0; 3489 3490 return (0); 3491 } 3492 3493 3494 /* 3495 * Get shared memory allocation policy with given tree and offset 3496 */ 3497 lgrp_mem_policy_info_t * 3498 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3499 u_offset_t vn_off) 3500 { 3501 u_offset_t off; 3502 lgrp_mem_policy_info_t *policy_info; 3503 lgrp_shm_policy_seg_t *policy_seg; 3504 lgrp_shm_locality_t *shm_locality; 3505 avl_tree_t *tree; 3506 avl_index_t where; 3507 3508 /* 3509 * Get policy segment tree from anon_map or vnode and use specified 3510 * anon index or vnode offset as offset 3511 * 3512 * Assume that no lock needs to be held on anon_map or vnode, since 3513 * they should be protected by their reference count which must be 3514 * nonzero for an existing segment 3515 */ 3516 if (amp) { 3517 ASSERT(amp->refcnt != 0); 3518 shm_locality = amp->locality; 3519 if (shm_locality == NULL) 3520 return (NULL); 3521 tree = shm_locality->loc_tree; 3522 off = ptob(anon_index); 3523 } else if (vp) { 3524 shm_locality = vp->v_locality; 3525 if (shm_locality == NULL) 3526 return (NULL); 3527 ASSERT(shm_locality->loc_count != 0); 3528 tree = shm_locality->loc_tree; 3529 off = vn_off; 3530 } 3531 3532 if (tree == NULL) 3533 return (NULL); 3534 3535 /* 3536 * Lookup policy segment for offset into shared object and return 3537 * policy info 3538 */ 3539 rw_enter(&shm_locality->loc_lock, RW_READER); 3540 policy_info = NULL; 3541 policy_seg = avl_find(tree, &off, &where); 3542 if (policy_seg) 3543 policy_info = &policy_seg->shm_policy; 3544 rw_exit(&shm_locality->loc_lock); 3545 3546 return (policy_info); 3547 } 3548 3549 /* 3550 * Default memory allocation policy for kernel segmap pages 3551 */ 3552 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3553 3554 /* 3555 * Return lgroup to use for allocating memory 3556 * given the segment and address 3557 * 3558 * There isn't any mutual exclusion that exists between calls 3559 * to this routine and DR, so this 
routine and whomever calls it 3560 * should be mindful of the possibility that the lgrp returned 3561 * may be deleted. If this happens, dereferences of the lgrp 3562 * pointer will still be safe, but the resources in the lgrp will 3563 * be gone, and LGRP_EXISTS() will no longer be true. 3564 */ 3565 lgrp_t * 3566 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3567 { 3568 int i; 3569 lgrp_t *lgrp; 3570 klgrpset_t lgrpset; 3571 int lgrps_spanned; 3572 unsigned long off; 3573 lgrp_mem_policy_t policy; 3574 lgrp_mem_policy_info_t *policy_info; 3575 ushort_t random; 3576 int stat = 0; 3577 extern struct seg *segkmap; 3578 3579 /* 3580 * Just return null if the lgrp framework hasn't finished 3581 * initializing or if this is a UMA machine. 3582 */ 3583 if (nlgrps == 1 || !lgrp_initialized) 3584 return (lgrp_root); 3585 3586 /* 3587 * Get memory allocation policy for this segment 3588 */ 3589 policy = lgrp_mem_default_policy; 3590 if (seg != NULL) { 3591 if (seg->s_as == &kas) { 3592 if (seg == segkmap) 3593 policy = lgrp_segmap_default_policy; 3594 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3595 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3596 policy = LGRP_MEM_POLICY_RANDOM; 3597 } else { 3598 policy_info = lgrp_mem_policy_get(seg, vaddr); 3599 if (policy_info != NULL) 3600 policy = policy_info->mem_policy; 3601 } 3602 } 3603 lgrpset = 0; 3604 3605 /* 3606 * Initialize lgroup to home by default 3607 */ 3608 lgrp = lgrp_home_lgrp(); 3609 3610 /* 3611 * When homing threads on root lgrp, override default memory 3612 * allocation policies with root lgroup memory allocation policy 3613 */ 3614 if (lgrp == lgrp_root) 3615 policy = lgrp_mem_policy_root; 3616 3617 /* 3618 * Implement policy 3619 */ 3620 switch (policy) { 3621 case LGRP_MEM_POLICY_NEXT_CPU: 3622 3623 /* 3624 * Return lgroup of current CPU which faulted on memory 3625 * If the CPU isn't currently in an lgrp, then opt to 3626 * allocate from the root. 3627 * 3628 * Kernel preemption needs to be disabled here to prevent 3629 * the current CPU from going away before lgrp is found. 3630 */ 3631 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3632 lgrp = lgrp_root; 3633 } else { 3634 kpreempt_disable(); 3635 lgrp = lgrp_cpu_to_lgrp(CPU); 3636 kpreempt_enable(); 3637 } 3638 break; 3639 3640 case LGRP_MEM_POLICY_NEXT: 3641 case LGRP_MEM_POLICY_DEFAULT: 3642 default: 3643 3644 /* 3645 * Just return current thread's home lgroup 3646 * for default policy (next touch) 3647 * If the thread is homed to the root, 3648 * then the default policy is random across lgroups. 3649 * Fallthrough to the random case. 3650 */ 3651 if (lgrp != lgrp_root) { 3652 if (policy == LGRP_MEM_POLICY_NEXT) 3653 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3654 else 3655 lgrp_stat_add(lgrp->lgrp_id, 3656 LGRP_NUM_DEFAULT, 1); 3657 break; 3658 } 3659 /* LINTED fallthrough on case statement */ 3660 case LGRP_MEM_POLICY_RANDOM: 3661 3662 /* 3663 * Return a random leaf lgroup with memory 3664 */ 3665 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3666 /* 3667 * Count how many lgroups are spanned 3668 */ 3669 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3670 3671 /* 3672 * There may be no memnodes in the root lgroup during DR copy 3673 * rename on a system with only two boards (memnodes) 3674 * configured. In this case just return the root lgrp. 
3675 */ 3676 if (lgrps_spanned == 0) { 3677 lgrp = lgrp_root; 3678 break; 3679 } 3680 3681 /* 3682 * Pick a random offset within lgroups spanned 3683 * and return lgroup at that offset 3684 */ 3685 random = (ushort_t)gethrtime() >> 4; 3686 off = random % lgrps_spanned; 3687 ASSERT(off <= lgrp_alloc_max); 3688 3689 for (i = 0; i <= lgrp_alloc_max; i++) { 3690 if (!klgrpset_ismember(lgrpset, i)) 3691 continue; 3692 if (off) 3693 off--; 3694 else { 3695 lgrp = lgrp_table[i]; 3696 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3697 1); 3698 break; 3699 } 3700 } 3701 break; 3702 3703 case LGRP_MEM_POLICY_RANDOM_PROC: 3704 3705 /* 3706 * Grab copy of bitmask of lgroups spanned by 3707 * this process 3708 */ 3709 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3710 stat = LGRP_NUM_RANDOM_PROC; 3711 3712 /* LINTED fallthrough on case statement */ 3713 case LGRP_MEM_POLICY_RANDOM_PSET: 3714 3715 if (!stat) 3716 stat = LGRP_NUM_RANDOM_PSET; 3717 3718 if (klgrpset_isempty(lgrpset)) { 3719 /* 3720 * Grab copy of bitmask of lgroups spanned by 3721 * this processor set 3722 */ 3723 kpreempt_disable(); 3724 klgrpset_copy(lgrpset, 3725 curthread->t_cpupart->cp_lgrpset); 3726 kpreempt_enable(); 3727 } 3728 3729 /* 3730 * Count how many lgroups are spanned 3731 */ 3732 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3733 ASSERT(lgrps_spanned <= nlgrps); 3734 3735 /* 3736 * Probably lgrps_spanned should be always non-zero, but to be 3737 * on the safe side we return lgrp_root if it is empty. 3738 */ 3739 if (lgrps_spanned == 0) { 3740 lgrp = lgrp_root; 3741 break; 3742 } 3743 3744 /* 3745 * Pick a random offset within lgroups spanned 3746 * and return lgroup at that offset 3747 */ 3748 random = (ushort_t)gethrtime() >> 4; 3749 off = random % lgrps_spanned; 3750 ASSERT(off <= lgrp_alloc_max); 3751 3752 for (i = 0; i <= lgrp_alloc_max; i++) { 3753 if (!klgrpset_ismember(lgrpset, i)) 3754 continue; 3755 if (off) 3756 off--; 3757 else { 3758 lgrp = lgrp_table[i]; 3759 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3760 1); 3761 break; 3762 } 3763 } 3764 break; 3765 3766 case LGRP_MEM_POLICY_ROUNDROBIN: 3767 3768 /* 3769 * Use offset within segment to determine 3770 * offset from home lgroup to choose for 3771 * next lgroup to allocate memory from 3772 */ 3773 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3774 (lgrp_alloc_max + 1); 3775 3776 kpreempt_disable(); 3777 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3778 i = lgrp->lgrp_id; 3779 kpreempt_enable(); 3780 3781 while (off > 0) { 3782 i = (i + 1) % (lgrp_alloc_max + 1); 3783 lgrp = lgrp_table[i]; 3784 if (klgrpset_ismember(lgrpset, i)) 3785 off--; 3786 } 3787 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3788 3789 break; 3790 } 3791 3792 ASSERT(lgrp != NULL); 3793 return (lgrp); 3794 } 3795 3796 /* 3797 * Return the number of pages in an lgroup 3798 * 3799 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3800 * could cause tests that rely on the numat driver to fail.... 
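 *
 * Typical in-kernel use mirrors lgrp_choose() above, which skips
 * memory-poor lgroups when lgrp_mem_free_thresh is set (condensed from
 * that routine):
 *
 *	if (lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE) < lgrp_mem_free_thresh)
 *		continue;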
3801 */ 3802 pgcnt_t 3803 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3804 { 3805 lgrp_t *lgrp; 3806 3807 lgrp = lgrp_table[lgrpid]; 3808 if (!LGRP_EXISTS(lgrp) || 3809 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3810 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3811 return (0); 3812 3813 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3814 } 3815 3816 /* 3817 * Initialize lgroup shared memory allocation policy support 3818 */ 3819 void 3820 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3821 { 3822 lgrp_shm_locality_t *shm_locality; 3823 3824 /* 3825 * Initialize locality field in anon_map 3826 * Don't need any locks because this is called when anon_map is 3827 * allocated, but not used anywhere yet. 3828 */ 3829 if (amp) { 3830 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3831 if (amp->locality == NULL) { 3832 /* 3833 * Allocate and initialize shared memory locality info 3834 * and set anon_map locality pointer to it 3835 * Drop lock across kmem_alloc(KM_SLEEP) 3836 */ 3837 ANON_LOCK_EXIT(&->a_rwlock); 3838 shm_locality = kmem_alloc(sizeof (*shm_locality), 3839 KM_SLEEP); 3840 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3841 NULL); 3842 shm_locality->loc_count = 1; /* not used for amp */ 3843 shm_locality->loc_tree = NULL; 3844 3845 /* 3846 * Reacquire lock and check to see whether anyone beat 3847 * us to initializing the locality info 3848 */ 3849 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3850 if (amp->locality != NULL) { 3851 rw_destroy(&shm_locality->loc_lock); 3852 kmem_free(shm_locality, 3853 sizeof (*shm_locality)); 3854 } else 3855 amp->locality = shm_locality; 3856 } 3857 ANON_LOCK_EXIT(&->a_rwlock); 3858 return; 3859 } 3860 3861 /* 3862 * Allocate shared vnode policy info if vnode is not locality aware yet 3863 */ 3864 mutex_enter(&vp->v_lock); 3865 if ((vp->v_flag & V_LOCALITY) == 0) { 3866 /* 3867 * Allocate and initialize shared memory locality info 3868 */ 3869 mutex_exit(&vp->v_lock); 3870 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3871 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3872 shm_locality->loc_count = 1; 3873 shm_locality->loc_tree = NULL; 3874 3875 /* 3876 * Point vnode locality field at shared vnode policy info 3877 * and set locality aware flag in vnode 3878 */ 3879 mutex_enter(&vp->v_lock); 3880 if ((vp->v_flag & V_LOCALITY) == 0) { 3881 vp->v_locality = shm_locality; 3882 vp->v_flag |= V_LOCALITY; 3883 } else { 3884 /* 3885 * Lost race so free locality info and increment count. 
3886 */ 3887 rw_destroy(&shm_locality->loc_lock); 3888 kmem_free(shm_locality, sizeof (*shm_locality)); 3889 shm_locality = vp->v_locality; 3890 shm_locality->loc_count++; 3891 } 3892 mutex_exit(&vp->v_lock); 3893 3894 return; 3895 } 3896 3897 /* 3898 * Increment reference count of number of segments mapping this vnode 3899 * shared 3900 */ 3901 shm_locality = vp->v_locality; 3902 shm_locality->loc_count++; 3903 mutex_exit(&vp->v_lock); 3904 } 3905 3906 /* 3907 * Destroy the given shared memory policy segment tree 3908 */ 3909 void 3910 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3911 { 3912 lgrp_shm_policy_seg_t *cur; 3913 lgrp_shm_policy_seg_t *next; 3914 3915 if (tree == NULL) 3916 return; 3917 3918 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3919 while (cur != NULL) { 3920 next = AVL_NEXT(tree, cur); 3921 avl_remove(tree, cur); 3922 kmem_free(cur, sizeof (*cur)); 3923 cur = next; 3924 } 3925 kmem_free(tree, sizeof (avl_tree_t)); 3926 } 3927 3928 /* 3929 * Uninitialize lgroup shared memory allocation policy support 3930 */ 3931 void 3932 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3933 { 3934 lgrp_shm_locality_t *shm_locality; 3935 3936 /* 3937 * For anon_map, deallocate shared memory policy tree and 3938 * zero locality field 3939 * Don't need any locks because anon_map is being freed 3940 */ 3941 if (amp) { 3942 if (amp->locality == NULL) 3943 return; 3944 shm_locality = amp->locality; 3945 shm_locality->loc_count = 0; /* not really used for amp */ 3946 rw_destroy(&shm_locality->loc_lock); 3947 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3948 kmem_free(shm_locality, sizeof (*shm_locality)); 3949 amp->locality = 0; 3950 return; 3951 } 3952 3953 /* 3954 * For vnode, decrement reference count of segments mapping this vnode 3955 * shared and delete locality info if reference count drops to 0 3956 */ 3957 mutex_enter(&vp->v_lock); 3958 shm_locality = vp->v_locality; 3959 shm_locality->loc_count--; 3960 3961 if (shm_locality->loc_count == 0) { 3962 rw_destroy(&shm_locality->loc_lock); 3963 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3964 kmem_free(shm_locality, sizeof (*shm_locality)); 3965 vp->v_locality = 0; 3966 vp->v_flag &= ~V_LOCALITY; 3967 } 3968 mutex_exit(&vp->v_lock); 3969 } 3970 3971 /* 3972 * Compare two shared memory policy segments 3973 * Used by AVL tree code for searching 3974 */ 3975 int 3976 lgrp_shm_policy_compar(const void *x, const void *y) 3977 { 3978 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 3979 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 3980 3981 if (a->shm_off < b->shm_off) 3982 return (-1); 3983 if (a->shm_off >= b->shm_off + b->shm_size) 3984 return (1); 3985 return (0); 3986 } 3987 3988 /* 3989 * Concatenate seg1 with seg2 and remove seg2 3990 */ 3991 static int 3992 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 3993 lgrp_shm_policy_seg_t *seg2) 3994 { 3995 if (!seg1 || !seg2 || 3996 seg1->shm_off + seg1->shm_size != seg2->shm_off || 3997 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 3998 return (-1); 3999 4000 seg1->shm_size += seg2->shm_size; 4001 avl_remove(tree, seg2); 4002 kmem_free(seg2, sizeof (*seg2)); 4003 return (0); 4004 } 4005 4006 /* 4007 * Split segment at given offset and return rightmost (uppermost) segment 4008 * Assumes that there are no overlapping segments 4009 */ 4010 static lgrp_shm_policy_seg_t * 4011 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4012 u_offset_t off) 4013 { 4014 lgrp_shm_policy_seg_t *newseg; 4015 
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
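
/*
 * Worked example (editor's illustration, not from the original source): a
 * policy segment covering offsets [0, 64KB) split at offset 16KB leaves the
 * existing segment describing [0, 16KB) and returns a new segment for
 * [16KB, 64KB) that inherits the same policy.  If both pieces later end up
 * with the same mem_policy again, lgrp_shm_policy_concat() above merges them
 * back into one segment, since it only requires that the two segments be
 * adjacent (seg1->shm_off + seg1->shm_size == seg2->shm_off) and carry
 * identical policies.
 */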

/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating this segment with previous and
			 * next segments, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
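
/*
 * Illustrative sketch (editor's example; the caller and values are assumed,
 * not taken from this file): a consumer applying a non-default policy to a
 * page-aligned range of an anon_map-backed shared object might call
 *
 *	(void) lgrp_shm_policy_set(policy, amp, anon_index, NULL, 0, len);
 *
 * where len is a multiple of PAGESIZE (the ASSERT above requires
 * (len & PAGEOFFSET) == 0) and anon_index is the page index of the start of
 * the range.  A return value of 1 means the range already had that policy,
 * and -1 means the policy could not be set (for example, len was 0 or no
 * shared object was supplied).
 */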

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zeroed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- the search scope is local (LGRP_SRCH_LOCAL) and all the memnodes
 *	  in "lgrp" have been returned.
 *	- the search scope allows traversing up the hierarchy and all the
 *	  memnodes in the system have been returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
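
/*
 * Illustrative sketch (editor's example; the caller shown is assumed, not
 * taken from this file): walking all candidate memnodes with the cookie
 * described above.  The cookie lives on the caller's stack and starts out
 * zeroed; lmc_lgrp, lmc_nodes, lmc_cnt and lmc_scope would then be filled in
 * from the starting lgroup and the desired search scope before the first
 * call.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	bzero(&c, sizeof (c));
 *	... set c.lmc_lgrp, c.lmc_nodes, c.lmc_cnt and c.lmc_scope ...
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		... try to allocate from mnode, fall through on failure ...
 *	}
 */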