/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups, where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (e.g. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
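 *
 * For example, a consumer looking for progressively more remote resources can
 * walk upward from a leaf lgroup toward the root through the lgrp_parent
 * pointers.  A minimal sketch (illustrative only; leaf_lgrp and the
 * allocation helper are hypothetical and all locking is omitted):
 *
 *	lgrp_t	*lgrp = leaf_lgrp;
 *	while (lgrp != NULL && !try_alloc_from(lgrp))
 *		lgrp = lgrp->lgrp_parent;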
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in the lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework.  It is protected from
 * parallel modifications by lgrp_kstat_mutex.  This may cause some contention
 * when several kstat commands run in parallel, but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup.  Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads).  The list is allocated after the first CPU is
 * brought on-line when cp_default is initialized by
 * cpupart_initialize_default().  Configuring CPU0 may create a two-level
 * topology with root and one leaf node containing CPU0.  This topology is
 * initially constructed in a special statically allocated 2-element lpl list
 * lpl_bootstrap_list and later cloned to cp_default when cp_default is
 * initialized.  The lpl_bootstrap_list is used for all lpl operations until
 * cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c.  Every other
 * consumer that needs a default lpl should use lpl_bootstrap, which is a
 * pointer to the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl.
 * This is necessary because on some architectures (x86) it's possible for the
 * slave CPU startup thread to enter the dispatcher or allocate memory before
 * calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp.  lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
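 * A platform (or an /etc/system setting) might, for instance, select random
 * placement instead; lgrp_main_init() resets any out-of-range value back to
 * LGRP_MEM_POLICY_NEXT.  Illustrative override:
 *	lgrp_mem_default_policy = LGRP_MEM_POLICY_RANDOM;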
204 */ 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 206 207 208 /* 209 * lgroup CPU event handlers 210 */ 211 static void lgrp_cpu_init(struct cpu *); 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 214 215 /* 216 * lgroup memory event handlers 217 */ 218 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 219 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 220 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 221 222 /* 223 * lgroup CPU partition event handlers 224 */ 225 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 226 static void lgrp_part_del_cpu(struct cpu *); 227 228 static void lgrp_root_init(void); 229 230 /* 231 * lpl topology 232 */ 233 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 234 static void lpl_clear(lpl_t *); 235 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 236 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 237 static void lpl_rset_add(lpl_t *, lpl_t *); 238 static void lpl_rset_del(lpl_t *, lpl_t *); 239 static int lpl_rset_contains(lpl_t *, lpl_t *); 240 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 241 static void lpl_child_update(lpl_t *, struct cpupart *); 242 static int lpl_pick(lpl_t *, lpl_t *); 243 static void lpl_verify_wrapper(struct cpupart *); 244 245 /* 246 * defines for lpl topology verifier return codes 247 */ 248 249 #define LPL_TOPO_CORRECT 0 250 #define LPL_TOPO_PART_HAS_NO_LPL -1 251 #define LPL_TOPO_CPUS_NOT_EMPTY -2 252 #define LPL_TOPO_LGRP_MISMATCH -3 253 #define LPL_TOPO_MISSING_PARENT -4 254 #define LPL_TOPO_PARENT_MISMATCH -5 255 #define LPL_TOPO_BAD_CPUCNT -6 256 #define LPL_TOPO_RSET_MISMATCH -7 257 #define LPL_TOPO_LPL_ORPHANED -8 258 #define LPL_TOPO_LPL_BAD_NCPU -9 259 #define LPL_TOPO_RSET_MSSNG_LF -10 260 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 261 #define LPL_TOPO_BOGUS_HINT -12 262 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 263 #define LPL_TOPO_LGRP_NOT_LEAF -14 264 #define LPL_TOPO_BAD_RSETCNT -15 265 266 /* 267 * Return whether lgroup optimizations should be enabled on this system 268 */ 269 int 270 lgrp_optimizations(void) 271 { 272 /* 273 * System must have more than 2 lgroups to enable lgroup optimizations 274 * 275 * XXX This assumes that a 2 lgroup system has an empty root lgroup 276 * with one child lgroup containing all the resources. A 2 lgroup 277 * system with a root lgroup directly containing CPUs or memory might 278 * need lgroup optimizations with its child lgroup, but there 279 * isn't such a machine for now.... 
280 */ 281 if (nlgrps > 2) 282 return (1); 283 284 return (0); 285 } 286 287 /* 288 * Build full lgroup topology 289 */ 290 static void 291 lgrp_root_init(void) 292 { 293 lgrp_handle_t hand; 294 int i; 295 lgrp_id_t id; 296 297 /* 298 * Create the "root" lgroup 299 */ 300 ASSERT(nlgrps == 0); 301 id = nlgrps++; 302 303 lgrp_root = &lroot; 304 305 lgrp_root->lgrp_cpu = NULL; 306 lgrp_root->lgrp_mnodes = 0; 307 lgrp_root->lgrp_nmnodes = 0; 308 hand = lgrp_plat_root_hand(); 309 lgrp_root->lgrp_plathand = hand; 310 311 lgrp_root->lgrp_id = id; 312 lgrp_root->lgrp_cpucnt = 0; 313 lgrp_root->lgrp_childcnt = 0; 314 klgrpset_clear(lgrp_root->lgrp_children); 315 klgrpset_clear(lgrp_root->lgrp_leaves); 316 lgrp_root->lgrp_parent = NULL; 317 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 318 319 for (i = 0; i < LGRP_RSRC_COUNT; i++) 320 klgrpset_clear(lgrp_root->lgrp_set[i]); 321 322 lgrp_root->lgrp_kstat = NULL; 323 324 lgrp_table[id] = lgrp_root; 325 326 /* 327 * Setup initial lpl list for CPU0 and initial t0 home. 328 * The only lpl space we have so far is lpl_bootstrap. It is used for 329 * all topology operations until cp_default is initialized at which 330 * point t0.t_lpl will be updated. 331 */ 332 lpl_bootstrap = lpl_bootstrap_list; 333 t0.t_lpl = lpl_bootstrap; 334 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 335 lpl_bootstrap_list[1].lpl_lgrpid = 1; 336 cp_default.cp_lgrploads = lpl_bootstrap; 337 } 338 339 /* 340 * Initialize the lgroup framework and allow the platform to do the same 341 */ 342 void 343 lgrp_init(void) 344 { 345 /* 346 * Initialize the platform 347 */ 348 lgrp_plat_init(); 349 350 /* 351 * Set max number of lgroups supported on this platform which must be 352 * less than the max number of lgroups supported by the common lgroup 353 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 354 */ 355 nlgrpsmax = lgrp_plat_max_lgrps(); 356 ASSERT(nlgrpsmax <= NLGRPS_MAX); 357 } 358 359 /* 360 * Create the root and cpu0's lgroup, and set t0's home. 361 */ 362 void 363 lgrp_setup(void) 364 { 365 /* 366 * Setup the root lgroup 367 */ 368 lgrp_root_init(); 369 370 /* 371 * Add cpu0 to an lgroup 372 */ 373 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 374 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 375 } 376 377 /* 378 * Lgroup initialization is split in two parts. The first part 379 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 380 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 381 * when all CPUs are brought online and all distance information is available. 382 * 383 * When lgrp_main_init() is complete it sets lgrp_initialized. The 384 * lgrp_main_mp_init() sets lgrp_topo_initialized. 385 */ 386 387 /* 388 * true when lgrp initialization has been completed. 389 */ 390 int lgrp_initialized = 0; 391 392 /* 393 * True when lgrp topology is constructed. 394 */ 395 int lgrp_topo_initialized = 0; 396 397 /* 398 * Init routine called after startup(), /etc/system has been processed, 399 * and cpu0 has been added to an lgroup. 400 */ 401 void 402 lgrp_main_init(void) 403 { 404 cpu_t *cp = CPU; 405 lgrp_id_t lgrpid; 406 int i; 407 extern void pg_cpu0_reinit(); 408 409 /* 410 * Enforce a valid lgrp_mem_default_policy 411 */ 412 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 413 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES)) 414 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 415 416 /* 417 * See if mpo should be disabled. 
418 * This may happen in the case of null proc LPA on Starcat. 419 * The platform won't be able to detect null proc LPA until after 420 * cpu0 and memory have already been added to lgroups. 421 * When and if it is detected, the Starcat platform will return 422 * a different platform handle for cpu0 which is what we check for 423 * here. If mpo should be disabled move cpu0 to it's rightful place 424 * (the root), and destroy the remaining lgroups. This effectively 425 * provides an UMA lgroup topology. 426 */ 427 lgrpid = cp->cpu_lpl->lpl_lgrpid; 428 if (lgrp_table[lgrpid]->lgrp_plathand != 429 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 430 lgrp_part_del_cpu(cp); 431 lgrp_cpu_fini(cp, lgrpid); 432 433 lgrp_cpu_init(cp); 434 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 435 436 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 437 438 /* 439 * Notify the PG subsystem that the CPU's lgrp 440 * association has changed 441 */ 442 pg_cpu0_reinit(); 443 444 /* 445 * Destroy all lgroups except for root 446 */ 447 for (i = 0; i <= lgrp_alloc_max; i++) { 448 if (LGRP_EXISTS(lgrp_table[i]) && 449 lgrp_table[i] != lgrp_root) 450 lgrp_destroy(lgrp_table[i]); 451 } 452 453 /* 454 * Fix up root to point at itself for leaves and resources 455 * and not have any children 456 */ 457 lgrp_root->lgrp_childcnt = 0; 458 klgrpset_clear(lgrp_root->lgrp_children); 459 klgrpset_clear(lgrp_root->lgrp_leaves); 460 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 461 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 462 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 463 } 464 465 /* 466 * Initialize kstats framework. 467 */ 468 lgrp_kstat_init(); 469 /* 470 * cpu0 is finally where it should be, so create it's lgroup's kstats 471 */ 472 mutex_enter(&cpu_lock); 473 lgrp_kstat_create(cp); 474 mutex_exit(&cpu_lock); 475 476 lgrp_plat_main_init(); 477 lgrp_initialized = 1; 478 } 479 480 /* 481 * Finish lgrp initialization after all CPUS are brought on-line. 482 * This routine is called after start_other_cpus(). 483 */ 484 void 485 lgrp_main_mp_init(void) 486 { 487 klgrpset_t changed; 488 489 /* 490 * Update lgroup topology (if necessary) 491 */ 492 klgrpset_clear(changed); 493 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 494 lgrp_topo_initialized = 1; 495 } 496 497 /* 498 * Change latency of lgroup with specified lgroup platform handle (if one is 499 * given) or change all lgroups with old latency to new latency 500 */ 501 void 502 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, 503 u_longlong_t newtime) 504 { 505 lgrp_t *lgrp; 506 int i; 507 508 for (i = 0; i <= lgrp_alloc_max; i++) { 509 lgrp = lgrp_table[i]; 510 511 if (!LGRP_EXISTS(lgrp)) 512 continue; 513 514 if ((hand == LGRP_NULL_HANDLE && 515 lgrp->lgrp_latency == oldtime) || 516 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) 517 lgrp->lgrp_latency = (int)newtime; 518 } 519 } 520 521 /* 522 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 523 */ 524 void 525 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 526 { 527 klgrpset_t changed; 528 cpu_t *cp; 529 lgrp_id_t id; 530 int rc; 531 532 switch (event) { 533 /* 534 * The following (re)configuration events are common code 535 * initiated. lgrp_plat_config() is called here to inform the 536 * platform of the reconfiguration event. 
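	 * For example, boot code announces CPU0 this way (see lgrp_setup()):
	 *	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	 *	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);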
537 */ 538 case LGRP_CONFIG_CPU_ADD: 539 cp = (cpu_t *)resource; 540 541 /* 542 * Initialize the new CPU's lgrp related next/prev 543 * links, and give it a bootstrap lpl so that it can 544 * survive should it need to enter the dispatcher. 545 */ 546 cp->cpu_next_lpl = cp; 547 cp->cpu_prev_lpl = cp; 548 cp->cpu_next_lgrp = cp; 549 cp->cpu_prev_lgrp = cp; 550 cp->cpu_lpl = lpl_bootstrap; 551 552 lgrp_plat_config(event, resource); 553 atomic_add_32(&lgrp_gen, 1); 554 555 break; 556 case LGRP_CONFIG_CPU_DEL: 557 lgrp_plat_config(event, resource); 558 atomic_add_32(&lgrp_gen, 1); 559 560 break; 561 case LGRP_CONFIG_CPU_ONLINE: 562 cp = (cpu_t *)resource; 563 lgrp_cpu_init(cp); 564 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 565 rc = lpl_topo_verify(cp->cpu_part); 566 if (rc != LPL_TOPO_CORRECT) { 567 panic("lpl_topo_verify failed: %d", rc); 568 } 569 lgrp_plat_config(event, resource); 570 atomic_add_32(&lgrp_gen, 1); 571 572 break; 573 case LGRP_CONFIG_CPU_OFFLINE: 574 cp = (cpu_t *)resource; 575 id = cp->cpu_lpl->lpl_lgrpid; 576 lgrp_part_del_cpu(cp); 577 lgrp_cpu_fini(cp, id); 578 rc = lpl_topo_verify(cp->cpu_part); 579 if (rc != LPL_TOPO_CORRECT) { 580 panic("lpl_topo_verify failed: %d", rc); 581 } 582 lgrp_plat_config(event, resource); 583 atomic_add_32(&lgrp_gen, 1); 584 585 break; 586 case LGRP_CONFIG_CPUPART_ADD: 587 cp = (cpu_t *)resource; 588 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 589 rc = lpl_topo_verify(cp->cpu_part); 590 if (rc != LPL_TOPO_CORRECT) { 591 panic("lpl_topo_verify failed: %d", rc); 592 } 593 lgrp_plat_config(event, resource); 594 595 break; 596 case LGRP_CONFIG_CPUPART_DEL: 597 cp = (cpu_t *)resource; 598 lgrp_part_del_cpu((cpu_t *)resource); 599 rc = lpl_topo_verify(cp->cpu_part); 600 if (rc != LPL_TOPO_CORRECT) { 601 panic("lpl_topo_verify failed: %d", rc); 602 } 603 lgrp_plat_config(event, resource); 604 605 break; 606 /* 607 * The following events are initiated by the memnode 608 * subsystem. 609 */ 610 case LGRP_CONFIG_MEM_ADD: 611 lgrp_mem_init((int)resource, where, B_FALSE); 612 atomic_add_32(&lgrp_gen, 1); 613 614 break; 615 case LGRP_CONFIG_MEM_DEL: 616 lgrp_mem_fini((int)resource, where, B_FALSE); 617 atomic_add_32(&lgrp_gen, 1); 618 619 break; 620 case LGRP_CONFIG_MEM_RENAME: { 621 lgrp_config_mem_rename_t *ren_arg = 622 (lgrp_config_mem_rename_t *)where; 623 624 lgrp_mem_rename((int)resource, 625 ren_arg->lmem_rename_from, 626 ren_arg->lmem_rename_to); 627 atomic_add_32(&lgrp_gen, 1); 628 629 break; 630 } 631 case LGRP_CONFIG_GEN_UPDATE: 632 atomic_add_32(&lgrp_gen, 1); 633 634 break; 635 case LGRP_CONFIG_FLATTEN: 636 if (where == 0) 637 lgrp_topo_levels = (int)resource; 638 else 639 (void) lgrp_topo_flatten(resource, 640 lgrp_table, lgrp_alloc_max, &changed); 641 642 break; 643 /* 644 * Update any lgroups with old latency to new latency 645 */ 646 case LGRP_CONFIG_LAT_CHANGE_ALL: 647 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 648 (u_longlong_t)where); 649 650 break; 651 /* 652 * Update lgroup with specified lgroup platform handle to have 653 * new latency 654 */ 655 case LGRP_CONFIG_LAT_CHANGE: 656 lgrp_latency_change((lgrp_handle_t)resource, 0, 657 (u_longlong_t)where); 658 659 break; 660 case LGRP_CONFIG_NOP: 661 662 break; 663 default: 664 break; 665 } 666 667 } 668 669 /* 670 * Called to add lgrp info into cpu structure from cpu_add_unit; 671 * do not assume cpu is in cpu[] yet! 
672 * 673 * CPUs are brought online with all other CPUs paused so we can't 674 * allocate memory or we could deadlock the system, so we rely on 675 * the platform to statically allocate as much space as we need 676 * for the lgrp structs and stats. 677 */ 678 static void 679 lgrp_cpu_init(struct cpu *cp) 680 { 681 klgrpset_t changed; 682 int count; 683 lgrp_handle_t hand; 684 int first_cpu; 685 lgrp_t *my_lgrp; 686 lgrp_id_t lgrpid; 687 struct cpu *cptr; 688 689 /* 690 * This is the first time through if the resource set 691 * for the root lgroup is empty. After cpu0 has been 692 * initially added to an lgroup, the root's CPU resource 693 * set can never be empty, since the system's last CPU 694 * cannot be offlined. 695 */ 696 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 697 /* 698 * First time through. 699 */ 700 first_cpu = 1; 701 } else { 702 /* 703 * If cpu0 needs to move lgroups, we may come 704 * through here again, at which time cpu_lock won't 705 * be held, and lgrp_initialized will be false. 706 */ 707 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 708 ASSERT(cp->cpu_part != NULL); 709 first_cpu = 0; 710 } 711 712 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 713 my_lgrp = lgrp_hand_to_lgrp(hand); 714 715 if (my_lgrp == NULL) { 716 /* 717 * Create new lgrp and add it to lgroup topology 718 */ 719 my_lgrp = lgrp_create(); 720 my_lgrp->lgrp_plathand = hand; 721 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 722 lgrpid = my_lgrp->lgrp_id; 723 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 724 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 725 726 count = 0; 727 klgrpset_clear(changed); 728 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 729 &changed); 730 /* 731 * May have added new intermediate lgroups, so need to add 732 * resources other than CPUs which are added below 733 */ 734 (void) lgrp_mnode_update(changed, NULL); 735 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 736 > 0) { 737 /* 738 * Leaf lgroup was created, but latency wasn't available 739 * then. So, set latency for it and fill in rest of lgroup 740 * topology now that we know how far it is from other leaf 741 * lgroups. 742 */ 743 lgrpid = my_lgrp->lgrp_id; 744 klgrpset_clear(changed); 745 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 746 lgrpid)) 747 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 748 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 749 &changed); 750 751 /* 752 * May have added new intermediate lgroups, so need to add 753 * resources other than CPUs which are added below 754 */ 755 (void) lgrp_mnode_update(changed, NULL); 756 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 757 my_lgrp->lgrp_id)) { 758 int i; 759 760 /* 761 * Update existing lgroup and lgroups containing it with CPU 762 * resource 763 */ 764 lgrpid = my_lgrp->lgrp_id; 765 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 766 for (i = 0; i <= lgrp_alloc_max; i++) { 767 lgrp_t *lgrp; 768 769 lgrp = lgrp_table[i]; 770 if (!LGRP_EXISTS(lgrp) || 771 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 772 continue; 773 774 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 775 } 776 } 777 778 lgrpid = my_lgrp->lgrp_id; 779 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 780 781 /* 782 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 783 * end up in lpl for lgroup 0 whether it is supposed to be in there or 784 * not since none of lgroup IDs in the lpl's have been set yet. 
785 */ 786 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 787 cp->cpu_lpl->lpl_lgrpid = lgrpid; 788 789 /* 790 * link the CPU into the lgrp's CPU list 791 */ 792 if (my_lgrp->lgrp_cpucnt == 0) { 793 my_lgrp->lgrp_cpu = cp; 794 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 795 } else { 796 cptr = my_lgrp->lgrp_cpu; 797 cp->cpu_next_lgrp = cptr; 798 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 799 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 800 cptr->cpu_prev_lgrp = cp; 801 } 802 my_lgrp->lgrp_cpucnt++; 803 } 804 805 lgrp_t * 806 lgrp_create(void) 807 { 808 lgrp_t *my_lgrp; 809 lgrp_id_t lgrpid; 810 int i; 811 812 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 813 814 /* 815 * Find an open slot in the lgroup table and recycle unused lgroup 816 * left there if any 817 */ 818 my_lgrp = NULL; 819 if (lgrp_alloc_hint == -1) 820 /* 821 * Allocate from end when hint not set yet because no lgroups 822 * have been deleted yet 823 */ 824 lgrpid = nlgrps++; 825 else { 826 /* 827 * Start looking for next open slot from hint and leave hint 828 * at slot allocated 829 */ 830 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 831 my_lgrp = lgrp_table[i]; 832 if (!LGRP_EXISTS(my_lgrp)) { 833 lgrpid = i; 834 nlgrps++; 835 break; 836 } 837 } 838 lgrp_alloc_hint = lgrpid; 839 } 840 841 /* 842 * Keep track of max lgroup ID allocated so far to cut down on searches 843 */ 844 if (lgrpid > lgrp_alloc_max) 845 lgrp_alloc_max = lgrpid; 846 847 /* 848 * Need to allocate new lgroup if next open slot didn't have one 849 * for recycling 850 */ 851 if (my_lgrp == NULL) 852 my_lgrp = lgrp_plat_alloc(lgrpid); 853 854 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 855 panic("Too many lgrps for platform (%d)", nlgrps); 856 857 my_lgrp->lgrp_id = lgrpid; 858 my_lgrp->lgrp_latency = 0; 859 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 860 my_lgrp->lgrp_parent = NULL; 861 my_lgrp->lgrp_childcnt = 0; 862 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 863 my_lgrp->lgrp_nmnodes = 0; 864 klgrpset_clear(my_lgrp->lgrp_children); 865 klgrpset_clear(my_lgrp->lgrp_leaves); 866 for (i = 0; i < LGRP_RSRC_COUNT; i++) 867 klgrpset_clear(my_lgrp->lgrp_set[i]); 868 869 my_lgrp->lgrp_cpu = NULL; 870 my_lgrp->lgrp_cpucnt = 0; 871 872 if (my_lgrp->lgrp_kstat != NULL) 873 lgrp_kstat_reset(lgrpid); 874 875 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 876 877 return (my_lgrp); 878 } 879 880 void 881 lgrp_destroy(lgrp_t *lgrp) 882 { 883 int i; 884 885 /* 886 * Unless this lgroup is being destroyed on behalf of 887 * the boot CPU, cpu_lock must be held 888 */ 889 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 890 891 if (nlgrps == 1) 892 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 893 894 if (!LGRP_EXISTS(lgrp)) 895 return; 896 897 /* 898 * Set hint to lgroup being deleted and try to keep lower numbered 899 * hints to facilitate finding empty slots 900 */ 901 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 902 lgrp_alloc_hint = lgrp->lgrp_id; 903 904 /* 905 * Mark this lgroup to be recycled by setting its lgroup ID to 906 * LGRP_NONE and clear relevant fields 907 */ 908 lgrp->lgrp_id = LGRP_NONE; 909 lgrp->lgrp_latency = 0; 910 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 911 lgrp->lgrp_parent = NULL; 912 lgrp->lgrp_childcnt = 0; 913 914 klgrpset_clear(lgrp->lgrp_children); 915 klgrpset_clear(lgrp->lgrp_leaves); 916 for (i = 0; i < LGRP_RSRC_COUNT; i++) 917 klgrpset_clear(lgrp->lgrp_set[i]); 918 919 lgrp->lgrp_mnodes = (mnodeset_t)0; 920 lgrp->lgrp_nmnodes = 0; 921 922 lgrp->lgrp_cpu = NULL; 923 lgrp->lgrp_cpucnt = 0; 924 925 nlgrps--; 
926 } 927 928 /* 929 * Initialize kstat data. Called from lgrp intialization code. 930 */ 931 static void 932 lgrp_kstat_init(void) 933 { 934 lgrp_stat_t stat; 935 936 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 937 938 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 939 kstat_named_init(&lgrp_kstat_data[stat], 940 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 941 } 942 943 /* 944 * initialize an lgrp's kstats if needed 945 * called with cpu_lock held but not with cpus paused. 946 * we don't tear these down now because we don't know about 947 * memory leaving the lgrp yet... 948 */ 949 950 void 951 lgrp_kstat_create(cpu_t *cp) 952 { 953 kstat_t *lgrp_kstat; 954 lgrp_id_t lgrpid; 955 lgrp_t *my_lgrp; 956 957 ASSERT(MUTEX_HELD(&cpu_lock)); 958 959 lgrpid = cp->cpu_lpl->lpl_lgrpid; 960 my_lgrp = lgrp_table[lgrpid]; 961 962 if (my_lgrp->lgrp_kstat != NULL) 963 return; /* already initialized */ 964 965 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 966 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 967 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 968 969 if (lgrp_kstat != NULL) { 970 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 971 lgrp_kstat->ks_private = my_lgrp; 972 lgrp_kstat->ks_data = &lgrp_kstat_data; 973 lgrp_kstat->ks_update = lgrp_kstat_extract; 974 my_lgrp->lgrp_kstat = lgrp_kstat; 975 kstat_install(lgrp_kstat); 976 } 977 } 978 979 /* 980 * this will do something when we manage to remove now unused lgrps 981 */ 982 983 /* ARGSUSED */ 984 void 985 lgrp_kstat_destroy(cpu_t *cp) 986 { 987 ASSERT(MUTEX_HELD(&cpu_lock)); 988 } 989 990 /* 991 * Called when a CPU is off-lined. 992 */ 993 static void 994 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 995 { 996 lgrp_t *my_lgrp; 997 struct cpu *prev; 998 struct cpu *next; 999 1000 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 1001 1002 prev = cp->cpu_prev_lgrp; 1003 next = cp->cpu_next_lgrp; 1004 1005 prev->cpu_next_lgrp = next; 1006 next->cpu_prev_lgrp = prev; 1007 1008 /* 1009 * just because I'm paranoid doesn't mean... 
1010 */ 1011 1012 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1013 1014 my_lgrp = lgrp_table[lgrpid]; 1015 my_lgrp->lgrp_cpucnt--; 1016 1017 /* 1018 * Removing last CPU in lgroup, so update lgroup topology 1019 */ 1020 if (my_lgrp->lgrp_cpucnt == 0) { 1021 klgrpset_t changed; 1022 int count; 1023 int i; 1024 1025 my_lgrp->lgrp_cpu = NULL; 1026 1027 /* 1028 * Remove this lgroup from its lgroup CPU resources and remove 1029 * lgroup from lgroup topology if it doesn't have any more 1030 * resources in it now 1031 */ 1032 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1033 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1034 count = 0; 1035 klgrpset_clear(changed); 1036 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1037 lgrp_alloc_max + 1, &changed); 1038 return; 1039 } 1040 1041 /* 1042 * This lgroup isn't empty, so just remove it from CPU 1043 * resources of any lgroups that contain it as such 1044 */ 1045 for (i = 0; i <= lgrp_alloc_max; i++) { 1046 lgrp_t *lgrp; 1047 1048 lgrp = lgrp_table[i]; 1049 if (!LGRP_EXISTS(lgrp) || 1050 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1051 lgrpid)) 1052 continue; 1053 1054 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1055 } 1056 return; 1057 } 1058 1059 if (my_lgrp->lgrp_cpu == cp) 1060 my_lgrp->lgrp_cpu = next; 1061 1062 } 1063 1064 /* 1065 * Update memory nodes in target lgroups and return ones that get changed 1066 */ 1067 int 1068 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1069 { 1070 int count; 1071 int i; 1072 int j; 1073 lgrp_t *lgrp; 1074 lgrp_t *lgrp_rsrc; 1075 1076 count = 0; 1077 if (changed) 1078 klgrpset_clear(*changed); 1079 1080 if (klgrpset_isempty(target)) 1081 return (0); 1082 1083 /* 1084 * Find each lgroup in target lgroups 1085 */ 1086 for (i = 0; i <= lgrp_alloc_max; i++) { 1087 /* 1088 * Skip any lgroups that don't exist or aren't in target group 1089 */ 1090 lgrp = lgrp_table[i]; 1091 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1092 continue; 1093 } 1094 1095 /* 1096 * Initialize memnodes for intermediate lgroups to 0 1097 * and update them from scratch since they may have completely 1098 * changed 1099 */ 1100 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1101 lgrp->lgrp_mnodes = (mnodeset_t)0; 1102 lgrp->lgrp_nmnodes = 0; 1103 } 1104 1105 /* 1106 * Update memory nodes of of target lgroup with memory nodes 1107 * from each lgroup in its lgroup memory resource set 1108 */ 1109 for (j = 0; j <= lgrp_alloc_max; j++) { 1110 int k; 1111 1112 /* 1113 * Skip any lgroups that don't exist or aren't in 1114 * memory resources of target lgroup 1115 */ 1116 lgrp_rsrc = lgrp_table[j]; 1117 if (!LGRP_EXISTS(lgrp_rsrc) || 1118 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1119 j)) 1120 continue; 1121 1122 /* 1123 * Update target lgroup's memnodes to include memnodes 1124 * of this lgroup 1125 */ 1126 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1127 mnodeset_t mnode_mask; 1128 1129 mnode_mask = (mnodeset_t)1 << k; 1130 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1131 !(lgrp->lgrp_mnodes & mnode_mask)) { 1132 lgrp->lgrp_mnodes |= mnode_mask; 1133 lgrp->lgrp_nmnodes++; 1134 } 1135 } 1136 count++; 1137 if (changed) 1138 klgrpset_add(*changed, lgrp->lgrp_id); 1139 } 1140 } 1141 1142 return (count); 1143 } 1144 1145 /* 1146 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1147 * is moved from one board to another. The "from" and "to" arguments specify the 1148 * source and the destination of the move. 
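 * In essence the rename is performed as two steps (see lgrp_mem_rename()
 * below): lgrp_mem_fini() deletes the mnode from the source lgroup and
 * lgrp_mem_init() re-inserts it in the target lgroup, both with the
 * copy-rename flag set:
 *	lgrp_mem_fini(mnode, from, B_TRUE);
 *	lgrp_mem_init(mnode, to, B_TRUE);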
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another.  It removes the mnode from the source lgroup and re-inserts it in
 * the target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling.  If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy.  This mnode is soon re-inserted in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy.  If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held, which prevents lgrp_mem_init() from
 * re-inserting the mnode).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes.  To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode were actually
 * removed.  The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky.  Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers.  During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately
	 * if it is.
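	 *
	 * Leaving aside the copy-rename special case described next,
	 * "already configured" simply means the mnode's bit is already set
	 * in the root lgroup's mnode set, i.e. (illustrative):
	 *	if (lgrp_root->lgrp_mnodes & ((mnodeset_t)1 << mnode))
	 *		return;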
1235 * 1236 * NOTE: in special case of copy-rename of the only remaining mnode, 1237 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1238 * recognize this case and continue as usual, but skip the update to 1239 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1240 * in topology, temporarily introduced by lgrp_mem_fini(). 1241 */ 1242 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1243 lgrp_root->lgrp_mnodes & mnodes_mask) { 1244 if (drop_lock) 1245 mutex_exit(&cpu_lock); 1246 return; 1247 } 1248 1249 /* 1250 * Update lgroup topology with new memory resources, keeping track of 1251 * which lgroups change 1252 */ 1253 count = 0; 1254 klgrpset_clear(changed); 1255 my_lgrp = lgrp_hand_to_lgrp(hand); 1256 if (my_lgrp == NULL) { 1257 /* new lgrp */ 1258 my_lgrp = lgrp_create(); 1259 lgrpid = my_lgrp->lgrp_id; 1260 my_lgrp->lgrp_plathand = hand; 1261 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1262 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1263 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1264 1265 if (need_synch) 1266 pause_cpus(NULL); 1267 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1268 &changed); 1269 if (need_synch) 1270 start_cpus(); 1271 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1272 > 0) { 1273 /* 1274 * Leaf lgroup was created, but latency wasn't available 1275 * then. So, set latency for it and fill in rest of lgroup 1276 * topology now that we know how far it is from other leaf 1277 * lgroups. 1278 */ 1279 klgrpset_clear(changed); 1280 lgrpid = my_lgrp->lgrp_id; 1281 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1282 lgrpid)) 1283 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1284 if (need_synch) 1285 pause_cpus(NULL); 1286 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1287 &changed); 1288 if (need_synch) 1289 start_cpus(); 1290 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1291 my_lgrp->lgrp_id)) { 1292 /* 1293 * Add new lgroup memory resource to existing lgroup 1294 */ 1295 lgrpid = my_lgrp->lgrp_id; 1296 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1297 klgrpset_add(changed, lgrpid); 1298 count++; 1299 for (i = 0; i <= lgrp_alloc_max; i++) { 1300 lgrp_t *lgrp; 1301 1302 lgrp = lgrp_table[i]; 1303 if (!LGRP_EXISTS(lgrp) || 1304 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1305 continue; 1306 1307 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1308 klgrpset_add(changed, lgrp->lgrp_id); 1309 count++; 1310 } 1311 } 1312 1313 /* 1314 * Add memory node to lgroup and remove lgroup from ones that need 1315 * to be updated 1316 */ 1317 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1318 my_lgrp->lgrp_mnodes |= mnodes_mask; 1319 my_lgrp->lgrp_nmnodes++; 1320 } 1321 klgrpset_del(changed, lgrpid); 1322 1323 /* 1324 * Update memory node information for all lgroups that changed and 1325 * contain new memory node as a resource 1326 */ 1327 if (count) 1328 (void) lgrp_mnode_update(changed, NULL); 1329 1330 if (drop_lock) 1331 mutex_exit(&cpu_lock); 1332 } 1333 1334 /* 1335 * Called to indicate that the lgroup associated with the platform 1336 * handle "hand" no longer contains given memory node 1337 * 1338 * LOCKING for this routine is a bit tricky. Usually it is called without 1339 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1340 * callers. During DR of the board containing the caged memory it may be called 1341 * with cpu_lock already held and CPUs paused. 
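 *
 * The function therefore only takes (and later drops) cpu_lock when the
 * caller does not already hold it; this mirrors the pattern in the code
 * below:
 *	if (!MUTEX_HELD(&cpu_lock)) {
 *		mutex_enter(&cpu_lock);
 *		drop_lock = B_TRUE;
 *	}
 *	...
 *	if (drop_lock)
 *		mutex_exit(&cpu_lock);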
1342 * 1343 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1344 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1345 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1346 * the same mnode back into the topology. See lgrp_mem_rename() and 1347 * lgrp_mem_init() for additional details. 1348 */ 1349 void 1350 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1351 { 1352 klgrpset_t changed; 1353 int count; 1354 int i; 1355 lgrp_t *my_lgrp; 1356 lgrp_id_t lgrpid; 1357 mnodeset_t mnodes_mask; 1358 boolean_t drop_lock = B_FALSE; 1359 boolean_t need_synch = B_FALSE; 1360 1361 /* 1362 * Grab CPU lock (if we haven't already) 1363 */ 1364 if (!MUTEX_HELD(&cpu_lock)) { 1365 mutex_enter(&cpu_lock); 1366 drop_lock = B_TRUE; 1367 } 1368 1369 /* 1370 * This routine may be called from a context where we already 1371 * hold cpu_lock and have already paused cpus. 1372 */ 1373 if (!cpus_paused()) 1374 need_synch = B_TRUE; 1375 1376 my_lgrp = lgrp_hand_to_lgrp(hand); 1377 1378 /* 1379 * The lgrp *must* be pre-existing 1380 */ 1381 ASSERT(my_lgrp != NULL); 1382 1383 /* 1384 * Delete memory node from lgroups which contain it 1385 */ 1386 mnodes_mask = ((mnodeset_t)1 << mnode); 1387 for (i = 0; i <= lgrp_alloc_max; i++) { 1388 lgrp_t *lgrp = lgrp_table[i]; 1389 /* 1390 * Skip any non-existent lgroups and any lgroups that don't 1391 * contain leaf lgroup of memory as a memory resource 1392 */ 1393 if (!LGRP_EXISTS(lgrp) || 1394 !(lgrp->lgrp_mnodes & mnodes_mask)) 1395 continue; 1396 1397 /* 1398 * Avoid removing the last mnode from the root in the DR 1399 * copy-rename case. See lgrp_mem_rename() for details. 1400 */ 1401 if (is_copy_rename && 1402 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1403 continue; 1404 1405 /* 1406 * Remove memory node from lgroup. 1407 */ 1408 lgrp->lgrp_mnodes &= ~mnodes_mask; 1409 lgrp->lgrp_nmnodes--; 1410 ASSERT(lgrp->lgrp_nmnodes >= 0); 1411 } 1412 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1413 1414 /* 1415 * Don't need to update lgroup topology if this lgroup still has memory. 1416 * 1417 * In the special case of DR copy-rename with the only mnode being 1418 * removed, the lgrp_mnodes for the root is always non-zero, but we 1419 * still need to update the lgroup topology. 
1420 */ 1421 if ((my_lgrp->lgrp_nmnodes > 0) && 1422 !(is_copy_rename && 1423 (my_lgrp == lgrp_root) && 1424 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1425 if (drop_lock) 1426 mutex_exit(&cpu_lock); 1427 return; 1428 } 1429 1430 /* 1431 * This lgroup does not contain any memory now 1432 */ 1433 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1434 1435 /* 1436 * Remove this lgroup from lgroup topology if it does not contain any 1437 * resources now 1438 */ 1439 lgrpid = my_lgrp->lgrp_id; 1440 count = 0; 1441 klgrpset_clear(changed); 1442 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1443 /* 1444 * Delete lgroup when no more resources 1445 */ 1446 if (need_synch) 1447 pause_cpus(NULL); 1448 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1449 lgrp_alloc_max + 1, &changed); 1450 ASSERT(count > 0); 1451 if (need_synch) 1452 start_cpus(); 1453 } else { 1454 /* 1455 * Remove lgroup from memory resources of any lgroups that 1456 * contain it as such 1457 */ 1458 for (i = 0; i <= lgrp_alloc_max; i++) { 1459 lgrp_t *lgrp; 1460 1461 lgrp = lgrp_table[i]; 1462 if (!LGRP_EXISTS(lgrp) || 1463 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1464 lgrpid)) 1465 continue; 1466 1467 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1468 } 1469 } 1470 if (drop_lock) 1471 mutex_exit(&cpu_lock); 1472 } 1473 1474 /* 1475 * Return lgroup with given platform handle 1476 */ 1477 lgrp_t * 1478 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1479 { 1480 int i; 1481 lgrp_t *lgrp; 1482 1483 if (hand == LGRP_NULL_HANDLE) 1484 return (NULL); 1485 1486 for (i = 0; i <= lgrp_alloc_max; i++) { 1487 lgrp = lgrp_table[i]; 1488 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1489 return (lgrp); 1490 } 1491 return (NULL); 1492 } 1493 1494 /* 1495 * Return the home lgroup of the current thread. 1496 * We must do this with kernel preemption disabled, since we don't want our 1497 * thread to be re-homed while we're poking around with its lpl, and the lpl 1498 * should never be NULL. 1499 * 1500 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1501 * is enabled because of DR. Callers can use disable kernel preemption 1502 * around this call to guarantee that the lgroup will be valid beyond this 1503 * routine, since kernel preemption can be recursive. 1504 */ 1505 lgrp_t * 1506 lgrp_home_lgrp(void) 1507 { 1508 lgrp_t *lgrp; 1509 lpl_t *lpl; 1510 1511 kpreempt_disable(); 1512 1513 lpl = curthread->t_lpl; 1514 ASSERT(lpl != NULL); 1515 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1516 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1517 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1518 1519 kpreempt_enable(); 1520 1521 return (lgrp); 1522 } 1523 1524 /* 1525 * Return ID of home lgroup for given thread 1526 * (See comments for lgrp_home_lgrp() for special care and handling 1527 * instructions) 1528 */ 1529 lgrp_id_t 1530 lgrp_home_id(kthread_t *t) 1531 { 1532 lgrp_id_t lgrp; 1533 lpl_t *lpl; 1534 1535 ASSERT(t != NULL); 1536 /* 1537 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1538 * cannot since the HAT layer can call into this routine to 1539 * determine the locality for its data structures in the context 1540 * of a page fault. 
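	 *
	 * A caller that needs the returned ID to remain meaningful after
	 * this function returns can bracket the call with its own preemption
	 * disable (kernel preemption can be recursive), e.g. (illustrative):
	 *	kpreempt_disable();
	 *	home = lgrp_home_id(curthread);
	 *	... use home ...
	 *	kpreempt_enable();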
1541 */ 1542 1543 kpreempt_disable(); 1544 1545 lpl = t->t_lpl; 1546 ASSERT(lpl != NULL); 1547 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1548 lgrp = lpl->lpl_lgrpid; 1549 1550 kpreempt_enable(); 1551 1552 return (lgrp); 1553 } 1554 1555 /* 1556 * Return lgroup containing the physical memory for the given page frame number 1557 */ 1558 lgrp_t * 1559 lgrp_pfn_to_lgrp(pfn_t pfn) 1560 { 1561 lgrp_handle_t hand; 1562 int i; 1563 lgrp_t *lgrp; 1564 1565 hand = lgrp_plat_pfn_to_hand(pfn); 1566 if (hand != LGRP_NULL_HANDLE) 1567 for (i = 0; i <= lgrp_alloc_max; i++) { 1568 lgrp = lgrp_table[i]; 1569 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1570 return (lgrp); 1571 } 1572 return (NULL); 1573 } 1574 1575 /* 1576 * Return lgroup containing the physical memory for the given page frame number 1577 */ 1578 lgrp_t * 1579 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1580 { 1581 lgrp_handle_t hand; 1582 int i; 1583 lgrp_t *lgrp; 1584 pfn_t pfn; 1585 1586 pfn = btop(physaddr); 1587 hand = lgrp_plat_pfn_to_hand(pfn); 1588 if (hand != LGRP_NULL_HANDLE) 1589 for (i = 0; i <= lgrp_alloc_max; i++) { 1590 lgrp = lgrp_table[i]; 1591 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1592 return (lgrp); 1593 } 1594 return (NULL); 1595 } 1596 1597 /* 1598 * Return the leaf lgroup containing the given CPU 1599 * 1600 * The caller needs to take precautions necessary to prevent 1601 * "cpu", and it's lpl from going away across a call to this function. 1602 * hint: kpreempt_disable()/kpreempt_enable() 1603 */ 1604 static lgrp_t * 1605 lgrp_cpu_to_lgrp(cpu_t *cpu) 1606 { 1607 return (cpu->cpu_lpl->lpl_lgrp); 1608 } 1609 1610 /* 1611 * Return the sum of the partition loads in an lgrp divided by 1612 * the number of CPUs in the lgrp. This is our best approximation 1613 * of an 'lgroup load average' for a useful per-lgroup kstat. 1614 */ 1615 static uint64_t 1616 lgrp_sum_loadavgs(lgrp_t *lgrp) 1617 { 1618 cpu_t *cpu; 1619 int ncpu; 1620 uint64_t loads = 0; 1621 1622 mutex_enter(&cpu_lock); 1623 1624 cpu = lgrp->lgrp_cpu; 1625 ncpu = lgrp->lgrp_cpucnt; 1626 1627 if (cpu == NULL || ncpu == 0) { 1628 mutex_exit(&cpu_lock); 1629 return (0ull); 1630 } 1631 1632 do { 1633 loads += cpu->cpu_lpl->lpl_loadavg; 1634 cpu = cpu->cpu_next_lgrp; 1635 } while (cpu != lgrp->lgrp_cpu); 1636 1637 mutex_exit(&cpu_lock); 1638 1639 return (loads / ncpu); 1640 } 1641 1642 void 1643 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1644 { 1645 struct lgrp_stats *pstats; 1646 1647 /* 1648 * Verify that the caller isn't trying to add to 1649 * a statistic for an lgroup that has gone away 1650 */ 1651 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1652 return; 1653 1654 pstats = &lgrp_stats[lgrpid]; 1655 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1656 } 1657 1658 int64_t 1659 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1660 { 1661 uint64_t val; 1662 struct lgrp_stats *pstats; 1663 1664 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1665 return ((int64_t)0); 1666 1667 pstats = &lgrp_stats[lgrpid]; 1668 LGRP_STAT_READ(pstats, stat, val); 1669 return (val); 1670 } 1671 1672 /* 1673 * Reset all kstats for lgrp specified by its lgrpid. 
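 *
 * These counters are exported through the "lgrp" kstat module created in
 * lgrp_kstat_create(), so from userland they can be inspected with, for
 * example (illustrative invocation):
 *	kstat -m lgrp -i <lgrpid>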
1674 */ 1675 static void 1676 lgrp_kstat_reset(lgrp_id_t lgrpid) 1677 { 1678 lgrp_stat_t stat; 1679 1680 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1681 return; 1682 1683 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1684 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1685 } 1686 } 1687 1688 /* 1689 * Collect all per-lgrp statistics for the lgrp associated with this 1690 * kstat, and store them in the ks_data array. 1691 * 1692 * The superuser can reset all the running counter statistics for an 1693 * lgrp by writing to any of the lgrp's stats. 1694 */ 1695 static int 1696 lgrp_kstat_extract(kstat_t *ksp, int rw) 1697 { 1698 lgrp_stat_t stat; 1699 struct kstat_named *ksd; 1700 lgrp_t *lgrp; 1701 lgrp_id_t lgrpid; 1702 1703 lgrp = (lgrp_t *)ksp->ks_private; 1704 1705 ksd = (struct kstat_named *)ksp->ks_data; 1706 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1707 1708 lgrpid = lgrp->lgrp_id; 1709 1710 if (lgrpid == LGRP_NONE) { 1711 /* 1712 * Return all zeroes as stats for freed lgrp. 1713 */ 1714 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1715 ksd[stat].value.i64 = 0; 1716 } 1717 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1718 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1719 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1720 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1721 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1722 } else if (rw != KSTAT_WRITE) { 1723 /* 1724 * Handle counter stats 1725 */ 1726 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1727 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1728 } 1729 1730 /* 1731 * Handle kernel data snapshot stats 1732 */ 1733 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1734 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1735 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1736 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1737 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1738 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1739 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1740 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1741 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1742 lgrp_loadavg_max_effect; 1743 } else { 1744 lgrp_kstat_reset(lgrpid); 1745 } 1746 1747 return (0); 1748 } 1749 1750 int 1751 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1752 { 1753 cpu_t *cp; 1754 1755 mutex_enter(&cpu_lock); 1756 1757 if ((cp = cpu_get(id)) == NULL) { 1758 mutex_exit(&cpu_lock); 1759 return (EINVAL); 1760 } 1761 1762 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1763 mutex_exit(&cpu_lock); 1764 return (EINVAL); 1765 } 1766 1767 ASSERT(cp->cpu_lpl != NULL); 1768 1769 *lp = cp->cpu_lpl->lpl_lgrpid; 1770 1771 mutex_exit(&cpu_lock); 1772 1773 return (0); 1774 } 1775 1776 int 1777 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1778 { 1779 cpu_t *cp; 1780 1781 mutex_enter(&cpu_lock); 1782 1783 if ((cp = cpu_get(id)) == NULL) { 1784 mutex_exit(&cpu_lock); 1785 return (EINVAL); 1786 } 1787 1788 ASSERT(cp->cpu_lpl != NULL); 1789 1790 *lp = cp->cpu_lpl->lpl_loadavg; 1791 1792 mutex_exit(&cpu_lock); 1793 1794 return (0); 1795 } 1796 1797 /* 1798 * Add a resource named by lpl_leaf to rset of lpl_target 1799 * 1800 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1801 * resource. It is adjusted here, as this is presently the only place that we 1802 * can be certain a resource addition has succeeded. 1803 * 1804 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1805 * list in order until it reaches a NULL. 
(This list is required to be NULL 1806 * terminated, too). This is done so that we can mark start pos + 1, so that 1807 * each lpl is traversed sequentially, but in a different order. We hope this 1808 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 1809 */ 1810 1811 void 1812 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1813 { 1814 int i; 1815 int entry_slot = 0; 1816 1817 /* return if leaf is already present */ 1818 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1819 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1820 return; 1821 } 1822 1823 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1824 lpl_leaf->lpl_lgrpid) { 1825 break; 1826 } 1827 } 1828 1829 /* insert leaf, update counts */ 1830 entry_slot = i; 1831 i = lpl_target->lpl_nrset++; 1832 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1833 panic("More leaf lgrps in system than are supported!\n"); 1834 } 1835 1836 /* 1837 * Start at the end of the rset array and work backwards towards the 1838 * slot into which the new lpl will be inserted. This effectively 1839 * preserves the current ordering by scooting everybody over one entry, 1840 * and placing the new entry into the space created. 1841 */ 1842 1843 while (i-- > entry_slot) { 1844 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1845 } 1846 1847 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1848 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1849 } 1850 1851 /* 1852 * Update each of lpl_parent's children with a proper hint and 1853 * a reference to their parent. 1854 * The lgrp topology is used as the reference since it is fully 1855 * consistent and correct at this point. 1856 * 1857 * Each child's hint will reference an element in lpl_parent's 1858 * rset that designates where the child should start searching 1859 * for CPU resources. The hint selected is the highest order leaf present 1860 * in the child's lineage. 1861 * 1862 * This should be called after any potential change in lpl_parent's 1863 * rset. 1864 */ 1865 static void 1866 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1867 { 1868 klgrpset_t children, leaves; 1869 lpl_t *lpl; 1870 int hint; 1871 int i, j; 1872 1873 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1874 if (klgrpset_isempty(children)) 1875 return; /* nothing to do */ 1876 1877 for (i = 0; i <= lgrp_alloc_max; i++) { 1878 if (klgrpset_ismember(children, i)) { 1879 1880 /* 1881 * Given the set of leaves in this child's lineage, 1882 * find the highest order leaf present in the parent's 1883 * rset. Select this as the hint for the child. 1884 */ 1885 leaves = lgrp_table[i]->lgrp_leaves; 1886 hint = 0; 1887 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1888 lpl = lpl_parent->lpl_rset[j]; 1889 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1890 hint = j; 1891 } 1892 cp->cp_lgrploads[i].lpl_hint = hint; 1893 1894 /* 1895 * (Re)set the parent. It may be incorrect if 1896 * lpl_parent is new in the topology. 1897 */ 1898 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1899 } 1900 } 1901 } 1902 1903 /* 1904 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1905 * 1906 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1907 * resource. The values are adjusted here, as this is the only place that we can 1908 * be certain a resource was successfully deleted. 
1909 */ 1910 void 1911 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1912 { 1913 int i; 1914 1915 /* find leaf in intermediate node */ 1916 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1917 if (lpl_target->lpl_rset[i] == lpl_leaf) 1918 break; 1919 } 1920 1921 /* return if leaf not found */ 1922 if (lpl_target->lpl_rset[i] != lpl_leaf) 1923 return; 1924 1925 /* prune leaf, compress array */ 1926 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1927 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1928 lpl_target->lpl_ncpu--; 1929 do { 1930 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1931 } while (i++ < lpl_target->lpl_nrset); 1932 } 1933 1934 /* 1935 * Check to see if the resource set of the target lpl contains the 1936 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1937 */ 1938 1939 int 1940 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1941 { 1942 int i; 1943 1944 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1945 if (lpl_target->lpl_rset[i] == lpl_leaf) 1946 return (1); 1947 } 1948 1949 return (0); 1950 } 1951 1952 /* 1953 * Called when we change cpu lpl membership. This increments or decrements the 1954 * per-cpu counter in every lpl in which our leaf appears. 1955 */ 1956 void 1957 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1958 { 1959 cpupart_t *cpupart; 1960 lgrp_t *lgrp_leaf; 1961 lgrp_t *lgrp_cur; 1962 lpl_t *lpl_leaf; 1963 lpl_t *lpl_cur; 1964 int i; 1965 1966 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1967 1968 cpupart = cp->cpu_part; 1969 lpl_leaf = cp->cpu_lpl; 1970 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1971 1972 for (i = 0; i <= lgrp_alloc_max; i++) { 1973 lgrp_cur = lgrp_table[i]; 1974 1975 /* 1976 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 1977 * for the cpu in question, or if the current lgrp and leaf 1978 * don't share the same resources. 1979 */ 1980 1981 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 1982 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 1983 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 1984 continue; 1985 1986 1987 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 1988 1989 if (lpl_cur->lpl_nrset > 0) { 1990 if (act == LPL_INCREMENT) { 1991 lpl_cur->lpl_ncpu++; 1992 } else if (act == LPL_DECREMENT) { 1993 lpl_cur->lpl_ncpu--; 1994 } 1995 } 1996 } 1997 } 1998 1999 /* 2000 * Initialize lpl with given resources and specified lgrp 2001 */ 2002 2003 void 2004 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2005 { 2006 lpl->lpl_lgrpid = lgrp->lgrp_id; 2007 lpl->lpl_loadavg = 0; 2008 if (lpl == lpl_leaf) 2009 lpl->lpl_ncpu = 1; 2010 else 2011 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2012 lpl->lpl_nrset = 1; 2013 lpl->lpl_rset[0] = lpl_leaf; 2014 lpl->lpl_lgrp = lgrp; 2015 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2016 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2017 } 2018 2019 /* 2020 * Clear an unused lpl 2021 */ 2022 2023 void 2024 lpl_clear(lpl_t *lpl) 2025 { 2026 lgrp_id_t lid; 2027 2028 /* save lid for debugging purposes */ 2029 lid = lpl->lpl_lgrpid; 2030 bzero(lpl, sizeof (lpl_t)); 2031 lpl->lpl_lgrpid = lid; 2032 } 2033 2034 /* 2035 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2036 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2037 * make full use of all of the lgroup topology, but this checks to make sure 2038 * that for the parts that it does use, it has correctly understood the 2039 * relationships that exist. 
This function returns 2040 * 0 if the topology is correct, and a non-zero error code, for non-debug 2041 * kernels if incorrect. Asserts are spread throughout the code to aid in 2042 * debugging on a DEBUG kernel. 2043 */ 2044 int 2045 lpl_topo_verify(cpupart_t *cpupart) 2046 { 2047 lgrp_t *lgrp; 2048 lpl_t *lpl; 2049 klgrpset_t rset; 2050 klgrpset_t cset; 2051 cpu_t *cpu; 2052 cpu_t *cp_start; 2053 int i; 2054 int j; 2055 int sum; 2056 2057 /* topology can't be incorrect if it doesn't exist */ 2058 if (!lgrp_topo_initialized || !lgrp_initialized) 2059 return (LPL_TOPO_CORRECT); 2060 2061 ASSERT(cpupart != NULL); 2062 2063 for (i = 0; i <= lgrp_alloc_max; i++) { 2064 lgrp = lgrp_table[i]; 2065 lpl = NULL; 2066 /* make sure lpls are allocated */ 2067 ASSERT(cpupart->cp_lgrploads); 2068 if (!cpupart->cp_lgrploads) 2069 return (LPL_TOPO_PART_HAS_NO_LPL); 2070 2071 lpl = &cpupart->cp_lgrploads[i]; 2072 /* make sure our index is good */ 2073 ASSERT(i < cpupart->cp_nlgrploads); 2074 2075 /* if lgroup doesn't exist, make sure lpl is empty */ 2076 if (!LGRP_EXISTS(lgrp)) { 2077 ASSERT(lpl->lpl_ncpu == 0); 2078 if (lpl->lpl_ncpu > 0) { 2079 return (LPL_TOPO_CPUS_NOT_EMPTY); 2080 } else { 2081 continue; 2082 } 2083 } 2084 2085 /* verify that lgroup and lpl are identically numbered */ 2086 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2087 2088 /* if lgroup isn't in our partition, make sure lpl is empty */ 2089 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2090 cpupart->cp_lgrpset)) { 2091 ASSERT(lpl->lpl_ncpu == 0); 2092 if (lpl->lpl_ncpu > 0) { 2093 return (LPL_TOPO_CPUS_NOT_EMPTY); 2094 } 2095 /* 2096 * lpl is empty, and lgroup isn't in partition. verify 2097 * that lpl doesn't show up in anyone else's rsets (in 2098 * this partition, anyway) 2099 */ 2100 2101 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2102 lpl_t *i_lpl; /* lpl we're iterating over */ 2103 2104 i_lpl = &cpupart->cp_lgrploads[j]; 2105 2106 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2107 if (lpl_rset_contains(i_lpl, lpl)) { 2108 return (LPL_TOPO_LPL_ORPHANED); 2109 } 2110 } 2111 /* lgroup is empty, and everything is ok. continue */ 2112 continue; 2113 } 2114 2115 2116 /* lgroup is in this partition, now check it against lpl */ 2117 2118 /* do both have matching lgrps? */ 2119 ASSERT(lgrp == lpl->lpl_lgrp); 2120 if (lgrp != lpl->lpl_lgrp) { 2121 return (LPL_TOPO_LGRP_MISMATCH); 2122 } 2123 2124 /* do the parent lgroups exist and do they match? */ 2125 if (lgrp->lgrp_parent) { 2126 ASSERT(lpl->lpl_parent); 2127 ASSERT(lgrp->lgrp_parent->lgrp_id == 2128 lpl->lpl_parent->lpl_lgrpid); 2129 2130 if (!lpl->lpl_parent) { 2131 return (LPL_TOPO_MISSING_PARENT); 2132 } else if (lgrp->lgrp_parent->lgrp_id != 2133 lpl->lpl_parent->lpl_lgrpid) { 2134 return (LPL_TOPO_PARENT_MISMATCH); 2135 } 2136 } 2137 2138 /* only leaf lgroups keep a cpucnt, only check leaves */ 2139 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2140 2141 /* verify that lgrp is also a leaf */ 2142 ASSERT((lgrp->lgrp_childcnt == 0) && 2143 (klgrpset_ismember(lgrp->lgrp_leaves, 2144 lpl->lpl_lgrpid))); 2145 2146 if ((lgrp->lgrp_childcnt > 0) || 2147 (!klgrpset_ismember(lgrp->lgrp_leaves, 2148 lpl->lpl_lgrpid))) { 2149 return (LPL_TOPO_LGRP_NOT_LEAF); 2150 } 2151 2152 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2153 (lpl->lpl_ncpu > 0)); 2154 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2155 (lpl->lpl_ncpu <= 0)) { 2156 return (LPL_TOPO_BAD_CPUCNT); 2157 } 2158 2159 /* 2160 * Check that lpl_ncpu also matches the number of 2161 * cpus in the lpl's linked list. 
This only exists in 2162 * leaves, but they should always match. 2163 */ 2164 j = 0; 2165 cpu = cp_start = lpl->lpl_cpus; 2166 while (cpu != NULL) { 2167 j++; 2168 2169 /* check to make sure cpu's lpl is leaf lpl */ 2170 ASSERT(cpu->cpu_lpl == lpl); 2171 if (cpu->cpu_lpl != lpl) { 2172 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2173 } 2174 2175 /* check next cpu */ 2176 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2177 continue; 2178 } else { 2179 cpu = NULL; 2180 } 2181 } 2182 2183 ASSERT(j == lpl->lpl_ncpu); 2184 if (j != lpl->lpl_ncpu) { 2185 return (LPL_TOPO_LPL_BAD_NCPU); 2186 } 2187 2188 /* 2189 * Also, check that leaf lpl is contained in all 2190 * intermediate lpls that name the leaf as a descendant 2191 */ 2192 2193 for (j = 0; j <= lgrp_alloc_max; j++) { 2194 klgrpset_t intersect; 2195 lgrp_t *lgrp_cand; 2196 lpl_t *lpl_cand; 2197 2198 lgrp_cand = lgrp_table[j]; 2199 intersect = klgrpset_intersects( 2200 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2201 cpupart->cp_lgrpset); 2202 2203 if (!LGRP_EXISTS(lgrp_cand) || 2204 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2205 cpupart->cp_lgrpset) || 2206 (intersect == 0)) 2207 continue; 2208 2209 lpl_cand = 2210 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2211 2212 if (klgrpset_ismember(intersect, 2213 lgrp->lgrp_id)) { 2214 ASSERT(lpl_rset_contains(lpl_cand, 2215 lpl)); 2216 2217 if (!lpl_rset_contains(lpl_cand, lpl)) { 2218 return (LPL_TOPO_RSET_MSSNG_LF); 2219 } 2220 } 2221 } 2222 2223 } else { /* non-leaf specific checks */ 2224 2225 /* 2226 * Non-leaf lpls should have lpl_cpus == NULL 2227 * verify that this is so 2228 */ 2229 ASSERT(lpl->lpl_cpus == NULL); 2230 if (lpl->lpl_cpus != NULL) { 2231 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2232 } 2233 2234 /* 2235 * verify that the sum of the cpus in the leaf resources 2236 * is equal to the total ncpu in the intermediate 2237 */ 2238 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2239 sum += lpl->lpl_rset[j]->lpl_ncpu; 2240 } 2241 2242 ASSERT(sum == lpl->lpl_ncpu); 2243 if (sum != lpl->lpl_ncpu) { 2244 return (LPL_TOPO_LPL_BAD_NCPU); 2245 } 2246 } 2247 2248 /* 2249 * check on lpl_hint. Don't check root, since it has no parent. 2250 */ 2251 if (lpl->lpl_parent != NULL) { 2252 int hint; 2253 lpl_t *hint_lpl; 2254 2255 /* make sure hint is within limits of nrset */ 2256 hint = lpl->lpl_hint; 2257 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2258 if (lpl->lpl_parent->lpl_nrset < hint) { 2259 return (LPL_TOPO_BOGUS_HINT); 2260 } 2261 2262 /* make sure hint points to valid lpl */ 2263 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2264 ASSERT(hint_lpl->lpl_ncpu > 0); 2265 if (hint_lpl->lpl_ncpu <= 0) { 2266 return (LPL_TOPO_BOGUS_HINT); 2267 } 2268 } 2269 2270 /* 2271 * Check the rset of the lpl in question. Make sure that each 2272 * rset contains a subset of the resources in 2273 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2274 * sure that each rset doesn't include resources that are 2275 * outside of that set. (Which would be resources somehow not 2276 * accounted for). 
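 *
 * Worked example (illustrative numbers): if the rset below collects
 * lgroup IDs {1, 3} while lgrp_set[LGRP_RSRC_CPU] is {1, 3, 5} and
 * cp_lgrpset is {1, 3}, both klgrpset_diff() results are empty and the
 * check passes; an rset collecting {1, 4} would leave 4 behind in each
 * difference and the function would return LPL_TOPO_RSET_MISMATCH.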
2277 */ 2278 2279 klgrpset_clear(rset); 2280 for (j = 0; j < lpl->lpl_nrset; j++) { 2281 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2282 } 2283 klgrpset_copy(cset, rset); 2284 /* make sure lpl rset matches lgrp rset */ 2285 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2286 /* make sure rset is contained with in partition, too */ 2287 klgrpset_diff(cset, cpupart->cp_lgrpset); 2288 2289 ASSERT(klgrpset_isempty(rset) && 2290 klgrpset_isempty(cset)); 2291 if (!klgrpset_isempty(rset) || 2292 !klgrpset_isempty(cset)) { 2293 return (LPL_TOPO_RSET_MISMATCH); 2294 } 2295 2296 /* 2297 * check to make sure lpl_nrset matches the number of rsets 2298 * contained in the lpl 2299 */ 2300 2301 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2302 j++); 2303 2304 ASSERT(j == lpl->lpl_nrset); 2305 if (j != lpl->lpl_nrset) { 2306 return (LPL_TOPO_BAD_RSETCNT); 2307 } 2308 2309 } 2310 return (LPL_TOPO_CORRECT); 2311 } 2312 2313 /* 2314 * Flatten lpl topology to given number of levels. This is presently only 2315 * implemented for a flatten to 2 levels, which will prune out the intermediates 2316 * and home the leaf lpls to the root lpl. 2317 */ 2318 int 2319 lpl_topo_flatten(int levels) 2320 { 2321 int i; 2322 uint_t sum; 2323 lgrp_t *lgrp_cur; 2324 lpl_t *lpl_cur; 2325 lpl_t *lpl_root; 2326 cpupart_t *cp; 2327 2328 if (levels != 2) 2329 return (0); 2330 2331 /* called w/ cpus paused - grab no locks! */ 2332 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2333 !lgrp_initialized); 2334 2335 cp = cp_list_head; 2336 do { 2337 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2338 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2339 2340 for (i = 0; i <= lgrp_alloc_max; i++) { 2341 lgrp_cur = lgrp_table[i]; 2342 lpl_cur = &cp->cp_lgrploads[i]; 2343 2344 if ((lgrp_cur == lgrp_root) || 2345 (!LGRP_EXISTS(lgrp_cur) && 2346 (lpl_cur->lpl_ncpu == 0))) 2347 continue; 2348 2349 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2350 /* 2351 * this should be a deleted intermediate, so 2352 * clear it 2353 */ 2354 lpl_clear(lpl_cur); 2355 } else if ((lpl_cur->lpl_nrset == 1) && 2356 (lpl_cur->lpl_rset[0] == lpl_cur) && 2357 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2358 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2359 /* 2360 * this is a leaf whose parent was deleted, or 2361 * whose parent had their lgrp deleted. (And 2362 * whose parent will soon be deleted). Point 2363 * this guy back to the root lpl. 2364 */ 2365 lpl_cur->lpl_parent = lpl_root; 2366 lpl_rset_add(lpl_root, lpl_cur); 2367 } 2368 2369 } 2370 2371 /* 2372 * Now that we're done, make sure the count on the root lpl is 2373 * correct, and update the hints of the children for the sake of 2374 * thoroughness 2375 */ 2376 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2377 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2378 } 2379 lpl_root->lpl_ncpu = sum; 2380 lpl_child_update(lpl_root, cp); 2381 2382 cp = cp->cp_next; 2383 } while (cp != cp_list_head); 2384 2385 return (levels); 2386 } 2387 2388 /* 2389 * Insert a lpl into the resource hierarchy and create any additional lpls that 2390 * are necessary to represent the varying states of locality for the cpu 2391 * resoruces newly added to the partition. 2392 * 2393 * This routine is clever enough that it can correctly add resources from the 2394 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2395 * those for which the lpl is a leaf as opposed to simply a named equally local 2396 * resource). 
The one special case that needs additional processing is when a 2397 * new intermediate lpl is introduced. Since the main loop only traverses 2398 * looking to add the leaf resource where it does not yet exist, additional work 2399 * is necessary to add other leaf resources that may need to exist in the newly 2400 * created intermediate. This is performed by the second inner loop, and is 2401 * only done when the check for more than one overlapping resource succeeds. 2402 */ 2403 2404 void 2405 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2406 { 2407 int i; 2408 int j; 2409 int hint; 2410 int rset_num_intersect; 2411 lgrp_t *lgrp_cur; 2412 lpl_t *lpl_cur; 2413 lpl_t *lpl_parent; 2414 lgrp_id_t parent_id; 2415 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2416 2417 for (i = 0; i <= lgrp_alloc_max; i++) { 2418 lgrp_cur = lgrp_table[i]; 2419 2420 /* 2421 * Don't insert if the lgrp isn't there, if the leaf isn't 2422 * contained within the current lgrp, or if the current lgrp has 2423 * no leaves in this partition 2424 */ 2425 2426 if (!LGRP_EXISTS(lgrp_cur) || 2427 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2428 lpl_leaf->lpl_lgrpid) || 2429 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2430 cpupart->cp_lgrpset)) 2431 continue; 2432 2433 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2434 if (lgrp_cur->lgrp_parent != NULL) { 2435 /* if lgrp has a parent, assign it properly */ 2436 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2437 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2438 } else { 2439 /* if not, make sure parent ptr gets set to null */ 2440 lpl_parent = NULL; 2441 } 2442 2443 if (lpl_cur == lpl_leaf) { 2444 /* 2445 * Almost all leaf state was initialized elsewhere. The 2446 * only thing left to do is to set the parent. 2447 */ 2448 lpl_cur->lpl_parent = lpl_parent; 2449 continue; 2450 } 2451 2452 /* 2453 * Initialize intermediate lpl 2454 * Save this lpl's hint though. Since we're changing this 2455 * lpl's resources, we need to update the hint in this lpl's 2456 * children, but the hint in this lpl is unaffected and 2457 * should be preserved. 2458 */ 2459 hint = lpl_cur->lpl_hint; 2460 2461 lpl_clear(lpl_cur); 2462 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2463 2464 lpl_cur->lpl_hint = hint; 2465 lpl_cur->lpl_parent = lpl_parent; 2466 2467 /* does new lpl need to be populated with other resources? */ 2468 rset_intersect = 2469 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2470 cpupart->cp_lgrpset); 2471 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2472 2473 if (rset_num_intersect > 1) { 2474 /* 2475 * If so, figure out what lpls have resources that 2476 * intersect this one, and add them. 2477 */ 2478 for (j = 0; j <= lgrp_alloc_max; j++) { 2479 lgrp_t *lgrp_cand; /* candidate lgrp */ 2480 lpl_t *lpl_cand; /* candidate lpl */ 2481 2482 lgrp_cand = lgrp_table[j]; 2483 if (!LGRP_EXISTS(lgrp_cand) || 2484 !klgrpset_ismember(rset_intersect, 2485 lgrp_cand->lgrp_id)) 2486 continue; 2487 lpl_cand = 2488 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2489 lpl_rset_add(lpl_cur, lpl_cand); 2490 } 2491 } 2492 /* 2493 * This lpl's rset has changed. Update the hint in it's 2494 * children. 2495 */ 2496 lpl_child_update(lpl_cur, cpupart); 2497 } 2498 } 2499 2500 /* 2501 * remove a lpl from the hierarchy of resources, clearing its state when 2502 * finished. If the lpls at the intermediate levels of the hierarchy have no 2503 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2504 * delete them as well. 
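 *
 * For example (illustrative topology): with root R, intermediate I and
 * leaves A and B, where I spans both leaves, removing leaf A deletes A
 * from the rsets of I and R. If A was the last of I's leaves still in
 * this cpu-partition, I's lpl is cleared outright; B's lpl, whose lgrp
 * does not span A, is never touched, and A itself is cleared at the end.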
2505 */ 2506 2507 void 2508 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2509 { 2510 int i; 2511 lgrp_t *lgrp_cur; 2512 lpl_t *lpl_cur; 2513 klgrpset_t leaf_intersect; /* intersection of leaves */ 2514 2515 for (i = 0; i <= lgrp_alloc_max; i++) { 2516 lgrp_cur = lgrp_table[i]; 2517 2518 /* 2519 * Don't attempt to remove from lgrps that aren't there, that 2520 * don't contain our leaf, or from the leaf itself. (We do that 2521 * later) 2522 */ 2523 2524 if (!LGRP_EXISTS(lgrp_cur)) 2525 continue; 2526 2527 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2528 2529 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2530 lpl_leaf->lpl_lgrpid) || 2531 (lpl_cur == lpl_leaf)) { 2532 continue; 2533 } 2534 2535 /* 2536 * This is a slightly sleazy simplification in that we have 2537 * already marked the cp_lgrpset as no longer containing the 2538 * leaf we've deleted. Any lpls that pass the above checks 2539 * based upon lgrp membership but not necessarily cpu-part 2540 * membership also get cleared by the checks below. Currently 2541 * this is harmless, as the lpls should be empty anyway. 2542 * 2543 * In particular, we want to preserve lpls that have additional 2544 * leaf resources, even though we don't yet have a processor 2545 * architecture that represents resources this way. 2546 */ 2547 2548 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2549 cpupart->cp_lgrpset); 2550 2551 lpl_rset_del(lpl_cur, lpl_leaf); 2552 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2553 lpl_clear(lpl_cur); 2554 } else { 2555 /* 2556 * Update this lpl's children 2557 */ 2558 lpl_child_update(lpl_cur, cpupart); 2559 } 2560 } 2561 lpl_clear(lpl_leaf); 2562 } 2563 2564 /* 2565 * add a cpu to a partition in terms of lgrp load avg bookeeping 2566 * 2567 * The lpl (cpu partition load average information) is now arranged in a 2568 * hierarchical fashion whereby resources that are closest, ie. most local, to 2569 * the cpu in question are considered to be leaves in a tree of resources. 2570 * There are two general cases for cpu additon: 2571 * 2572 * 1. A lpl structure that contains resources already in the hierarchy tree. 2573 * In this case, all of the associated lpl relationships have been defined, and 2574 * all that is necessary is that we link the new cpu into the per-lpl list of 2575 * cpus, and increment the ncpu count of all places where this cpu resource will 2576 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2577 * pushing is accomplished by this routine. 2578 * 2579 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2580 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2581 * construct the hierarchy of state necessary to name it's more distant 2582 * resources, if they should exist. The leaf structure is initialized by this 2583 * routine, as is the cpu-partition state for the lgrp membership. This routine 2584 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2585 * and builds all of the "ancestoral" state necessary to identify resources at 2586 * differing levels of locality. 
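 *
 * Concretely (illustrative): the first cpu of lgroup 3 added to a
 * partition takes case 2: lpl_init() builds the leaf, lgroup 3's bit is
 * set in cp_lgrpset, and lpl_leaf_insert() wires the new leaf into the
 * rsets of its ancestors. A second cpu of lgroup 3 takes case 1: only
 * lpl_cpu_adjcnt(LPL_INCREMENT, cp) and the link into the circular
 * lpl_cpus list are needed.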
2587 */ 2588 void 2589 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2590 { 2591 cpupart_t *cpupart; 2592 lgrp_t *lgrp_leaf; 2593 lpl_t *lpl_leaf; 2594 2595 /* called sometimes w/ cpus paused - grab no locks */ 2596 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2597 2598 cpupart = cp->cpu_part; 2599 lgrp_leaf = lgrp_table[lgrpid]; 2600 2601 /* don't add non-existent lgrp */ 2602 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2603 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2604 cp->cpu_lpl = lpl_leaf; 2605 2606 /* only leaf lpls contain cpus */ 2607 2608 if (lpl_leaf->lpl_ncpu++ == 0) { 2609 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2610 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2611 lpl_leaf_insert(lpl_leaf, cpupart); 2612 } else { 2613 /* 2614 * the lpl should already exist in the parent, so just update 2615 * the count of available CPUs 2616 */ 2617 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2618 } 2619 2620 /* link cpu into list of cpus in lpl */ 2621 2622 if (lpl_leaf->lpl_cpus) { 2623 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2624 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2625 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2626 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2627 } else { 2628 /* 2629 * We increment ncpu immediately after we create a new leaf 2630 * lpl, so assert that ncpu == 1 for the case where we don't 2631 * have any cpu pointers yet. 2632 */ 2633 ASSERT(lpl_leaf->lpl_ncpu == 1); 2634 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2635 } 2636 2637 } 2638 2639 2640 /* 2641 * remove a cpu from a partition in terms of lgrp load avg bookeeping 2642 * 2643 * The lpl (cpu partition load average information) is now arranged in a 2644 * hierarchical fashion whereby resources that are closest, ie. most local, to 2645 * the cpu in question are considered to be leaves in a tree of resources. 2646 * There are two removal cases in question: 2647 * 2648 * 1. Removal of the resource in the leaf leaves other resources remaining in 2649 * that leaf. (Another cpu still exists at this level of locality). In this 2650 * case, the count of available cpus is decremented in all assocated lpls by 2651 * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned 2652 * from the per-cpu lpl list. 2653 * 2654 * 2. Removal of the resource results in the lpl containing no resources. (It's 2655 * empty) In this case, all of what has occurred for the first step must take 2656 * place; however, additionally we must remove the lpl structure itself, prune 2657 * out any stranded lpls that do not directly name a leaf resource, and mark the 2658 * cpu partition in question as no longer containing resources from the lgrp of 2659 * the lpl that has been delted. Cpu-partition changes are handled by this 2660 * method, but the lpl_leaf_remove function deals with the details of pruning 2661 * out the empty lpl and any of its orphaned direct ancestors. 
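 *
 * Concretely (illustrative): with two cpus homed to lgroup 3 in the
 * partition, deleting one is case 1: the cpu is unlinked from the
 * circular lpl_cpus list and lpl_cpu_adjcnt(LPL_DECREMENT, cp) drops
 * the counts in the ancestor lpls. Deleting the last one is case 2:
 * lgroup 3's bit is cleared from cp_lgrpset and lpl_leaf_remove()
 * prunes the leaf along with any intermediates left naming nothing.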
2662 */ 2663 void 2664 lgrp_part_del_cpu(cpu_t *cp) 2665 { 2666 lpl_t *lpl; 2667 lpl_t *leaf_lpl; 2668 lgrp_t *lgrp_leaf; 2669 2670 /* called sometimes w/ cpus paused - grab no locks */ 2671 2672 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2673 2674 lpl = leaf_lpl = cp->cpu_lpl; 2675 lgrp_leaf = leaf_lpl->lpl_lgrp; 2676 2677 /* don't delete a leaf that isn't there */ 2678 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2679 2680 /* no double-deletes */ 2681 ASSERT(lpl->lpl_ncpu); 2682 if (--lpl->lpl_ncpu == 0) { 2683 /* 2684 * This was the last cpu in this lgroup for this partition, 2685 * clear its bit in the partition's lgroup bitmask 2686 */ 2687 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2688 2689 /* eliminate remaning lpl link pointers in cpu, lpl */ 2690 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2691 2692 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2693 } else { 2694 2695 /* unlink cpu from lists of cpus in lpl */ 2696 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2697 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2698 if (lpl->lpl_cpus == cp) { 2699 lpl->lpl_cpus = cp->cpu_next_lpl; 2700 } 2701 2702 /* 2703 * Update the cpu count in the lpls associated with parent 2704 * lgroups. 2705 */ 2706 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2707 2708 } 2709 /* clear cpu's lpl ptr when we're all done */ 2710 cp->cpu_lpl = NULL; 2711 } 2712 2713 /* 2714 * Recompute load average for the specified partition/lgrp fragment. 2715 * 2716 * We rely on the fact that this routine is called from the clock thread 2717 * at a point before the clock thread can block (i.e. before its first 2718 * lock request). Since the clock thread can not be preempted (since it 2719 * runs at highest priority), we know that cpu partitions can not change 2720 * (since doing so would require either the repartition requester or the 2721 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2722 * without grabbing cpu_lock. 2723 */ 2724 void 2725 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2726 { 2727 uint_t ncpu; 2728 int64_t old, new, f; 2729 2730 /* 2731 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2732 */ 2733 static short expval[] = { 2734 0, 3196, 1618, 1083, 2735 814, 652, 543, 466, 2736 408, 363, 326, 297, 2737 272, 251, 233, 218, 2738 204, 192, 181, 172, 2739 163, 155, 148, 142, 2740 136, 130, 125, 121, 2741 116, 112, 109, 105 2742 }; 2743 2744 /* ASSERT (called from clock level) */ 2745 2746 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2747 ((ncpu = lpl->lpl_ncpu) == 0)) { 2748 return; 2749 } 2750 2751 for (;;) { 2752 2753 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2754 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2755 else 2756 f = expval[ncpu]; 2757 2758 /* 2759 * Modify the load average atomically to avoid losing 2760 * anticipatory load updates (see lgrp_move_thread()). 2761 */ 2762 if (ageflag) { 2763 /* 2764 * We're supposed to both update and age the load. 2765 * This happens 10 times/sec. per cpu. We do a 2766 * little hoop-jumping to avoid integer overflow. 
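 *
 * Ignoring fixed-point rounding, and writing a = f / 2^16, the update
 * below amounts to
 *
 *	new = (1 - a) * old + a * (nrcpus << 9)
 *
 * i.e. an exponential decay of the old load toward a target derived
 * from nrcpus; the high and low 16-bit halves of old are scaled
 * separately (each shifted up by 7) as part of that hoop-jumping.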
2767 */ 2768 int64_t q, r; 2769 2770 do { 2771 old = new = lpl->lpl_loadavg; 2772 q = (old >> 16) << 7; 2773 r = (old & 0xffff) << 7; 2774 new += ((long long)(nrcpus - q) * f - 2775 ((r * f) >> 16)) >> 7; 2776 2777 /* 2778 * Check for overflow 2779 */ 2780 if (new > LGRP_LOADAVG_MAX) 2781 new = LGRP_LOADAVG_MAX; 2782 else if (new < 0) 2783 new = 0; 2784 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2785 new) != old); 2786 } else { 2787 /* 2788 * We're supposed to update the load, but not age it. 2789 * This option is used to update the load (which either 2790 * has already been aged in this 1/10 sec. interval or 2791 * soon will be) to account for a remotely executing 2792 * thread. 2793 */ 2794 do { 2795 old = new = lpl->lpl_loadavg; 2796 new += f; 2797 /* 2798 * Check for overflow 2799 * Underflow not possible here 2800 */ 2801 if (new < old) 2802 new = LGRP_LOADAVG_MAX; 2803 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2804 new) != old); 2805 } 2806 2807 /* 2808 * Do the same for this lpl's parent 2809 */ 2810 if ((lpl = lpl->lpl_parent) == NULL) 2811 break; 2812 ncpu = lpl->lpl_ncpu; 2813 } 2814 } 2815 2816 /* 2817 * Initialize lpl topology in the target based on topology currently present in 2818 * lpl_bootstrap. 2819 * 2820 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2821 * initialize cp_default list of lpls. Up to this point all topology operations 2822 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2823 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2824 * `target' points to the list of lpls in cp_default and `size' is the size of 2825 * this list. 2826 * 2827 * This function walks the lpl topology in lpl_bootstrap and does for things: 2828 * 2829 * 1) Copies all fields from lpl_bootstrap to the target. 2830 * 2831 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2832 * 2833 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2834 * instead of lpl_bootstrap. 2835 * 2836 * 4) Updates pointers in the resource list of the target to point to the lpls 2837 * in the target list instead of lpl_bootstrap. 2838 * 2839 * After lpl_topo_bootstrap() completes, target contains the same information 2840 * that would be present there if it were used during boot instead of 2841 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2842 * and it is bzeroed. 2843 */ 2844 void 2845 lpl_topo_bootstrap(lpl_t *target, int size) 2846 { 2847 lpl_t *lpl = lpl_bootstrap; 2848 lpl_t *target_lpl = target; 2849 int howmany; 2850 int id; 2851 int i; 2852 2853 /* 2854 * The only target that should be passed here is cp_default lpl list. 2855 */ 2856 ASSERT(target == cp_default.cp_lgrploads); 2857 ASSERT(size == cp_default.cp_nlgrploads); 2858 ASSERT(!lgrp_topo_initialized); 2859 ASSERT(ncpus == 1); 2860 2861 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2862 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2863 /* 2864 * Copy all fields from lpl. 2865 */ 2866 2867 *target_lpl = *lpl; 2868 2869 /* 2870 * Substitute CPU0 lpl pointer with one relative to target. 2871 */ 2872 if (lpl->lpl_cpus == CPU) { 2873 ASSERT(CPU->cpu_lpl == lpl); 2874 CPU->cpu_lpl = target_lpl; 2875 } 2876 2877 /* 2878 * Substitute parent information with parent relative to target. 
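 *
 * Equivalently: the parent's byte offset within lpl_bootstrap_list is
 * kept and simply rebased onto target, as in
 *
 *	off = (uintptr_t)lpl->lpl_parent - (uintptr_t)lpl_bootstrap;
 *	target_lpl->lpl_parent = (lpl_t *)((uintptr_t)target + off);
 *
 * where off is only an illustrative temporary.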
2879 */ 2880 if (lpl->lpl_parent != NULL) 2881 target_lpl->lpl_parent = (lpl_t *) 2882 (((uintptr_t)lpl->lpl_parent - 2883 (uintptr_t)lpl_bootstrap) + 2884 (uintptr_t)target); 2885 2886 /* 2887 * Walk over resource set substituting pointers relative to 2888 * lpl_bootstrap to pointers relative to target. 2889 */ 2890 ASSERT(lpl->lpl_nrset <= 1); 2891 2892 for (id = 0; id < lpl->lpl_nrset; id++) { 2893 if (lpl->lpl_rset[id] != NULL) { 2894 target_lpl->lpl_rset[id] = 2895 (lpl_t *) 2896 (((uintptr_t)lpl->lpl_rset[id] - 2897 (uintptr_t)lpl_bootstrap) + 2898 (uintptr_t)target); 2899 } 2900 } 2901 } 2902 2903 /* 2904 * Topology information in lpl_bootstrap is no longer needed. 2905 */ 2906 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2907 } 2908 2909 /* 2910 * If the lowest load among the lgroups a process' threads are currently 2911 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2912 * expanding the process to a new lgroup. 2913 */ 2914 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2915 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2916 2917 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2918 ((lgrp_expand_proc_thresh) / (ncpu)) 2919 2920 /* 2921 * A process will be expanded to a new lgroup only if the difference between 2922 * the lowest load on the lgroups the process' thread's are currently spread 2923 * across and the lowest load on the other lgroups in the process' partition 2924 * is greater than lgrp_expand_proc_diff. 2925 */ 2926 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2927 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2928 2929 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2930 ((lgrp_expand_proc_diff) / (ncpu)) 2931 2932 /* 2933 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2934 * be present due to impreciseness of the load average decay algorithm. 2935 * 2936 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2937 * tolerance is scaled by the number of cpus in the lgroup just like 2938 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2939 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2940 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2941 */ 2942 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2943 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2944 ((lgrp_loadavg_tolerance) / ncpu) 2945 2946 /* 2947 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2948 * average is above this threshold 2949 */ 2950 uint32_t lgrp_load_thresh = UINT32_MAX; 2951 2952 /* 2953 * lgrp_choose() will try to skip any lgroups with less memory 2954 * than this free when choosing a home lgroup 2955 */ 2956 pgcnt_t lgrp_mem_free_thresh = 0; 2957 2958 /* 2959 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2960 * one based on one of the following policies: 2961 * - Random selection 2962 * - Pseudo round robin placement 2963 * - Longest time since a thread was last placed 2964 */ 2965 #define LGRP_CHOOSE_RANDOM 1 2966 #define LGRP_CHOOSE_RR 2 2967 #define LGRP_CHOOSE_TIME 3 2968 2969 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2970 2971 /* 2972 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 2973 * be bound to a CPU or processor set. 2974 * 2975 * Arguments: 2976 * t The thread 2977 * cpupart The partition the thread belongs to. 
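 *
 * An illustrative call shape (hypothetical caller; the locking shown is
 * just one of the alternatives in the NOTE below) pairs this routine
 * with lgrp_move_thread():
 *
 *	thread_lock(t);
 *	lgrp_move_thread(t, lgrp_choose(t, t->t_cpupart), 1);
 *	thread_unlock(t);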
2978 * 2979 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 2980 * disabled, or thread_lock held (at splhigh) to protect against the CPU 2981 * partitions changing out from under us and assumes that given thread is 2982 * protected. Also, called sometimes w/ cpus paused or kernel preemption 2983 * disabled, so don't grab any locks because we should never block under 2984 * those conditions. 2985 */ 2986 lpl_t * 2987 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 2988 { 2989 lgrp_load_t bestload, bestrload; 2990 int lgrpid_offset, lgrp_count; 2991 lgrp_id_t lgrpid, lgrpid_start; 2992 lpl_t *lpl, *bestlpl, *bestrlpl; 2993 klgrpset_t lgrpset; 2994 proc_t *p; 2995 2996 ASSERT(t != NULL); 2997 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2998 THREAD_LOCK_HELD(t)); 2999 ASSERT(cpupart != NULL); 3000 3001 p = t->t_procp; 3002 3003 /* A process should always be in an active partition */ 3004 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3005 3006 bestlpl = bestrlpl = NULL; 3007 bestload = bestrload = LGRP_LOADAVG_MAX; 3008 lgrpset = cpupart->cp_lgrpset; 3009 3010 switch (lgrp_choose_policy) { 3011 case LGRP_CHOOSE_RR: 3012 lgrpid = cpupart->cp_lgrp_hint; 3013 do { 3014 if (++lgrpid > lgrp_alloc_max) 3015 lgrpid = 0; 3016 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3017 3018 break; 3019 default: 3020 case LGRP_CHOOSE_TIME: 3021 case LGRP_CHOOSE_RANDOM: 3022 klgrpset_nlgrps(lgrpset, lgrp_count); 3023 lgrpid_offset = 3024 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3025 for (lgrpid = 0; ; lgrpid++) { 3026 if (klgrpset_ismember(lgrpset, lgrpid)) { 3027 if (--lgrpid_offset == 0) 3028 break; 3029 } 3030 } 3031 break; 3032 } 3033 3034 lgrpid_start = lgrpid; 3035 3036 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3037 lgrp_id_t, cpupart->cp_lgrp_hint); 3038 3039 /* 3040 * Use lgroup affinities (if any) to choose best lgroup 3041 * 3042 * NOTE: Assumes that thread is protected from going away and its 3043 * lgroup affinities won't change (ie. p_lock, or 3044 * thread_lock() being held and/or CPUs paused) 3045 */ 3046 if (t->t_lgrp_affinity) { 3047 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 3048 if (lpl != NULL) 3049 return (lpl); 3050 } 3051 3052 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3053 3054 do { 3055 pgcnt_t npgs; 3056 3057 /* 3058 * Skip any lgroups outside of thread's pset 3059 */ 3060 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3061 if (++lgrpid > lgrp_alloc_max) 3062 lgrpid = 0; /* wrap the search */ 3063 continue; 3064 } 3065 3066 /* 3067 * Skip any non-leaf lgroups 3068 */ 3069 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3070 continue; 3071 3072 /* 3073 * Skip any lgroups without enough free memory 3074 * (when threshold set to nonzero positive value) 3075 */ 3076 if (lgrp_mem_free_thresh > 0) { 3077 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3078 if (npgs < lgrp_mem_free_thresh) { 3079 if (++lgrpid > lgrp_alloc_max) 3080 lgrpid = 0; /* wrap the search */ 3081 continue; 3082 } 3083 } 3084 3085 lpl = &cpupart->cp_lgrploads[lgrpid]; 3086 if (klgrpset_isempty(p->p_lgrpset) || 3087 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3088 /* 3089 * Either this is a new process or the process already 3090 * has threads on this lgrp, so this is a preferred 3091 * lgroup for the thread. 
3092 */ 3093 if (bestlpl == NULL || 3094 lpl_pick(lpl, bestlpl)) { 3095 bestload = lpl->lpl_loadavg; 3096 bestlpl = lpl; 3097 } 3098 } else { 3099 /* 3100 * The process doesn't have any threads on this lgrp, 3101 * but we're willing to consider this lgrp if the load 3102 * difference is big enough to justify splitting up 3103 * the process' threads. 3104 */ 3105 if (bestrlpl == NULL || 3106 lpl_pick(lpl, bestrlpl)) { 3107 bestrload = lpl->lpl_loadavg; 3108 bestrlpl = lpl; 3109 } 3110 } 3111 if (++lgrpid > lgrp_alloc_max) 3112 lgrpid = 0; /* wrap the search */ 3113 } while (lgrpid != lgrpid_start); 3114 3115 /* 3116 * Return root lgroup if threshold isn't set to maximum value and 3117 * lowest lgroup load average more than a certain threshold 3118 */ 3119 if (lgrp_load_thresh != UINT32_MAX && 3120 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3121 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3122 3123 /* 3124 * If all the lgroups over which the thread's process is spread are 3125 * heavily loaded, or otherwise undesirable, we'll consider placing 3126 * the thread on one of the other leaf lgroups in the thread's 3127 * partition. 3128 */ 3129 if ((bestlpl == NULL) || 3130 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3131 (bestrload < bestload) && /* paranoid about wraparound */ 3132 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3133 bestload))) { 3134 bestlpl = bestrlpl; 3135 } 3136 3137 if (bestlpl == NULL) { 3138 /* 3139 * No lgroup looked particularly good, but we still 3140 * have to pick something. Go with the randomly selected 3141 * legal lgroup we started with above. 3142 */ 3143 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3144 } 3145 3146 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3147 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3148 3149 ASSERT(bestlpl->lpl_ncpu > 0); 3150 return (bestlpl); 3151 } 3152 3153 /* 3154 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3155 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3156 */ 3157 static int 3158 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3159 { 3160 lgrp_load_t l1, l2; 3161 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3162 3163 l1 = lpl1->lpl_loadavg; 3164 l2 = lpl2->lpl_loadavg; 3165 3166 if ((l1 + tolerance < l2) && (l1 < l2)) { 3167 /* lpl1 is significantly less loaded than lpl2 */ 3168 return (1); 3169 } 3170 3171 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3172 l1 + tolerance >= l2 && l1 < l2 && 3173 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3174 /* 3175 * lpl1's load is within the tolerance of lpl2. We're 3176 * willing to consider it be to better however if 3177 * it has been longer since we last homed a thread there 3178 */ 3179 return (1); 3180 } 3181 3182 return (0); 3183 } 3184 3185 /* 3186 * An LWP is expected to be assigned to an lgroup for at least this long 3187 * for its anticipatory load to be justified. NOTE that this value should 3188 * not be set extremely huge (say, larger than 100 years), to avoid problems 3189 * with overflow in the calculation that uses it. 3190 */ 3191 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3192 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3193 3194 /* 3195 * Routine to change a thread's lgroup affiliation. This routine updates 3196 * the thread's kthread_t struct and its process' proc_t struct to note the 3197 * thread's new lgroup affiliation, and its lgroup affinities. 
3198 * 3199 * Note that this is the only routine that modifies a thread's t_lpl field, 3200 * and that adds in or removes anticipatory load. 3201 * 3202 * If the thread is exiting, newlpl is NULL. 3203 * 3204 * Locking: 3205 * The following lock must be held on entry: 3206 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3207 * doesn't get removed from t's partition 3208 * 3209 * This routine is not allowed to grab any locks, since it may be called 3210 * with cpus paused (such as from cpu_offline). 3211 */ 3212 void 3213 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3214 { 3215 proc_t *p; 3216 lpl_t *lpl, *oldlpl; 3217 lgrp_id_t oldid; 3218 kthread_t *tp; 3219 uint_t ncpu; 3220 lgrp_load_t old, new; 3221 3222 ASSERT(t); 3223 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3224 THREAD_LOCK_HELD(t)); 3225 3226 /* 3227 * If not changing lpls, just return 3228 */ 3229 if ((oldlpl = t->t_lpl) == newlpl) 3230 return; 3231 3232 /* 3233 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3234 * associated with process 0 rather than with its original process). 3235 */ 3236 if (t->t_proc_flag & TP_LWPEXIT) { 3237 if (newlpl != NULL) { 3238 t->t_lpl = newlpl; 3239 } 3240 return; 3241 } 3242 3243 p = ttoproc(t); 3244 3245 /* 3246 * If the thread had a previous lgroup, update its process' p_lgrpset 3247 * to account for it being moved from its old lgroup. 3248 */ 3249 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3250 (p->p_tlist != NULL)) { 3251 oldid = oldlpl->lpl_lgrpid; 3252 3253 if (newlpl != NULL) 3254 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3255 3256 if ((do_lgrpset_delete) && 3257 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3258 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3259 /* 3260 * Check if a thread other than the thread 3261 * that's moving is assigned to the same 3262 * lgroup as the thread that's moving. Note 3263 * that we have to compare lgroup IDs, rather 3264 * than simply comparing t_lpl's, since the 3265 * threads may belong to different partitions 3266 * but be assigned to the same lgroup. 3267 */ 3268 ASSERT(tp->t_lpl != NULL); 3269 3270 if ((tp != t) && 3271 (tp->t_lpl->lpl_lgrpid == oldid)) { 3272 /* 3273 * Another thread is assigned to the 3274 * same lgroup as the thread that's 3275 * moving, p_lgrpset doesn't change. 3276 */ 3277 break; 3278 } else if (tp == p->p_tlist) { 3279 /* 3280 * No other thread is assigned to the 3281 * same lgroup as the exiting thread, 3282 * clear the lgroup's bit in p_lgrpset. 3283 */ 3284 klgrpset_del(p->p_lgrpset, oldid); 3285 break; 3286 } 3287 } 3288 } 3289 3290 /* 3291 * If this thread was assigned to its old lgroup for such a 3292 * short amount of time that the anticipatory load that was 3293 * added on its behalf has aged very little, remove that 3294 * anticipatory load. 3295 */ 3296 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3297 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3298 lpl = oldlpl; 3299 for (;;) { 3300 do { 3301 old = new = lpl->lpl_loadavg; 3302 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3303 if (new > old) { 3304 /* 3305 * this can happen if the load 3306 * average was aged since we 3307 * added in the anticipatory 3308 * load 3309 */ 3310 new = 0; 3311 } 3312 } while (cas32( 3313 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3314 new) != old); 3315 3316 lpl = lpl->lpl_parent; 3317 if (lpl == NULL) 3318 break; 3319 3320 ncpu = lpl->lpl_ncpu; 3321 ASSERT(ncpu > 0); 3322 } 3323 } 3324 } 3325 /* 3326 * If the thread has a new lgroup (i.e. 
it's not exiting), update its 3327 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3328 * to its new lgroup to account for its move to its new lgroup. 3329 */ 3330 if (newlpl != NULL) { 3331 /* 3332 * This thread is moving to a new lgroup 3333 */ 3334 t->t_lpl = newlpl; 3335 3336 /* 3337 * Reflect move in load average of new lgroup 3338 * unless it is root lgroup 3339 */ 3340 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3341 return; 3342 3343 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3344 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3345 } 3346 3347 /* 3348 * It'll take some time for the load on the new lgroup 3349 * to reflect this thread's placement on it. We'd 3350 * like not, however, to have all threads between now 3351 * and then also piling on to this lgroup. To avoid 3352 * this pileup, we anticipate the load this thread 3353 * will generate on its new lgroup. The goal is to 3354 * make the lgroup's load appear as though the thread 3355 * had been there all along. We're very conservative 3356 * in calculating this anticipatory load, we assume 3357 * the worst case case (100% CPU-bound thread). This 3358 * may be modified in the future to be more accurate. 3359 */ 3360 lpl = newlpl; 3361 for (;;) { 3362 ncpu = lpl->lpl_ncpu; 3363 ASSERT(ncpu > 0); 3364 do { 3365 old = new = lpl->lpl_loadavg; 3366 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3367 /* 3368 * Check for overflow 3369 * Underflow not possible here 3370 */ 3371 if (new < old) 3372 new = UINT32_MAX; 3373 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3374 new) != old); 3375 3376 lpl = lpl->lpl_parent; 3377 if (lpl == NULL) 3378 break; 3379 } 3380 t->t_anttime = gethrtime(); 3381 } 3382 } 3383 3384 /* 3385 * Return lgroup memory allocation policy given advice from madvise(3C) 3386 */ 3387 lgrp_mem_policy_t 3388 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3389 { 3390 switch (advice) { 3391 case MADV_ACCESS_LWP: 3392 return (LGRP_MEM_POLICY_NEXT); 3393 case MADV_ACCESS_MANY: 3394 return (LGRP_MEM_POLICY_RANDOM); 3395 default: 3396 return (lgrp_mem_policy_default(size, type)); 3397 } 3398 } 3399 3400 /* 3401 * Figure out default policy 3402 */ 3403 lgrp_mem_policy_t 3404 lgrp_mem_policy_default(size_t size, int type) 3405 { 3406 cpupart_t *cp; 3407 lgrp_mem_policy_t policy; 3408 size_t pset_mem_size; 3409 3410 /* 3411 * Randomly allocate memory across lgroups for shared memory 3412 * beyond a certain threshold 3413 */ 3414 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3415 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3416 /* 3417 * Get total memory size of current thread's pset 3418 */ 3419 kpreempt_disable(); 3420 cp = curthread->t_cpupart; 3421 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3422 kpreempt_enable(); 3423 3424 /* 3425 * Choose policy to randomly allocate memory across 3426 * lgroups in pset if it will fit and is not default 3427 * partition. Otherwise, allocate memory randomly 3428 * across machine. 3429 */ 3430 if (lgrp_mem_pset_aware && size < pset_mem_size) 3431 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3432 else 3433 policy = LGRP_MEM_POLICY_RANDOM; 3434 } else 3435 /* 3436 * Apply default policy for private memory and 3437 * shared memory under the respective random 3438 * threshold. 
3439 */ 3440 policy = lgrp_mem_default_policy; 3441 3442 return (policy); 3443 } 3444 3445 /* 3446 * Get memory allocation policy for this segment 3447 */ 3448 lgrp_mem_policy_info_t * 3449 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3450 { 3451 lgrp_mem_policy_info_t *policy_info; 3452 extern struct seg_ops segspt_ops; 3453 extern struct seg_ops segspt_shmops; 3454 3455 /* 3456 * This is for binary compatibility to protect against third party 3457 * segment drivers which haven't recompiled to allow for 3458 * SEGOP_GETPOLICY() 3459 */ 3460 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3461 seg->s_ops != &segspt_shmops) 3462 return (NULL); 3463 3464 policy_info = NULL; 3465 if (seg->s_ops->getpolicy != NULL) 3466 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3467 3468 return (policy_info); 3469 } 3470 3471 /* 3472 * Set policy for allocating private memory given desired policy, policy info, 3473 * size in bytes of memory that policy is being applied. 3474 * Return 0 if policy wasn't set already and 1 if policy was set already 3475 */ 3476 int 3477 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3478 lgrp_mem_policy_info_t *policy_info, size_t size) 3479 { 3480 3481 ASSERT(policy_info != NULL); 3482 3483 if (policy == LGRP_MEM_POLICY_DEFAULT) 3484 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3485 3486 /* 3487 * Policy set already? 3488 */ 3489 if (policy == policy_info->mem_policy) 3490 return (1); 3491 3492 /* 3493 * Set policy 3494 */ 3495 policy_info->mem_policy = policy; 3496 policy_info->mem_reserved = 0; 3497 3498 return (0); 3499 } 3500 3501 3502 /* 3503 * Get shared memory allocation policy with given tree and offset 3504 */ 3505 lgrp_mem_policy_info_t * 3506 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3507 u_offset_t vn_off) 3508 { 3509 u_offset_t off; 3510 lgrp_mem_policy_info_t *policy_info; 3511 lgrp_shm_policy_seg_t *policy_seg; 3512 lgrp_shm_locality_t *shm_locality; 3513 avl_tree_t *tree; 3514 avl_index_t where; 3515 3516 /* 3517 * Get policy segment tree from anon_map or vnode and use specified 3518 * anon index or vnode offset as offset 3519 * 3520 * Assume that no lock needs to be held on anon_map or vnode, since 3521 * they should be protected by their reference count which must be 3522 * nonzero for an existing segment 3523 */ 3524 if (amp) { 3525 ASSERT(amp->refcnt != 0); 3526 shm_locality = amp->locality; 3527 if (shm_locality == NULL) 3528 return (NULL); 3529 tree = shm_locality->loc_tree; 3530 off = ptob(anon_index); 3531 } else if (vp) { 3532 shm_locality = vp->v_locality; 3533 if (shm_locality == NULL) 3534 return (NULL); 3535 ASSERT(shm_locality->loc_count != 0); 3536 tree = shm_locality->loc_tree; 3537 off = vn_off; 3538 } 3539 3540 if (tree == NULL) 3541 return (NULL); 3542 3543 /* 3544 * Lookup policy segment for offset into shared object and return 3545 * policy info 3546 */ 3547 rw_enter(&shm_locality->loc_lock, RW_READER); 3548 policy_info = NULL; 3549 policy_seg = avl_find(tree, &off, &where); 3550 if (policy_seg) 3551 policy_info = &policy_seg->shm_policy; 3552 rw_exit(&shm_locality->loc_lock); 3553 3554 return (policy_info); 3555 } 3556 3557 /* 3558 * Default memory allocation policy for kernel segmap pages 3559 */ 3560 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3561 3562 /* 3563 * Return lgroup to use for allocating memory 3564 * given the segment and address 3565 * 3566 * There isn't any mutual exclusion that exists between calls 3567 * to this routine and DR, so this 
routine and whomever calls it 3568 * should be mindful of the possibility that the lgrp returned 3569 * may be deleted. If this happens, dereferences of the lgrp 3570 * pointer will still be safe, but the resources in the lgrp will 3571 * be gone, and LGRP_EXISTS() will no longer be true. 3572 */ 3573 lgrp_t * 3574 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3575 { 3576 int i; 3577 lgrp_t *lgrp; 3578 klgrpset_t lgrpset; 3579 int lgrps_spanned; 3580 unsigned long off; 3581 lgrp_mem_policy_t policy; 3582 lgrp_mem_policy_info_t *policy_info; 3583 ushort_t random; 3584 int stat = 0; 3585 extern struct seg *segkmap; 3586 3587 /* 3588 * Just return null if the lgrp framework hasn't finished 3589 * initializing or if this is a UMA machine. 3590 */ 3591 if (nlgrps == 1 || !lgrp_initialized) 3592 return (lgrp_root); 3593 3594 /* 3595 * Get memory allocation policy for this segment 3596 */ 3597 policy = lgrp_mem_default_policy; 3598 if (seg != NULL) { 3599 if (seg->s_as == &kas) { 3600 if (seg == segkmap) 3601 policy = lgrp_segmap_default_policy; 3602 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3603 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3604 policy = LGRP_MEM_POLICY_RANDOM; 3605 } else { 3606 policy_info = lgrp_mem_policy_get(seg, vaddr); 3607 if (policy_info != NULL) 3608 policy = policy_info->mem_policy; 3609 } 3610 } 3611 lgrpset = 0; 3612 3613 /* 3614 * Initialize lgroup to home by default 3615 */ 3616 lgrp = lgrp_home_lgrp(); 3617 3618 /* 3619 * When homing threads on root lgrp, override default memory 3620 * allocation policies with root lgroup memory allocation policy 3621 */ 3622 if (lgrp == lgrp_root) 3623 policy = lgrp_mem_policy_root; 3624 3625 /* 3626 * Implement policy 3627 */ 3628 switch (policy) { 3629 case LGRP_MEM_POLICY_NEXT_CPU: 3630 3631 /* 3632 * Return lgroup of current CPU which faulted on memory 3633 * If the CPU isn't currently in an lgrp, then opt to 3634 * allocate from the root. 3635 * 3636 * Kernel preemption needs to be disabled here to prevent 3637 * the current CPU from going away before lgrp is found. 3638 */ 3639 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3640 lgrp = lgrp_root; 3641 } else { 3642 kpreempt_disable(); 3643 lgrp = lgrp_cpu_to_lgrp(CPU); 3644 kpreempt_enable(); 3645 } 3646 break; 3647 3648 case LGRP_MEM_POLICY_NEXT: 3649 case LGRP_MEM_POLICY_DEFAULT: 3650 default: 3651 3652 /* 3653 * Just return current thread's home lgroup 3654 * for default policy (next touch) 3655 * If the thread is homed to the root, 3656 * then the default policy is random across lgroups. 3657 * Fallthrough to the random case. 3658 */ 3659 if (lgrp != lgrp_root) { 3660 if (policy == LGRP_MEM_POLICY_NEXT) 3661 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3662 else 3663 lgrp_stat_add(lgrp->lgrp_id, 3664 LGRP_NUM_DEFAULT, 1); 3665 break; 3666 } 3667 /* LINTED fallthrough on case statement */ 3668 case LGRP_MEM_POLICY_RANDOM: 3669 3670 /* 3671 * Return a random leaf lgroup with memory 3672 */ 3673 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3674 /* 3675 * Count how many lgroups are spanned 3676 */ 3677 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3678 3679 /* 3680 * There may be no memnodes in the root lgroup during DR copy 3681 * rename on a system with only two boards (memnodes) 3682 * configured. In this case just return the root lgrp. 
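 *
 * Otherwise a random member is picked by offset: e.g. (illustrative
 * numbers) with lgrpset spanning {0, 2, 5}, lgrps_spanned is 3, and an
 * off of 1 makes the walk below skip lgroup 0 and settle on lgroup 2.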
3683 */ 3684 if (lgrps_spanned == 0) { 3685 lgrp = lgrp_root; 3686 break; 3687 } 3688 3689 /* 3690 * Pick a random offset within lgroups spanned 3691 * and return lgroup at that offset 3692 */ 3693 random = (ushort_t)gethrtime() >> 4; 3694 off = random % lgrps_spanned; 3695 ASSERT(off <= lgrp_alloc_max); 3696 3697 for (i = 0; i <= lgrp_alloc_max; i++) { 3698 if (!klgrpset_ismember(lgrpset, i)) 3699 continue; 3700 if (off) 3701 off--; 3702 else { 3703 lgrp = lgrp_table[i]; 3704 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3705 1); 3706 break; 3707 } 3708 } 3709 break; 3710 3711 case LGRP_MEM_POLICY_RANDOM_PROC: 3712 3713 /* 3714 * Grab copy of bitmask of lgroups spanned by 3715 * this process 3716 */ 3717 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3718 stat = LGRP_NUM_RANDOM_PROC; 3719 3720 /* LINTED fallthrough on case statement */ 3721 case LGRP_MEM_POLICY_RANDOM_PSET: 3722 3723 if (!stat) 3724 stat = LGRP_NUM_RANDOM_PSET; 3725 3726 if (klgrpset_isempty(lgrpset)) { 3727 /* 3728 * Grab copy of bitmask of lgroups spanned by 3729 * this processor set 3730 */ 3731 kpreempt_disable(); 3732 klgrpset_copy(lgrpset, 3733 curthread->t_cpupart->cp_lgrpset); 3734 kpreempt_enable(); 3735 } 3736 3737 /* 3738 * Count how many lgroups are spanned 3739 */ 3740 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3741 ASSERT(lgrps_spanned <= nlgrps); 3742 3743 /* 3744 * Probably lgrps_spanned should be always non-zero, but to be 3745 * on the safe side we return lgrp_root if it is empty. 3746 */ 3747 if (lgrps_spanned == 0) { 3748 lgrp = lgrp_root; 3749 break; 3750 } 3751 3752 /* 3753 * Pick a random offset within lgroups spanned 3754 * and return lgroup at that offset 3755 */ 3756 random = (ushort_t)gethrtime() >> 4; 3757 off = random % lgrps_spanned; 3758 ASSERT(off <= lgrp_alloc_max); 3759 3760 for (i = 0; i <= lgrp_alloc_max; i++) { 3761 if (!klgrpset_ismember(lgrpset, i)) 3762 continue; 3763 if (off) 3764 off--; 3765 else { 3766 lgrp = lgrp_table[i]; 3767 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3768 1); 3769 break; 3770 } 3771 } 3772 break; 3773 3774 case LGRP_MEM_POLICY_ROUNDROBIN: 3775 3776 /* 3777 * Use offset within segment to determine 3778 * offset from home lgroup to choose for 3779 * next lgroup to allocate memory from 3780 */ 3781 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3782 (lgrp_alloc_max + 1); 3783 3784 kpreempt_disable(); 3785 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3786 i = lgrp->lgrp_id; 3787 kpreempt_enable(); 3788 3789 while (off > 0) { 3790 i = (i + 1) % (lgrp_alloc_max + 1); 3791 lgrp = lgrp_table[i]; 3792 if (klgrpset_ismember(lgrpset, i)) 3793 off--; 3794 } 3795 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3796 3797 break; 3798 } 3799 3800 ASSERT(lgrp != NULL); 3801 return (lgrp); 3802 } 3803 3804 /* 3805 * Return the number of pages in an lgroup 3806 * 3807 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3808 * could cause tests that rely on the numat driver to fail.... 
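 *
 * An illustrative use, mirroring the free-memory filter in lgrp_choose()
 * above when lgrp_mem_free_thresh is nonzero:
 *
 *	if (lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE) < lgrp_mem_free_thresh)
 *		continue;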
3809 */ 3810 pgcnt_t 3811 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3812 { 3813 lgrp_t *lgrp; 3814 3815 lgrp = lgrp_table[lgrpid]; 3816 if (!LGRP_EXISTS(lgrp) || 3817 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3818 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3819 return (0); 3820 3821 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3822 } 3823 3824 /* 3825 * Initialize lgroup shared memory allocation policy support 3826 */ 3827 void 3828 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3829 { 3830 lgrp_shm_locality_t *shm_locality; 3831 3832 /* 3833 * Initialize locality field in anon_map 3834 * Don't need any locks because this is called when anon_map is 3835 * allocated, but not used anywhere yet. 3836 */ 3837 if (amp) { 3838 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3839 if (amp->locality == NULL) { 3840 /* 3841 * Allocate and initialize shared memory locality info 3842 * and set anon_map locality pointer to it 3843 * Drop lock across kmem_alloc(KM_SLEEP) 3844 */ 3845 ANON_LOCK_EXIT(&->a_rwlock); 3846 shm_locality = kmem_alloc(sizeof (*shm_locality), 3847 KM_SLEEP); 3848 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3849 NULL); 3850 shm_locality->loc_count = 1; /* not used for amp */ 3851 shm_locality->loc_tree = NULL; 3852 3853 /* 3854 * Reacquire lock and check to see whether anyone beat 3855 * us to initializing the locality info 3856 */ 3857 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3858 if (amp->locality != NULL) { 3859 rw_destroy(&shm_locality->loc_lock); 3860 kmem_free(shm_locality, 3861 sizeof (*shm_locality)); 3862 } else 3863 amp->locality = shm_locality; 3864 } 3865 ANON_LOCK_EXIT(&->a_rwlock); 3866 return; 3867 } 3868 3869 /* 3870 * Allocate shared vnode policy info if vnode is not locality aware yet 3871 */ 3872 mutex_enter(&vp->v_lock); 3873 if ((vp->v_flag & V_LOCALITY) == 0) { 3874 /* 3875 * Allocate and initialize shared memory locality info 3876 */ 3877 mutex_exit(&vp->v_lock); 3878 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3879 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3880 shm_locality->loc_count = 1; 3881 shm_locality->loc_tree = NULL; 3882 3883 /* 3884 * Point vnode locality field at shared vnode policy info 3885 * and set locality aware flag in vnode 3886 */ 3887 mutex_enter(&vp->v_lock); 3888 if ((vp->v_flag & V_LOCALITY) == 0) { 3889 vp->v_locality = shm_locality; 3890 vp->v_flag |= V_LOCALITY; 3891 } else { 3892 /* 3893 * Lost race so free locality info and increment count. 
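 *
 * That is, another thread installed v_locality and set V_LOCALITY
 * between our unlocked allocation and the reacquisition of v_lock
 * above, so the copy allocated here is freed and a reference is taken
 * on theirs instead.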
3894 */ 3895 rw_destroy(&shm_locality->loc_lock); 3896 kmem_free(shm_locality, sizeof (*shm_locality)); 3897 shm_locality = vp->v_locality; 3898 shm_locality->loc_count++; 3899 } 3900 mutex_exit(&vp->v_lock); 3901 3902 return; 3903 } 3904 3905 /* 3906 * Increment reference count of number of segments mapping this vnode 3907 * shared 3908 */ 3909 shm_locality = vp->v_locality; 3910 shm_locality->loc_count++; 3911 mutex_exit(&vp->v_lock); 3912 } 3913 3914 /* 3915 * Destroy the given shared memory policy segment tree 3916 */ 3917 void 3918 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3919 { 3920 lgrp_shm_policy_seg_t *cur; 3921 lgrp_shm_policy_seg_t *next; 3922 3923 if (tree == NULL) 3924 return; 3925 3926 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3927 while (cur != NULL) { 3928 next = AVL_NEXT(tree, cur); 3929 avl_remove(tree, cur); 3930 kmem_free(cur, sizeof (*cur)); 3931 cur = next; 3932 } 3933 kmem_free(tree, sizeof (avl_tree_t)); 3934 } 3935 3936 /* 3937 * Uninitialize lgroup shared memory allocation policy support 3938 */ 3939 void 3940 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3941 { 3942 lgrp_shm_locality_t *shm_locality; 3943 3944 /* 3945 * For anon_map, deallocate shared memory policy tree and 3946 * zero locality field 3947 * Don't need any locks because anon_map is being freed 3948 */ 3949 if (amp) { 3950 if (amp->locality == NULL) 3951 return; 3952 shm_locality = amp->locality; 3953 shm_locality->loc_count = 0; /* not really used for amp */ 3954 rw_destroy(&shm_locality->loc_lock); 3955 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3956 kmem_free(shm_locality, sizeof (*shm_locality)); 3957 amp->locality = 0; 3958 return; 3959 } 3960 3961 /* 3962 * For vnode, decrement reference count of segments mapping this vnode 3963 * shared and delete locality info if reference count drops to 0 3964 */ 3965 mutex_enter(&vp->v_lock); 3966 shm_locality = vp->v_locality; 3967 shm_locality->loc_count--; 3968 3969 if (shm_locality->loc_count == 0) { 3970 rw_destroy(&shm_locality->loc_lock); 3971 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3972 kmem_free(shm_locality, sizeof (*shm_locality)); 3973 vp->v_locality = 0; 3974 vp->v_flag &= ~V_LOCALITY; 3975 } 3976 mutex_exit(&vp->v_lock); 3977 } 3978 3979 /* 3980 * Compare two shared memory policy segments 3981 * Used by AVL tree code for searching 3982 */ 3983 int 3984 lgrp_shm_policy_compar(const void *x, const void *y) 3985 { 3986 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 3987 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 3988 3989 if (a->shm_off < b->shm_off) 3990 return (-1); 3991 if (a->shm_off >= b->shm_off + b->shm_size) 3992 return (1); 3993 return (0); 3994 } 3995 3996 /* 3997 * Concatenate seg1 with seg2 and remove seg2 3998 */ 3999 static int 4000 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 4001 lgrp_shm_policy_seg_t *seg2) 4002 { 4003 if (!seg1 || !seg2 || 4004 seg1->shm_off + seg1->shm_size != seg2->shm_off || 4005 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 4006 return (-1); 4007 4008 seg1->shm_size += seg2->shm_size; 4009 avl_remove(tree, seg2); 4010 kmem_free(seg2, sizeof (*seg2)); 4011 return (0); 4012 } 4013 4014 /* 4015 * Split segment at given offset and return rightmost (uppermost) segment 4016 * Assumes that there are no overlapping segments 4017 */ 4018 static lgrp_shm_policy_seg_t * 4019 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4020 u_offset_t off) 4021 { 4022 lgrp_shm_policy_seg_t *newseg; 4023 
avl_index_t where; 4024 4025 ASSERT(seg != NULL); 4026 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4027 4028 if (!seg || off < seg->shm_off || off > seg->shm_off + 4029 seg->shm_size) 4030 return (NULL); 4031 4032 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4033 return (seg); 4034 4035 /* 4036 * Adjust size of left segment and allocate new (right) segment 4037 */ 4038 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4039 newseg->shm_policy = seg->shm_policy; 4040 newseg->shm_off = off; 4041 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4042 seg->shm_size = off - seg->shm_off; 4043 4044 /* 4045 * Find where to insert new segment in AVL tree and insert it 4046 */ 4047 (void) avl_find(tree, &off, &where); 4048 avl_insert(tree, newseg, where); 4049 4050 return (newseg); 4051 } 4052 4053 /* 4054 * Set shared memory allocation policy on specified shared object at given 4055 * offset and length 4056 * 4057 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4058 * -1 if can't set policy. 4059 */ 4060 int 4061 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, 4062 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) 4063 { 4064 u_offset_t eoff; 4065 lgrp_shm_policy_seg_t *next; 4066 lgrp_shm_policy_seg_t *newseg; 4067 u_offset_t off; 4068 u_offset_t oldeoff; 4069 lgrp_shm_policy_seg_t *prev; 4070 int retval; 4071 lgrp_shm_policy_seg_t *seg; 4072 lgrp_shm_locality_t *shm_locality; 4073 avl_tree_t *tree; 4074 avl_index_t where; 4075 4076 ASSERT(amp || vp); 4077 ASSERT((len & PAGEOFFSET) == 0); 4078 4079 if (len == 0) 4080 return (-1); 4081 4082 retval = 0; 4083 4084 /* 4085 * Get locality info and starting offset into shared object 4086 * Try anon map first and then vnode 4087 * Assume that no locks need to be held on anon_map or vnode, since 4088 * it should be protected by its reference count which must be nonzero 4089 * for an existing segment. 4090 */ 4091 if (amp) { 4092 /* 4093 * Get policy info from anon_map 4094 * 4095 */ 4096 ASSERT(amp->refcnt != 0); 4097 if (amp->locality == NULL) 4098 lgrp_shm_policy_init(amp, NULL); 4099 shm_locality = amp->locality; 4100 off = ptob(anon_index); 4101 } else if (vp) { 4102 /* 4103 * Get policy info from vnode 4104 */ 4105 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) 4106 lgrp_shm_policy_init(NULL, vp); 4107 shm_locality = vp->v_locality; 4108 ASSERT(shm_locality->loc_count != 0); 4109 off = vn_off; 4110 } else 4111 return (-1); 4112 4113 ASSERT((off & PAGEOFFSET) == 0); 4114 4115 /* 4116 * Figure out default policy 4117 */ 4118 if (policy == LGRP_MEM_POLICY_DEFAULT) 4119 policy = lgrp_mem_policy_default(len, MAP_SHARED); 4120 4121 /* 4122 * Create AVL tree if there isn't one yet 4123 * and set locality field to point at it 4124 */ 4125 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4126 tree = shm_locality->loc_tree; 4127 if (!tree) { 4128 rw_exit(&shm_locality->loc_lock); 4129 4130 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 4131 4132 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4133 if (shm_locality->loc_tree == NULL) { 4134 avl_create(tree, lgrp_shm_policy_compar, 4135 sizeof (lgrp_shm_policy_seg_t), 4136 offsetof(lgrp_shm_policy_seg_t, shm_tree)); 4137 shm_locality->loc_tree = tree; 4138 } else { 4139 /* 4140 * Another thread managed to set up the tree 4141 * before we could. Free the tree we allocated 4142 * and use the one that's already there. 
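 * (This is the same allocate-then-recheck pattern used by
 * lgrp_shm_policy_init() above: the writer lock is dropped around the
 * KM_SLEEP allocation, so loc_tree must be re-tested once it is retaken.)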
4143 */ 4144 kmem_free(tree, sizeof (*tree)); 4145 tree = shm_locality->loc_tree; 4146 } 4147 } 4148 4149 /* 4150 * Set policy 4151 * 4152 * Need to maintain hold on writer's lock to keep tree from 4153 * changing out from under us 4154 */ 4155 while (len != 0) { 4156 /* 4157 * Find policy segment for specified offset into shared object 4158 */ 4159 seg = avl_find(tree, &off, &where); 4160 4161 /* 4162 * Didn't find any existing segment that contains specified 4163 * offset, so allocate new segment, insert it, and concatenate 4164 * with adjacent segments if possible 4165 */ 4166 if (seg == NULL) { 4167 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), 4168 KM_SLEEP); 4169 newseg->shm_policy.mem_policy = policy; 4170 newseg->shm_policy.mem_reserved = 0; 4171 newseg->shm_off = off; 4172 avl_insert(tree, newseg, where); 4173 4174 /* 4175 * Check to see whether new segment overlaps with next 4176 * one, set length of new segment accordingly, and 4177 * calculate remaining length and next offset 4178 */ 4179 seg = AVL_NEXT(tree, newseg); 4180 if (seg == NULL || off + len <= seg->shm_off) { 4181 newseg->shm_size = len; 4182 len = 0; 4183 } else { 4184 newseg->shm_size = seg->shm_off - off; 4185 off = seg->shm_off; 4186 len -= newseg->shm_size; 4187 } 4188 4189 /* 4190 * Try to concatenate new segment with next and 4191 * previous ones, since they might have the same policy 4192 * now. Grab previous and next segments first because 4193 * they will change on concatenation. 4194 */ 4195 prev = AVL_PREV(tree, newseg); 4196 next = AVL_NEXT(tree, newseg); 4197 (void) lgrp_shm_policy_concat(tree, newseg, next); 4198 (void) lgrp_shm_policy_concat(tree, prev, newseg); 4199 4200 continue; 4201 } 4202 4203 eoff = off + len; 4204 oldeoff = seg->shm_off + seg->shm_size; 4205 4206 /* 4207 * Policy set already? 4208 */ 4209 if (policy == seg->shm_policy.mem_policy) { 4210 /* 4211 * Nothing left to do if offset and length 4212 * fall within this segment 4213 */ 4214 if (eoff <= oldeoff) { 4215 retval = 1; 4216 break; 4217 } else { 4218 len = eoff - oldeoff; 4219 off = oldeoff; 4220 continue; 4221 } 4222 } 4223 4224 /* 4225 * Specified offset and length match existing segment exactly 4226 */ 4227 if (off == seg->shm_off && len == seg->shm_size) { 4228 /* 4229 * Set policy and update current length 4230 */ 4231 seg->shm_policy.mem_policy = policy; 4232 seg->shm_policy.mem_reserved = 0; 4233 len = 0; 4234 4235 /* 4236 * Try concatenating new segment with previous and next 4237 * segments, since they might have the same policy now. 4238 * Grab previous and next segments first because they 4239 * will change on concatenation. 
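 * (Hypothetical illustration: if this segment now covers [0, 8K) with the
 * same policy as an adjacent [8K, 16K) segment, lgrp_shm_policy_concat()
 * collapses the pair into a single [0, 16K) segment and frees the second.)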
4240 */ 4241 prev = AVL_PREV(tree, seg); 4242 next = AVL_NEXT(tree, seg); 4243 (void) lgrp_shm_policy_concat(tree, seg, next); 4244 (void) lgrp_shm_policy_concat(tree, prev, seg); 4245 } else { 4246 /* 4247 * Specified offset and length only apply to part of 4248 * existing segment 4249 */ 4250 4251 /* 4252 * New segment starts in middle of old one, so split 4253 * new one off near beginning of old one 4254 */ 4255 newseg = NULL; 4256 if (off > seg->shm_off) { 4257 newseg = lgrp_shm_policy_split(tree, seg, off); 4258 4259 /* 4260 * New segment ends where old one did, so try 4261 * to concatenate with next segment 4262 */ 4263 if (eoff == oldeoff) { 4264 newseg->shm_policy.mem_policy = policy; 4265 newseg->shm_policy.mem_reserved = 0; 4266 (void) lgrp_shm_policy_concat(tree, 4267 newseg, AVL_NEXT(tree, newseg)); 4268 break; 4269 } 4270 } 4271 4272 /* 4273 * New segment ends before old one, so split off end of 4274 * old one 4275 */ 4276 if (eoff < oldeoff) { 4277 if (newseg) { 4278 (void) lgrp_shm_policy_split(tree, 4279 newseg, eoff); 4280 newseg->shm_policy.mem_policy = policy; 4281 newseg->shm_policy.mem_reserved = 0; 4282 } else { 4283 (void) lgrp_shm_policy_split(tree, seg, 4284 eoff); 4285 seg->shm_policy.mem_policy = policy; 4286 seg->shm_policy.mem_reserved = 0; 4287 } 4288 4289 if (off == seg->shm_off) 4290 (void) lgrp_shm_policy_concat(tree, 4291 AVL_PREV(tree, seg), seg); 4292 break; 4293 } 4294 4295 /* 4296 * Calculate remaining length and next offset 4297 */ 4298 len = eoff - oldeoff; 4299 off = oldeoff; 4300 } 4301 } 4302 4303 rw_exit(&shm_locality->loc_lock); 4304 return (retval); 4305 } 4306 4307 /* 4308 * Return the best memnode from which to allocate memory given 4309 * an lgroup. 4310 * 4311 * "c" is for cookie, which is good enough for me. 4312 * It references a cookie struct that should be zero'ed to initialize. 4313 * The cookie should live on the caller's stack. 4314 * 4315 * The routine returns -1 when: 4316 * - the scope is LGRP_SRCH_LOCAL, and all the memnodes in "lgrp" have been returned. 4317 * - the scope is LGRP_SRCH_HIER, and all the memnodes in the system have 4318 * been returned. 4319 */ 4320 int 4321 lgrp_memnode_choose(lgrp_mnode_cookie_t *c) 4322 { 4323 lgrp_t *lp = c->lmc_lgrp; 4324 mnodeset_t nodes = c->lmc_nodes; 4325 int cnt = c->lmc_cnt; 4326 int offset, mnode; 4327 4328 extern int max_mem_nodes; 4329 4330 /* 4331 * If the set is empty, and the caller is willing, traverse 4332 * up the hierarchy until we find a non-empty set. 4333 */ 4334 while (nodes == (mnodeset_t)0 || cnt <= 0) { 4335 if (c->lmc_scope == LGRP_SRCH_LOCAL || 4336 ((lp = lp->lgrp_parent) == NULL)) 4337 return (-1); 4338 4339 nodes = lp->lgrp_mnodes & ~(c->lmc_tried); 4340 cnt = lp->lgrp_nmnodes - c->lmc_ntried; 4341 } 4342 4343 /* 4344 * Select a memnode by picking one at a "random" offset. 4345 * Because of DR, memnodes can come and go at any time. 4346 * This code must be able to cope with the possibility 4347 * that the nodes count "cnt" is inconsistent with respect 4348 * to the number of elements actually in "nodes", and 4349 * therefore that the offset chosen could be greater than 4350 * the number of elements in the set (some memnodes may 4351 * have disappeared just before cnt was read). 4352 * If this happens, the search simply wraps back to the 4353 * beginning of the set.
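 * For example, if "cnt" was read as 3 but only two bits remain set in
 * "nodes", an offset of 2 exhausts the first scan without a hit; the
 * do/while loop then rescans from mnode 0 and takes the first set bit.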
4354 */ 4355 ASSERT(nodes != (mnodeset_t)0 && cnt > 0); 4356 offset = c->lmc_rand % cnt; 4357 do { 4358 for (mnode = 0; mnode < max_mem_nodes; mnode++) 4359 if (nodes & ((mnodeset_t)1 << mnode)) 4360 if (!offset--) 4361 break; 4362 } while (mnode >= max_mem_nodes); 4363 4364 /* Found a node. Store state before returning. */ 4365 c->lmc_lgrp = lp; 4366 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); 4367 c->lmc_cnt = cnt - 1; 4368 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); 4369 c->lmc_ntried++; 4370 4371 return (mnode); 4372 } 4373
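
/*
 * Illustrative sketch of how a caller might drive lgrp_memnode_choose(),
 * assuming the LGRP_MNODE_COOKIE_INIT() initializer and the LGRP_SRCH_HIER
 * search scope from <sys/lgrp.h>; if those names differ, the shape of the
 * loop is what matters:
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(attempt the page allocation from memnode "mnode" here)
 *	}
 *
 * Each call hands back one untried memnode, exhausting the starting
 * lgroup's memnodes before walking up the hierarchy, so the loop sees
 * memory in roughly nearest-first order.
 */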