1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Basic NUMA support in terms of locality groups 28 * 29 * Solaris needs to know which CPUs, memory, etc. are near each other to 30 * provide good performance on NUMA machines by optimizing for locality. 31 * In order to do this, a new abstraction called a "locality group (lgroup)" 32 * has been introduced to keep track of which CPU-like and memory-like hardware 33 * resources are close to each other. Currently, latency is the only measure 34 * used to determine how to group hardware resources into lgroups, but this 35 * does not limit the groupings to be based solely on latency. Other factors 36 * may be used to determine the groupings in the future. 37 * 38 * Lgroups are organized into a hieararchy or topology that represents the 39 * latency topology of the machine. There is always at least a root lgroup in 40 * the system. 
It represents all the hardware resources in the machine at a 41 * latency big enough that any hardware resource can at least access any other 42 * hardware resource within that latency. A Uniform Memory Access (UMA) 43 * machine is represented with one lgroup (the root). In contrast, a NUMA 44 * machine is represented at least by the root lgroup and some number of leaf 45 * lgroups where the leaf lgroups contain the hardware resources within the 46 * least latency of each other and the root lgroup still contains all the 47 * resources in the machine. Some number of intermediate lgroups may exist 48 * which represent more levels of locality than just the local latency of the 49 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups 50 * (eg. root and intermediate lgroups) contain the next nearest resources to 51 * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup 52 * to the root lgroup shows the hardware resources from closest to farthest 53 * from the leaf lgroup such that each successive ancestor lgroup contains 54 * the next nearest resources at the next level of locality from the previous. 55 * 56 * The kernel uses the lgroup abstraction to know how to allocate resources 57 * near a given process/thread. At fork() and lwp/thread_create() time, a 58 * "home" lgroup is chosen for a thread. This is done by picking the lgroup 59 * with the lowest load average. Binding to a processor or processor set will 60 * change the home lgroup for a thread. The scheduler has been modified to try 61 * to dispatch a thread on a CPU in its home lgroup. Physical memory 62 * allocation is lgroup aware too, so memory will be allocated from the current 63 * thread's home lgroup if possible. If the desired resources are not 64 * available, the kernel traverses the lgroup hierarchy going to the parent 65 * lgroup to find resources at the next level of locality until it reaches the 66 * root lgroup. 
*/

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
/*
 * lgrp_alloc_hint stays -1 until lgrp_destroy() frees a slot; after that
 * lgrp_create() starts its search for a free table slot at the hint.
 */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

/*
 * One shared buffer of named kstats; every per-lgroup kstat created by
 * lgrp_kstat_create() points its ks_data here and uses lgrp_kstat_mutex as
 * its ks_lock.
 */
static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 * (set from lgrp_plat_max_lgrps() during LGRP_INIT_STAGE1)
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;
static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

/* Static storage for the root lgroup; lgrp_root points here once set up */
static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
204 */ 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 206 207 208 /* 209 * lgroup CPU event handlers 210 */ 211 static void lgrp_cpu_init(struct cpu *); 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 214 215 /* 216 * lgroup memory event handlers 217 */ 218 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 219 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 220 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 221 222 /* 223 * lgroup CPU partition event handlers 224 */ 225 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 226 static void lgrp_part_del_cpu(struct cpu *); 227 228 /* 229 * lgroup framework initialization 230 */ 231 static void lgrp_main_init(void); 232 static void lgrp_main_mp_init(void); 233 static void lgrp_root_init(void); 234 static void lgrp_setup(void); 235 236 /* 237 * lpl topology 238 */ 239 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 240 static void lpl_clear(lpl_t *); 241 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 242 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 243 static void lpl_rset_add(lpl_t *, lpl_t *); 244 static void lpl_rset_del(lpl_t *, lpl_t *); 245 static int lpl_rset_contains(lpl_t *, lpl_t *); 246 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 247 static void lpl_child_update(lpl_t *, struct cpupart *); 248 static int lpl_pick(lpl_t *, lpl_t *); 249 static void lpl_verify_wrapper(struct cpupart *); 250 251 /* 252 * defines for lpl topology verifier return codes 253 */ 254 255 #define LPL_TOPO_CORRECT 0 256 #define LPL_TOPO_PART_HAS_NO_LPL -1 257 #define LPL_TOPO_CPUS_NOT_EMPTY -2 258 #define LPL_TOPO_LGRP_MISMATCH -3 259 #define LPL_TOPO_MISSING_PARENT -4 260 #define LPL_TOPO_PARENT_MISMATCH -5 261 #define LPL_TOPO_BAD_CPUCNT -6 262 #define LPL_TOPO_RSET_MISMATCH -7 263 #define LPL_TOPO_LPL_ORPHANED -8 264 #define LPL_TOPO_LPL_BAD_NCPU -9 265 #define LPL_TOPO_RSET_MSSNG_LF 
-10 266 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 267 #define LPL_TOPO_NONLEAF_HAS_CPUS -12 268 #define LPL_TOPO_LGRP_NOT_LEAF -13 269 #define LPL_TOPO_BAD_RSETCNT -14 270 271 /* 272 * Return whether lgroup optimizations should be enabled on this system 273 */ 274 int 275 lgrp_optimizations(void) 276 { 277 /* 278 * System must have more than 2 lgroups to enable lgroup optimizations 279 * 280 * XXX This assumes that a 2 lgroup system has an empty root lgroup 281 * with one child lgroup containing all the resources. A 2 lgroup 282 * system with a root lgroup directly containing CPUs or memory might 283 * need lgroup optimizations with its child lgroup, but there 284 * isn't such a machine for now.... 285 */ 286 if (nlgrps > 2) 287 return (1); 288 289 return (0); 290 } 291 292 /* 293 * Setup root lgroup 294 */ 295 static void 296 lgrp_root_init(void) 297 { 298 lgrp_handle_t hand; 299 int i; 300 lgrp_id_t id; 301 302 /* 303 * Create the "root" lgroup 304 */ 305 ASSERT(nlgrps == 0); 306 id = nlgrps++; 307 308 lgrp_root = &lroot; 309 310 lgrp_root->lgrp_cpu = NULL; 311 lgrp_root->lgrp_mnodes = 0; 312 lgrp_root->lgrp_nmnodes = 0; 313 hand = lgrp_plat_root_hand(); 314 lgrp_root->lgrp_plathand = hand; 315 316 lgrp_root->lgrp_id = id; 317 lgrp_root->lgrp_cpucnt = 0; 318 lgrp_root->lgrp_childcnt = 0; 319 klgrpset_clear(lgrp_root->lgrp_children); 320 klgrpset_clear(lgrp_root->lgrp_leaves); 321 lgrp_root->lgrp_parent = NULL; 322 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 323 324 for (i = 0; i < LGRP_RSRC_COUNT; i++) 325 klgrpset_clear(lgrp_root->lgrp_set[i]); 326 327 lgrp_root->lgrp_kstat = NULL; 328 329 lgrp_table[id] = lgrp_root; 330 331 /* 332 * Setup initial lpl list for CPU0 and initial t0 home. 333 * The only lpl space we have so far is lpl_bootstrap. It is used for 334 * all topology operations until cp_default is initialized at which 335 * point t0.t_lpl will be updated. 
336 */ 337 lpl_bootstrap = lpl_bootstrap_list; 338 t0.t_lpl = lpl_bootstrap; 339 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 340 lpl_bootstrap_list[1].lpl_lgrpid = 1; 341 342 /* 343 * Set up the bootstrap rset 344 * Since the bootstrap toplogy has just the root, and a leaf, 345 * the rset contains just the leaf, and both lpls can use the same rset 346 */ 347 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1]; 348 lpl_bootstrap_list[0].lpl_rset_sz = 1; 349 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; 350 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; 351 352 lpl_bootstrap_list[1].lpl_rset_sz = 1; 353 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset; 354 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset; 355 356 cp_default.cp_lgrploads = lpl_bootstrap; 357 } 358 359 /* 360 * Initialize the lgroup framework and allow the platform to do the same 361 * 362 * This happens in stages during boot and is all funnelled through this routine 363 * (see definition of lgrp_init_stages_t to see what happens at each stage and 364 * when) 365 */ 366 void 367 lgrp_init(lgrp_init_stages_t stage) 368 { 369 /* 370 * Initialize the platform 371 */ 372 lgrp_plat_init(stage); 373 374 switch (stage) { 375 case LGRP_INIT_STAGE1: 376 /* 377 * Set max number of lgroups supported on this platform which 378 * must be less than the max number of lgroups supported by the 379 * common lgroup framework (eg. NLGRPS_MAX is max elements in 380 * lgrp_table[], etc.) 381 */ 382 nlgrpsmax = lgrp_plat_max_lgrps(); 383 ASSERT(nlgrpsmax <= NLGRPS_MAX); 384 break; 385 386 case LGRP_INIT_STAGE2: 387 lgrp_setup(); 388 break; 389 390 case LGRP_INIT_STAGE4: 391 lgrp_main_init(); 392 break; 393 394 case LGRP_INIT_STAGE5: 395 lgrp_main_mp_init(); 396 break; 397 398 default: 399 break; 400 } 401 } 402 403 /* 404 * Create the root and cpu0's lgroup, and set t0's home. 
405 */ 406 static void 407 lgrp_setup(void) 408 { 409 /* 410 * Setup the root lgroup 411 */ 412 lgrp_root_init(); 413 414 /* 415 * Add cpu0 to an lgroup 416 */ 417 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 418 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 419 } 420 421 /* 422 * true when lgrp initialization has been completed. 423 */ 424 int lgrp_initialized = 0; 425 426 /* 427 * True when lgrp topology is constructed. 428 */ 429 int lgrp_topo_initialized = 0; 430 431 /* 432 * Init routine called after startup(), /etc/system has been processed, 433 * and cpu0 has been added to an lgroup. 434 */ 435 static void 436 lgrp_main_init(void) 437 { 438 cpu_t *cp = CPU; 439 lgrp_id_t lgrpid; 440 int i; 441 extern void pg_cpu0_reinit(); 442 443 /* 444 * Enforce a valid lgrp_mem_default_policy 445 */ 446 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 447 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) || 448 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG)) 449 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 450 451 /* 452 * See if mpo should be disabled. 453 * This may happen in the case of null proc LPA on Starcat. 454 * The platform won't be able to detect null proc LPA until after 455 * cpu0 and memory have already been added to lgroups. 456 * When and if it is detected, the Starcat platform will return 457 * a different platform handle for cpu0 which is what we check for 458 * here. If mpo should be disabled move cpu0 to it's rightful place 459 * (the root), and destroy the remaining lgroups. This effectively 460 * provides an UMA lgroup topology. 
461 */ 462 lgrpid = cp->cpu_lpl->lpl_lgrpid; 463 if (lgrp_table[lgrpid]->lgrp_plathand != 464 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 465 lgrp_part_del_cpu(cp); 466 lgrp_cpu_fini(cp, lgrpid); 467 468 lgrp_cpu_init(cp); 469 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 470 471 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 472 473 /* 474 * Notify the PG subsystem that the CPU's lgrp 475 * association has changed 476 */ 477 pg_cpu0_reinit(); 478 479 /* 480 * Destroy all lgroups except for root 481 */ 482 for (i = 0; i <= lgrp_alloc_max; i++) { 483 if (LGRP_EXISTS(lgrp_table[i]) && 484 lgrp_table[i] != lgrp_root) 485 lgrp_destroy(lgrp_table[i]); 486 } 487 488 /* 489 * Fix up root to point at itself for leaves and resources 490 * and not have any children 491 */ 492 lgrp_root->lgrp_childcnt = 0; 493 klgrpset_clear(lgrp_root->lgrp_children); 494 klgrpset_clear(lgrp_root->lgrp_leaves); 495 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 496 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 497 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 498 } 499 500 /* 501 * Initialize kstats framework. 502 */ 503 lgrp_kstat_init(); 504 /* 505 * cpu0 is finally where it should be, so create it's lgroup's kstats 506 */ 507 mutex_enter(&cpu_lock); 508 lgrp_kstat_create(cp); 509 mutex_exit(&cpu_lock); 510 511 lgrp_initialized = 1; 512 } 513 514 /* 515 * Finish lgrp initialization after all CPUS are brought on-line. 516 * This routine is called after start_other_cpus(). 
517 */ 518 static void 519 lgrp_main_mp_init(void) 520 { 521 klgrpset_t changed; 522 523 /* 524 * Update lgroup topology (if necessary) 525 */ 526 klgrpset_clear(changed); 527 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 528 lgrp_topo_initialized = 1; 529 } 530 531 /* 532 * Change latency of lgroup with specified lgroup platform handle (if one is 533 * given) or change all lgroups with old latency to new latency 534 */ 535 void 536 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, 537 u_longlong_t newtime) 538 { 539 lgrp_t *lgrp; 540 int i; 541 542 for (i = 0; i <= lgrp_alloc_max; i++) { 543 lgrp = lgrp_table[i]; 544 545 if (!LGRP_EXISTS(lgrp)) 546 continue; 547 548 if ((hand == LGRP_NULL_HANDLE && 549 lgrp->lgrp_latency == oldtime) || 550 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) 551 lgrp->lgrp_latency = (int)newtime; 552 } 553 } 554 555 /* 556 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 557 */ 558 void 559 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 560 { 561 klgrpset_t changed; 562 cpu_t *cp; 563 lgrp_id_t id; 564 int rc; 565 566 switch (event) { 567 /* 568 * The following (re)configuration events are common code 569 * initiated. lgrp_plat_config() is called here to inform the 570 * platform of the reconfiguration event. 571 */ 572 case LGRP_CONFIG_CPU_ADD: 573 cp = (cpu_t *)resource; 574 575 /* 576 * Initialize the new CPU's lgrp related next/prev 577 * links, and give it a bootstrap lpl so that it can 578 * survive should it need to enter the dispatcher. 
579 */ 580 cp->cpu_next_lpl = cp; 581 cp->cpu_prev_lpl = cp; 582 cp->cpu_next_lgrp = cp; 583 cp->cpu_prev_lgrp = cp; 584 cp->cpu_lpl = lpl_bootstrap; 585 586 lgrp_plat_config(event, resource); 587 atomic_inc_32(&lgrp_gen); 588 589 break; 590 case LGRP_CONFIG_CPU_DEL: 591 lgrp_plat_config(event, resource); 592 atomic_inc_32(&lgrp_gen); 593 594 break; 595 case LGRP_CONFIG_CPU_ONLINE: 596 cp = (cpu_t *)resource; 597 lgrp_cpu_init(cp); 598 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 599 rc = lpl_topo_verify(cp->cpu_part); 600 if (rc != LPL_TOPO_CORRECT) { 601 panic("lpl_topo_verify failed: %d", rc); 602 } 603 lgrp_plat_config(event, resource); 604 atomic_inc_32(&lgrp_gen); 605 606 break; 607 case LGRP_CONFIG_CPU_OFFLINE: 608 cp = (cpu_t *)resource; 609 id = cp->cpu_lpl->lpl_lgrpid; 610 lgrp_part_del_cpu(cp); 611 lgrp_cpu_fini(cp, id); 612 rc = lpl_topo_verify(cp->cpu_part); 613 if (rc != LPL_TOPO_CORRECT) { 614 panic("lpl_topo_verify failed: %d", rc); 615 } 616 lgrp_plat_config(event, resource); 617 atomic_inc_32(&lgrp_gen); 618 619 break; 620 case LGRP_CONFIG_CPUPART_ADD: 621 cp = (cpu_t *)resource; 622 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 623 rc = lpl_topo_verify(cp->cpu_part); 624 if (rc != LPL_TOPO_CORRECT) { 625 panic("lpl_topo_verify failed: %d", rc); 626 } 627 lgrp_plat_config(event, resource); 628 629 break; 630 case LGRP_CONFIG_CPUPART_DEL: 631 cp = (cpu_t *)resource; 632 lgrp_part_del_cpu((cpu_t *)resource); 633 rc = lpl_topo_verify(cp->cpu_part); 634 if (rc != LPL_TOPO_CORRECT) { 635 panic("lpl_topo_verify failed: %d", rc); 636 } 637 lgrp_plat_config(event, resource); 638 639 break; 640 /* 641 * The following events are initiated by the memnode 642 * subsystem. 
643 */ 644 case LGRP_CONFIG_MEM_ADD: 645 lgrp_mem_init((int)resource, where, B_FALSE); 646 atomic_inc_32(&lgrp_gen); 647 648 break; 649 case LGRP_CONFIG_MEM_DEL: 650 lgrp_mem_fini((int)resource, where, B_FALSE); 651 atomic_inc_32(&lgrp_gen); 652 653 break; 654 case LGRP_CONFIG_MEM_RENAME: { 655 lgrp_config_mem_rename_t *ren_arg = 656 (lgrp_config_mem_rename_t *)where; 657 658 lgrp_mem_rename((int)resource, 659 ren_arg->lmem_rename_from, 660 ren_arg->lmem_rename_to); 661 atomic_inc_32(&lgrp_gen); 662 663 break; 664 } 665 case LGRP_CONFIG_GEN_UPDATE: 666 atomic_inc_32(&lgrp_gen); 667 668 break; 669 case LGRP_CONFIG_FLATTEN: 670 if (where == 0) 671 lgrp_topo_levels = (int)resource; 672 else 673 (void) lgrp_topo_flatten(resource, 674 lgrp_table, lgrp_alloc_max, &changed); 675 676 break; 677 /* 678 * Update any lgroups with old latency to new latency 679 */ 680 case LGRP_CONFIG_LAT_CHANGE_ALL: 681 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 682 (u_longlong_t)where); 683 684 break; 685 /* 686 * Update lgroup with specified lgroup platform handle to have 687 * new latency 688 */ 689 case LGRP_CONFIG_LAT_CHANGE: 690 lgrp_latency_change((lgrp_handle_t)resource, 0, 691 (u_longlong_t)where); 692 693 break; 694 case LGRP_CONFIG_NOP: 695 696 break; 697 default: 698 break; 699 } 700 701 } 702 703 /* 704 * Called to add lgrp info into cpu structure from cpu_add_unit; 705 * do not assume cpu is in cpu[] yet! 706 * 707 * CPUs are brought online with all other CPUs paused so we can't 708 * allocate memory or we could deadlock the system, so we rely on 709 * the platform to statically allocate as much space as we need 710 * for the lgrp structs and stats. 711 */ 712 static void 713 lgrp_cpu_init(struct cpu *cp) 714 { 715 klgrpset_t changed; 716 int count; 717 lgrp_handle_t hand; 718 int first_cpu; 719 lgrp_t *my_lgrp; 720 lgrp_id_t lgrpid; 721 struct cpu *cptr; 722 723 /* 724 * This is the first time through if the resource set 725 * for the root lgroup is empty. 
After cpu0 has been 726 * initially added to an lgroup, the root's CPU resource 727 * set can never be empty, since the system's last CPU 728 * cannot be offlined. 729 */ 730 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 731 /* 732 * First time through. 733 */ 734 first_cpu = 1; 735 } else { 736 /* 737 * If cpu0 needs to move lgroups, we may come 738 * through here again, at which time cpu_lock won't 739 * be held, and lgrp_initialized will be false. 740 */ 741 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 742 ASSERT(cp->cpu_part != NULL); 743 first_cpu = 0; 744 } 745 746 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 747 my_lgrp = lgrp_hand_to_lgrp(hand); 748 749 if (my_lgrp == NULL) { 750 /* 751 * Create new lgrp and add it to lgroup topology 752 */ 753 my_lgrp = lgrp_create(); 754 my_lgrp->lgrp_plathand = hand; 755 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 756 lgrpid = my_lgrp->lgrp_id; 757 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 758 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 759 760 count = 0; 761 klgrpset_clear(changed); 762 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 763 &changed); 764 /* 765 * May have added new intermediate lgroups, so need to add 766 * resources other than CPUs which are added below 767 */ 768 (void) lgrp_mnode_update(changed, NULL); 769 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 770 > 0) { 771 /* 772 * Leaf lgroup was created, but latency wasn't available 773 * then. So, set latency for it and fill in rest of lgroup 774 * topology now that we know how far it is from other leaf 775 * lgroups. 
776 */ 777 lgrpid = my_lgrp->lgrp_id; 778 klgrpset_clear(changed); 779 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 780 lgrpid)) 781 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 782 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 783 &changed); 784 785 /* 786 * May have added new intermediate lgroups, so need to add 787 * resources other than CPUs which are added below 788 */ 789 (void) lgrp_mnode_update(changed, NULL); 790 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 791 my_lgrp->lgrp_id)) { 792 int i; 793 794 /* 795 * Update existing lgroup and lgroups containing it with CPU 796 * resource 797 */ 798 lgrpid = my_lgrp->lgrp_id; 799 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 800 for (i = 0; i <= lgrp_alloc_max; i++) { 801 lgrp_t *lgrp; 802 803 lgrp = lgrp_table[i]; 804 if (!LGRP_EXISTS(lgrp) || 805 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 806 continue; 807 808 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 809 } 810 } 811 812 lgrpid = my_lgrp->lgrp_id; 813 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 814 815 /* 816 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 817 * end up in lpl for lgroup 0 whether it is supposed to be in there or 818 * not since none of lgroup IDs in the lpl's have been set yet. 
819 */ 820 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 821 cp->cpu_lpl->lpl_lgrpid = lgrpid; 822 823 /* 824 * link the CPU into the lgrp's CPU list 825 */ 826 if (my_lgrp->lgrp_cpucnt == 0) { 827 my_lgrp->lgrp_cpu = cp; 828 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 829 } else { 830 cptr = my_lgrp->lgrp_cpu; 831 cp->cpu_next_lgrp = cptr; 832 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 833 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 834 cptr->cpu_prev_lgrp = cp; 835 } 836 my_lgrp->lgrp_cpucnt++; 837 } 838 839 lgrp_t * 840 lgrp_create(void) 841 { 842 lgrp_t *my_lgrp; 843 lgrp_id_t lgrpid; 844 int i; 845 846 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 847 848 /* 849 * Find an open slot in the lgroup table and recycle unused lgroup 850 * left there if any 851 */ 852 my_lgrp = NULL; 853 if (lgrp_alloc_hint == -1) 854 /* 855 * Allocate from end when hint not set yet because no lgroups 856 * have been deleted yet 857 */ 858 lgrpid = nlgrps++; 859 else { 860 /* 861 * Start looking for next open slot from hint and leave hint 862 * at slot allocated 863 */ 864 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 865 my_lgrp = lgrp_table[i]; 866 if (!LGRP_EXISTS(my_lgrp)) { 867 lgrpid = i; 868 nlgrps++; 869 break; 870 } 871 } 872 lgrp_alloc_hint = lgrpid; 873 } 874 875 /* 876 * Keep track of max lgroup ID allocated so far to cut down on searches 877 */ 878 if (lgrpid > lgrp_alloc_max) 879 lgrp_alloc_max = lgrpid; 880 881 /* 882 * Need to allocate new lgroup if next open slot didn't have one 883 * for recycling 884 */ 885 if (my_lgrp == NULL) 886 my_lgrp = lgrp_plat_alloc(lgrpid); 887 888 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 889 panic("Too many lgrps for platform (%d)", nlgrps); 890 891 my_lgrp->lgrp_id = lgrpid; 892 my_lgrp->lgrp_latency = 0; 893 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 894 my_lgrp->lgrp_parent = NULL; 895 my_lgrp->lgrp_childcnt = 0; 896 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 897 my_lgrp->lgrp_nmnodes = 0; 898 
klgrpset_clear(my_lgrp->lgrp_children); 899 klgrpset_clear(my_lgrp->lgrp_leaves); 900 for (i = 0; i < LGRP_RSRC_COUNT; i++) 901 klgrpset_clear(my_lgrp->lgrp_set[i]); 902 903 my_lgrp->lgrp_cpu = NULL; 904 my_lgrp->lgrp_cpucnt = 0; 905 906 if (my_lgrp->lgrp_kstat != NULL) 907 lgrp_kstat_reset(lgrpid); 908 909 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 910 911 return (my_lgrp); 912 } 913 914 void 915 lgrp_destroy(lgrp_t *lgrp) 916 { 917 int i; 918 919 /* 920 * Unless this lgroup is being destroyed on behalf of 921 * the boot CPU, cpu_lock must be held 922 */ 923 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 924 925 if (nlgrps == 1) 926 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 927 928 if (!LGRP_EXISTS(lgrp)) 929 return; 930 931 /* 932 * Set hint to lgroup being deleted and try to keep lower numbered 933 * hints to facilitate finding empty slots 934 */ 935 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 936 lgrp_alloc_hint = lgrp->lgrp_id; 937 938 /* 939 * Mark this lgroup to be recycled by setting its lgroup ID to 940 * LGRP_NONE and clear relevant fields 941 */ 942 lgrp->lgrp_id = LGRP_NONE; 943 lgrp->lgrp_latency = 0; 944 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 945 lgrp->lgrp_parent = NULL; 946 lgrp->lgrp_childcnt = 0; 947 948 klgrpset_clear(lgrp->lgrp_children); 949 klgrpset_clear(lgrp->lgrp_leaves); 950 for (i = 0; i < LGRP_RSRC_COUNT; i++) 951 klgrpset_clear(lgrp->lgrp_set[i]); 952 953 lgrp->lgrp_mnodes = (mnodeset_t)0; 954 lgrp->lgrp_nmnodes = 0; 955 956 lgrp->lgrp_cpu = NULL; 957 lgrp->lgrp_cpucnt = 0; 958 959 nlgrps--; 960 } 961 962 /* 963 * Initialize kstat data. Called from lgrp intialization code. 
964 */ 965 static void 966 lgrp_kstat_init(void) 967 { 968 lgrp_stat_t stat; 969 970 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 971 972 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 973 kstat_named_init(&lgrp_kstat_data[stat], 974 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 975 } 976 977 /* 978 * initialize an lgrp's kstats if needed 979 * called with cpu_lock held but not with cpus paused. 980 * we don't tear these down now because we don't know about 981 * memory leaving the lgrp yet... 982 */ 983 984 void 985 lgrp_kstat_create(cpu_t *cp) 986 { 987 kstat_t *lgrp_kstat; 988 lgrp_id_t lgrpid; 989 lgrp_t *my_lgrp; 990 991 ASSERT(MUTEX_HELD(&cpu_lock)); 992 993 lgrpid = cp->cpu_lpl->lpl_lgrpid; 994 my_lgrp = lgrp_table[lgrpid]; 995 996 if (my_lgrp->lgrp_kstat != NULL) 997 return; /* already initialized */ 998 999 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 1000 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 1001 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 1002 1003 if (lgrp_kstat != NULL) { 1004 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 1005 lgrp_kstat->ks_private = my_lgrp; 1006 lgrp_kstat->ks_data = &lgrp_kstat_data; 1007 lgrp_kstat->ks_update = lgrp_kstat_extract; 1008 my_lgrp->lgrp_kstat = lgrp_kstat; 1009 kstat_install(lgrp_kstat); 1010 } 1011 } 1012 1013 /* 1014 * this will do something when we manage to remove now unused lgrps 1015 */ 1016 1017 /* ARGSUSED */ 1018 void 1019 lgrp_kstat_destroy(cpu_t *cp) 1020 { 1021 ASSERT(MUTEX_HELD(&cpu_lock)); 1022 } 1023 1024 /* 1025 * Called when a CPU is off-lined. 1026 */ 1027 static void 1028 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 1029 { 1030 lgrp_t *my_lgrp; 1031 struct cpu *prev; 1032 struct cpu *next; 1033 1034 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 1035 1036 prev = cp->cpu_prev_lgrp; 1037 next = cp->cpu_next_lgrp; 1038 1039 prev->cpu_next_lgrp = next; 1040 next->cpu_prev_lgrp = prev; 1041 1042 /* 1043 * just because I'm paranoid doesn't mean... 
1044 */ 1045 1046 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1047 1048 my_lgrp = lgrp_table[lgrpid]; 1049 my_lgrp->lgrp_cpucnt--; 1050 1051 /* 1052 * Removing last CPU in lgroup, so update lgroup topology 1053 */ 1054 if (my_lgrp->lgrp_cpucnt == 0) { 1055 klgrpset_t changed; 1056 int count; 1057 int i; 1058 1059 my_lgrp->lgrp_cpu = NULL; 1060 1061 /* 1062 * Remove this lgroup from its lgroup CPU resources and remove 1063 * lgroup from lgroup topology if it doesn't have any more 1064 * resources in it now 1065 */ 1066 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1067 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1068 count = 0; 1069 klgrpset_clear(changed); 1070 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1071 lgrp_alloc_max + 1, &changed); 1072 return; 1073 } 1074 1075 /* 1076 * This lgroup isn't empty, so just remove it from CPU 1077 * resources of any lgroups that contain it as such 1078 */ 1079 for (i = 0; i <= lgrp_alloc_max; i++) { 1080 lgrp_t *lgrp; 1081 1082 lgrp = lgrp_table[i]; 1083 if (!LGRP_EXISTS(lgrp) || 1084 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1085 lgrpid)) 1086 continue; 1087 1088 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1089 } 1090 return; 1091 } 1092 1093 if (my_lgrp->lgrp_cpu == cp) 1094 my_lgrp->lgrp_cpu = next; 1095 1096 } 1097 1098 /* 1099 * Update memory nodes in target lgroups and return ones that get changed 1100 */ 1101 int 1102 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1103 { 1104 int count; 1105 int i; 1106 int j; 1107 lgrp_t *lgrp; 1108 lgrp_t *lgrp_rsrc; 1109 1110 count = 0; 1111 if (changed) 1112 klgrpset_clear(*changed); 1113 1114 if (klgrpset_isempty(target)) 1115 return (0); 1116 1117 /* 1118 * Find each lgroup in target lgroups 1119 */ 1120 for (i = 0; i <= lgrp_alloc_max; i++) { 1121 /* 1122 * Skip any lgroups that don't exist or aren't in target group 1123 */ 1124 lgrp = lgrp_table[i]; 1125 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1126 continue; 1127 } 1128 
1129 /* 1130 * Initialize memnodes for intermediate lgroups to 0 1131 * and update them from scratch since they may have completely 1132 * changed 1133 */ 1134 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1135 lgrp->lgrp_mnodes = (mnodeset_t)0; 1136 lgrp->lgrp_nmnodes = 0; 1137 } 1138 1139 /* 1140 * Update memory nodes of of target lgroup with memory nodes 1141 * from each lgroup in its lgroup memory resource set 1142 */ 1143 for (j = 0; j <= lgrp_alloc_max; j++) { 1144 int k; 1145 1146 /* 1147 * Skip any lgroups that don't exist or aren't in 1148 * memory resources of target lgroup 1149 */ 1150 lgrp_rsrc = lgrp_table[j]; 1151 if (!LGRP_EXISTS(lgrp_rsrc) || 1152 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1153 j)) 1154 continue; 1155 1156 /* 1157 * Update target lgroup's memnodes to include memnodes 1158 * of this lgroup 1159 */ 1160 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1161 mnodeset_t mnode_mask; 1162 1163 mnode_mask = (mnodeset_t)1 << k; 1164 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1165 !(lgrp->lgrp_mnodes & mnode_mask)) { 1166 lgrp->lgrp_mnodes |= mnode_mask; 1167 lgrp->lgrp_nmnodes++; 1168 } 1169 } 1170 count++; 1171 if (changed) 1172 klgrpset_add(*changed, lgrp->lgrp_id); 1173 } 1174 } 1175 1176 return (count); 1177 } 1178 1179 /* 1180 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1181 * is moved from one board to another. The "from" and "to" arguments specify the 1182 * source and the destination of the move. 1183 * 1184 * See plat_lgrp_config() for a detailed description of the copy-rename 1185 * semantics. 1186 * 1187 * The lgrp_mem_rename() is called by the platform copy-rename code to update 1188 * the lgroup topology which is changing as memory moves from one lgroup to 1189 * another. It removes the mnode from the source lgroup and re-inserts it in the 1190 * target lgroup. 
1191 * 1192 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and 1193 * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR 1194 * copy-rename operation. 1195 * 1196 * There is one case which requires special handling. If the system contains 1197 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the 1198 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by 1199 * lgrp_mem_init), but there is a window when the system has no memory in the 1200 * lgroup hierarchy. If another thread tries to allocate memory during this 1201 * window, the allocation will fail, although the system has physical memory. 1202 * This may cause a system panic or a deadlock (some sleeping memory allocations 1203 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting 1204 * the mnode back). 1205 * 1206 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the 1207 * lgrp with non-empty lgrp_mnodes. To deal with the special case above, 1208 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes, 1209 * but it updates the rest of the lgroup topology as if the mnode was actually 1210 * removed. The lgrp_mem_init() function recognizes that the mnode being 1211 * inserted represents such a special case and updates the topology 1212 * appropriately. 1213 */ 1214 void 1215 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to) 1216 { 1217 /* 1218 * Remove the memory from the source node and add it to the destination 1219 * node. 1220 */ 1221 lgrp_mem_fini(mnode, from, B_TRUE); 1222 lgrp_mem_init(mnode, to, B_TRUE); 1223 } 1224 1225 /* 1226 * Called to indicate that the lgrp with platform handle "hand" now 1227 * contains the memory identified by "mnode". 1228 * 1229 * LOCKING for this routine is a bit tricky. Usually it is called without 1230 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1231 * callers. 
During DR of the board containing the caged memory it may be called 1232 * with cpu_lock already held and CPUs paused. 1233 * 1234 * If the insertion is part of the DR copy-rename and the inserted mnode (and 1235 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are 1236 * dealing with the special case of DR copy-rename described in 1237 * lgrp_mem_rename(). 1238 */ 1239 void 1240 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1241 { 1242 klgrpset_t changed; 1243 int count; 1244 int i; 1245 lgrp_t *my_lgrp; 1246 lgrp_id_t lgrpid; 1247 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1248 boolean_t drop_lock = B_FALSE; 1249 boolean_t need_synch = B_FALSE; 1250 1251 /* 1252 * Grab CPU lock (if we haven't already) 1253 */ 1254 if (!MUTEX_HELD(&cpu_lock)) { 1255 mutex_enter(&cpu_lock); 1256 drop_lock = B_TRUE; 1257 } 1258 1259 /* 1260 * This routine may be called from a context where we already 1261 * hold cpu_lock, and have already paused cpus. 1262 */ 1263 if (!cpus_paused()) 1264 need_synch = B_TRUE; 1265 1266 /* 1267 * Check if this mnode is already configured and return immediately if 1268 * it is. 1269 * 1270 * NOTE: in special case of copy-rename of the only remaining mnode, 1271 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1272 * recognize this case and continue as usual, but skip the update to 1273 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1274 * in topology, temporarily introduced by lgrp_mem_fini(). 1275 */ 1276 if (! 
(is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1277 lgrp_root->lgrp_mnodes & mnodes_mask) { 1278 if (drop_lock) 1279 mutex_exit(&cpu_lock); 1280 return; 1281 } 1282 1283 /* 1284 * Update lgroup topology with new memory resources, keeping track of 1285 * which lgroups change 1286 */ 1287 count = 0; 1288 klgrpset_clear(changed); 1289 my_lgrp = lgrp_hand_to_lgrp(hand); 1290 if (my_lgrp == NULL) { 1291 /* new lgrp */ 1292 my_lgrp = lgrp_create(); 1293 lgrpid = my_lgrp->lgrp_id; 1294 my_lgrp->lgrp_plathand = hand; 1295 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1296 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1297 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1298 1299 if (need_synch) 1300 pause_cpus(NULL, NULL); 1301 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1302 &changed); 1303 if (need_synch) 1304 start_cpus(); 1305 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1306 > 0) { 1307 /* 1308 * Leaf lgroup was created, but latency wasn't available 1309 * then. So, set latency for it and fill in rest of lgroup 1310 * topology now that we know how far it is from other leaf 1311 * lgroups. 
1312 */ 1313 klgrpset_clear(changed); 1314 lgrpid = my_lgrp->lgrp_id; 1315 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1316 lgrpid)) 1317 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1318 if (need_synch) 1319 pause_cpus(NULL, NULL); 1320 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1321 &changed); 1322 if (need_synch) 1323 start_cpus(); 1324 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1325 my_lgrp->lgrp_id)) { 1326 /* 1327 * Add new lgroup memory resource to existing lgroup 1328 */ 1329 lgrpid = my_lgrp->lgrp_id; 1330 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1331 klgrpset_add(changed, lgrpid); 1332 count++; 1333 for (i = 0; i <= lgrp_alloc_max; i++) { 1334 lgrp_t *lgrp; 1335 1336 lgrp = lgrp_table[i]; 1337 if (!LGRP_EXISTS(lgrp) || 1338 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1339 continue; 1340 1341 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1342 klgrpset_add(changed, lgrp->lgrp_id); 1343 count++; 1344 } 1345 } 1346 1347 /* 1348 * Add memory node to lgroup and remove lgroup from ones that need 1349 * to be updated 1350 */ 1351 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1352 my_lgrp->lgrp_mnodes |= mnodes_mask; 1353 my_lgrp->lgrp_nmnodes++; 1354 } 1355 klgrpset_del(changed, lgrpid); 1356 1357 /* 1358 * Update memory node information for all lgroups that changed and 1359 * contain new memory node as a resource 1360 */ 1361 if (count) 1362 (void) lgrp_mnode_update(changed, NULL); 1363 1364 if (drop_lock) 1365 mutex_exit(&cpu_lock); 1366 } 1367 1368 /* 1369 * Called to indicate that the lgroup associated with the platform 1370 * handle "hand" no longer contains given memory node 1371 * 1372 * LOCKING for this routine is a bit tricky. Usually it is called without 1373 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1374 * callers. During DR of the board containing the caged memory it may be called 1375 * with cpu_lock already held and CPUs paused. 
1376 * 1377 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1378 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1379 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1380 * the same mnode back into the topology. See lgrp_mem_rename() and 1381 * lgrp_mem_init() for additional details. 1382 */ 1383 void 1384 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1385 { 1386 klgrpset_t changed; 1387 int count; 1388 int i; 1389 lgrp_t *my_lgrp; 1390 lgrp_id_t lgrpid; 1391 mnodeset_t mnodes_mask; 1392 boolean_t drop_lock = B_FALSE; 1393 boolean_t need_synch = B_FALSE; 1394 1395 /* 1396 * Grab CPU lock (if we haven't already) 1397 */ 1398 if (!MUTEX_HELD(&cpu_lock)) { 1399 mutex_enter(&cpu_lock); 1400 drop_lock = B_TRUE; 1401 } 1402 1403 /* 1404 * This routine may be called from a context where we already 1405 * hold cpu_lock and have already paused cpus. 1406 */ 1407 if (!cpus_paused()) 1408 need_synch = B_TRUE; 1409 1410 my_lgrp = lgrp_hand_to_lgrp(hand); 1411 1412 /* 1413 * The lgrp *must* be pre-existing 1414 */ 1415 ASSERT(my_lgrp != NULL); 1416 1417 /* 1418 * Delete memory node from lgroups which contain it 1419 */ 1420 mnodes_mask = ((mnodeset_t)1 << mnode); 1421 for (i = 0; i <= lgrp_alloc_max; i++) { 1422 lgrp_t *lgrp = lgrp_table[i]; 1423 /* 1424 * Skip any non-existent lgroups and any lgroups that don't 1425 * contain leaf lgroup of memory as a memory resource 1426 */ 1427 if (!LGRP_EXISTS(lgrp) || 1428 !(lgrp->lgrp_mnodes & mnodes_mask)) 1429 continue; 1430 1431 /* 1432 * Avoid removing the last mnode from the root in the DR 1433 * copy-rename case. See lgrp_mem_rename() for details. 1434 */ 1435 if (is_copy_rename && 1436 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1437 continue; 1438 1439 /* 1440 * Remove memory node from lgroup. 
1441 */ 1442 lgrp->lgrp_mnodes &= ~mnodes_mask; 1443 lgrp->lgrp_nmnodes--; 1444 ASSERT(lgrp->lgrp_nmnodes >= 0); 1445 } 1446 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1447 1448 /* 1449 * Don't need to update lgroup topology if this lgroup still has memory. 1450 * 1451 * In the special case of DR copy-rename with the only mnode being 1452 * removed, the lgrp_mnodes for the root is always non-zero, but we 1453 * still need to update the lgroup topology. 1454 */ 1455 if ((my_lgrp->lgrp_nmnodes > 0) && 1456 !(is_copy_rename && (my_lgrp == lgrp_root) && 1457 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1458 if (drop_lock) 1459 mutex_exit(&cpu_lock); 1460 return; 1461 } 1462 1463 /* 1464 * This lgroup does not contain any memory now 1465 */ 1466 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1467 1468 /* 1469 * Remove this lgroup from lgroup topology if it does not contain any 1470 * resources now 1471 */ 1472 lgrpid = my_lgrp->lgrp_id; 1473 count = 0; 1474 klgrpset_clear(changed); 1475 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1476 /* 1477 * Delete lgroup when no more resources 1478 */ 1479 if (need_synch) 1480 pause_cpus(NULL, NULL); 1481 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1482 lgrp_alloc_max + 1, &changed); 1483 ASSERT(count > 0); 1484 if (need_synch) 1485 start_cpus(); 1486 } else { 1487 /* 1488 * Remove lgroup from memory resources of any lgroups that 1489 * contain it as such 1490 */ 1491 for (i = 0; i <= lgrp_alloc_max; i++) { 1492 lgrp_t *lgrp; 1493 1494 lgrp = lgrp_table[i]; 1495 if (!LGRP_EXISTS(lgrp) || 1496 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1497 lgrpid)) 1498 continue; 1499 1500 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1501 } 1502 } 1503 if (drop_lock) 1504 mutex_exit(&cpu_lock); 1505 } 1506 1507 /* 1508 * Return lgroup with given platform handle 1509 */ 1510 lgrp_t * 1511 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1512 { 1513 int i; 1514 lgrp_t *lgrp; 1515 1516 if (hand == LGRP_NULL_HANDLE) 1517 return (NULL); 1518 1519 for (i = 
0; i <= lgrp_alloc_max; i++) { 1520 lgrp = lgrp_table[i]; 1521 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1522 return (lgrp); 1523 } 1524 return (NULL); 1525 } 1526 1527 /* 1528 * Return the home lgroup of the current thread. 1529 * We must do this with kernel preemption disabled, since we don't want our 1530 * thread to be re-homed while we're poking around with its lpl, and the lpl 1531 * should never be NULL. 1532 * 1533 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1534 * is enabled because of DR. Callers can use disable kernel preemption 1535 * around this call to guarantee that the lgroup will be valid beyond this 1536 * routine, since kernel preemption can be recursive. 1537 */ 1538 lgrp_t * 1539 lgrp_home_lgrp(void) 1540 { 1541 lgrp_t *lgrp; 1542 lpl_t *lpl; 1543 1544 kpreempt_disable(); 1545 1546 lpl = curthread->t_lpl; 1547 ASSERT(lpl != NULL); 1548 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1549 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1550 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1551 1552 kpreempt_enable(); 1553 1554 return (lgrp); 1555 } 1556 1557 /* 1558 * Return ID of home lgroup for given thread 1559 * (See comments for lgrp_home_lgrp() for special care and handling 1560 * instructions) 1561 */ 1562 lgrp_id_t 1563 lgrp_home_id(kthread_t *t) 1564 { 1565 lgrp_id_t lgrp; 1566 lpl_t *lpl; 1567 1568 ASSERT(t != NULL); 1569 /* 1570 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1571 * cannot since the HAT layer can call into this routine to 1572 * determine the locality for its data structures in the context 1573 * of a page fault. 
1574 */ 1575 1576 kpreempt_disable(); 1577 1578 lpl = t->t_lpl; 1579 ASSERT(lpl != NULL); 1580 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1581 lgrp = lpl->lpl_lgrpid; 1582 1583 kpreempt_enable(); 1584 1585 return (lgrp); 1586 } 1587 1588 /* 1589 * Return lgroup containing the physical memory for the given page frame number 1590 */ 1591 lgrp_t * 1592 lgrp_pfn_to_lgrp(pfn_t pfn) 1593 { 1594 lgrp_handle_t hand; 1595 int i; 1596 lgrp_t *lgrp; 1597 1598 hand = lgrp_plat_pfn_to_hand(pfn); 1599 if (hand != LGRP_NULL_HANDLE) 1600 for (i = 0; i <= lgrp_alloc_max; i++) { 1601 lgrp = lgrp_table[i]; 1602 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1603 return (lgrp); 1604 } 1605 return (NULL); 1606 } 1607 1608 /* 1609 * Return lgroup containing the physical memory for the given page frame number 1610 */ 1611 lgrp_t * 1612 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1613 { 1614 lgrp_handle_t hand; 1615 int i; 1616 lgrp_t *lgrp; 1617 pfn_t pfn; 1618 1619 pfn = btop(physaddr); 1620 hand = lgrp_plat_pfn_to_hand(pfn); 1621 if (hand != LGRP_NULL_HANDLE) 1622 for (i = 0; i <= lgrp_alloc_max; i++) { 1623 lgrp = lgrp_table[i]; 1624 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1625 return (lgrp); 1626 } 1627 return (NULL); 1628 } 1629 1630 /* 1631 * Return the leaf lgroup containing the given CPU 1632 * 1633 * The caller needs to take precautions necessary to prevent 1634 * "cpu", and it's lpl from going away across a call to this function. 1635 * hint: kpreempt_disable()/kpreempt_enable() 1636 */ 1637 static lgrp_t * 1638 lgrp_cpu_to_lgrp(cpu_t *cpu) 1639 { 1640 return (cpu->cpu_lpl->lpl_lgrp); 1641 } 1642 1643 /* 1644 * Return the sum of the partition loads in an lgrp divided by 1645 * the number of CPUs in the lgrp. This is our best approximation 1646 * of an 'lgroup load average' for a useful per-lgroup kstat. 
1647 */ 1648 static uint64_t 1649 lgrp_sum_loadavgs(lgrp_t *lgrp) 1650 { 1651 cpu_t *cpu; 1652 int ncpu; 1653 uint64_t loads = 0; 1654 1655 mutex_enter(&cpu_lock); 1656 1657 cpu = lgrp->lgrp_cpu; 1658 ncpu = lgrp->lgrp_cpucnt; 1659 1660 if (cpu == NULL || ncpu == 0) { 1661 mutex_exit(&cpu_lock); 1662 return (0ull); 1663 } 1664 1665 do { 1666 loads += cpu->cpu_lpl->lpl_loadavg; 1667 cpu = cpu->cpu_next_lgrp; 1668 } while (cpu != lgrp->lgrp_cpu); 1669 1670 mutex_exit(&cpu_lock); 1671 1672 return (loads / ncpu); 1673 } 1674 1675 void 1676 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1677 { 1678 struct lgrp_stats *pstats; 1679 1680 /* 1681 * Verify that the caller isn't trying to add to 1682 * a statistic for an lgroup that has gone away 1683 */ 1684 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1685 return; 1686 1687 pstats = &lgrp_stats[lgrpid]; 1688 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1689 } 1690 1691 int64_t 1692 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1693 { 1694 uint64_t val; 1695 struct lgrp_stats *pstats; 1696 1697 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1698 return ((int64_t)0); 1699 1700 pstats = &lgrp_stats[lgrpid]; 1701 LGRP_STAT_READ(pstats, stat, val); 1702 return (val); 1703 } 1704 1705 /* 1706 * Reset all kstats for lgrp specified by its lgrpid. 1707 */ 1708 static void 1709 lgrp_kstat_reset(lgrp_id_t lgrpid) 1710 { 1711 lgrp_stat_t stat; 1712 1713 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1714 return; 1715 1716 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1717 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1718 } 1719 } 1720 1721 /* 1722 * Collect all per-lgrp statistics for the lgrp associated with this 1723 * kstat, and store them in the ks_data array. 1724 * 1725 * The superuser can reset all the running counter statistics for an 1726 * lgrp by writing to any of the lgrp's stats. 
1727 */ 1728 static int 1729 lgrp_kstat_extract(kstat_t *ksp, int rw) 1730 { 1731 lgrp_stat_t stat; 1732 struct kstat_named *ksd; 1733 lgrp_t *lgrp; 1734 lgrp_id_t lgrpid; 1735 1736 lgrp = (lgrp_t *)ksp->ks_private; 1737 1738 ksd = (struct kstat_named *)ksp->ks_data; 1739 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1740 1741 lgrpid = lgrp->lgrp_id; 1742 1743 if (lgrpid == LGRP_NONE) { 1744 /* 1745 * Return all zeroes as stats for freed lgrp. 1746 */ 1747 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1748 ksd[stat].value.i64 = 0; 1749 } 1750 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1751 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1752 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1753 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1754 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1755 } else if (rw != KSTAT_WRITE) { 1756 /* 1757 * Handle counter stats 1758 */ 1759 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1760 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1761 } 1762 1763 /* 1764 * Handle kernel data snapshot stats 1765 */ 1766 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1767 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1768 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1769 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1770 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1771 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1772 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1773 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1774 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1775 lgrp_loadavg_max_effect; 1776 } else { 1777 lgrp_kstat_reset(lgrpid); 1778 } 1779 1780 return (0); 1781 } 1782 1783 int 1784 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1785 { 1786 cpu_t *cp; 1787 1788 mutex_enter(&cpu_lock); 1789 1790 if ((cp = cpu_get(id)) == NULL) { 1791 mutex_exit(&cpu_lock); 1792 return (EINVAL); 1793 } 1794 1795 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1796 mutex_exit(&cpu_lock); 1797 return (EINVAL); 1798 } 1799 
1800 ASSERT(cp->cpu_lpl != NULL); 1801 1802 *lp = cp->cpu_lpl->lpl_lgrpid; 1803 1804 mutex_exit(&cpu_lock); 1805 1806 return (0); 1807 } 1808 1809 int 1810 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1811 { 1812 cpu_t *cp; 1813 1814 mutex_enter(&cpu_lock); 1815 1816 if ((cp = cpu_get(id)) == NULL) { 1817 mutex_exit(&cpu_lock); 1818 return (EINVAL); 1819 } 1820 1821 ASSERT(cp->cpu_lpl != NULL); 1822 1823 *lp = cp->cpu_lpl->lpl_loadavg; 1824 1825 mutex_exit(&cpu_lock); 1826 1827 return (0); 1828 } 1829 1830 /* 1831 * Add a resource named by lpl_leaf to rset of lpl_target 1832 * 1833 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1834 * resource. It is adjusted here, as this is presently the only place that we 1835 * can be certain a resource addition has succeeded. 1836 * 1837 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1838 * list in order until it reaches a NULL. (This list is required to be NULL 1839 * terminated, too). This is done so that we can mark start pos + 1, so that 1840 * each lpl is traversed sequentially, but in a different order. We hope this 1841 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 1842 */ 1843 1844 void 1845 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1846 { 1847 int i; 1848 int entry_slot = 0; 1849 1850 /* return if leaf is already present */ 1851 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1852 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1853 return; 1854 } 1855 1856 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1857 lpl_leaf->lpl_lgrpid) { 1858 break; 1859 } 1860 } 1861 1862 /* insert leaf, update counts */ 1863 entry_slot = i; 1864 i = lpl_target->lpl_nrset++; 1865 1866 /* 1867 * Start at the end of the rset array and work backwards towards the 1868 * slot into which the new lpl will be inserted. 
This effectively 1869 * preserves the current ordering by scooting everybody over one entry, 1870 * and placing the new entry into the space created. 1871 */ 1872 while (i-- > entry_slot) { 1873 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1874 lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] = 1875 i + 1; 1876 } 1877 1878 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1879 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot; 1880 1881 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1882 } 1883 1884 /* 1885 * Update each of lpl_parent's children with a reference to their parent. 1886 * The lgrp topology is used as the reference since it is fully 1887 * consistent and correct at this point. 1888 * This should be called after any potential change in lpl_parent's 1889 * rset. 1890 */ 1891 static void 1892 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1893 { 1894 klgrpset_t children; 1895 int i; 1896 1897 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1898 if (klgrpset_isempty(children)) 1899 return; /* nothing to do */ 1900 1901 for (i = 0; i <= lgrp_alloc_max; i++) { 1902 if (klgrpset_ismember(children, i)) { 1903 /* 1904 * (Re)set the parent. It may be incorrect if 1905 * lpl_parent is new in the topology. 1906 */ 1907 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1908 } 1909 } 1910 } 1911 1912 /* 1913 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1914 * 1915 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1916 * resource. The values are adjusted here, as this is the only place that we can 1917 * be certain a resource was successfully deleted. 
1918 */ 1919 void 1920 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1921 { 1922 int i; 1923 lpl_t *leaf; 1924 1925 if (lpl_target->lpl_nrset == 0) 1926 return; 1927 1928 /* find leaf in intermediate node */ 1929 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1930 if (lpl_target->lpl_rset[i] == lpl_leaf) 1931 break; 1932 } 1933 1934 /* return if leaf not found */ 1935 if (lpl_target->lpl_rset[i] != lpl_leaf) 1936 return; 1937 1938 /* prune leaf, compress array */ 1939 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1940 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1; 1941 lpl_target->lpl_ncpu--; 1942 do { 1943 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1944 /* 1945 * Update the lgrp id <=> rset mapping 1946 */ 1947 if ((leaf = lpl_target->lpl_rset[i]) != NULL) { 1948 lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i; 1949 } 1950 } while (i++ < lpl_target->lpl_nrset); 1951 } 1952 1953 /* 1954 * Check to see if the resource set of the target lpl contains the 1955 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1956 */ 1957 1958 int 1959 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1960 { 1961 int i; 1962 1963 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1964 if (lpl_target->lpl_rset[i] == lpl_leaf) 1965 return (1); 1966 } 1967 1968 return (0); 1969 } 1970 1971 /* 1972 * Called when we change cpu lpl membership. This increments or decrements the 1973 * per-cpu counter in every lpl in which our leaf appears. 
1974 */ 1975 void 1976 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1977 { 1978 cpupart_t *cpupart; 1979 lgrp_t *lgrp_leaf; 1980 lgrp_t *lgrp_cur; 1981 lpl_t *lpl_leaf; 1982 lpl_t *lpl_cur; 1983 int i; 1984 1985 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1986 1987 cpupart = cp->cpu_part; 1988 lpl_leaf = cp->cpu_lpl; 1989 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1990 1991 for (i = 0; i <= lgrp_alloc_max; i++) { 1992 lgrp_cur = lgrp_table[i]; 1993 1994 /* 1995 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 1996 * for the cpu in question, or if the current lgrp and leaf 1997 * don't share the same resources. 1998 */ 1999 2000 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 2001 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 2002 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 2003 continue; 2004 2005 2006 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2007 2008 if (lpl_cur->lpl_nrset > 0) { 2009 if (act == LPL_INCREMENT) { 2010 lpl_cur->lpl_ncpu++; 2011 } else if (act == LPL_DECREMENT) { 2012 lpl_cur->lpl_ncpu--; 2013 } 2014 } 2015 } 2016 } 2017 2018 /* 2019 * Initialize lpl with given resources and specified lgrp 2020 */ 2021 void 2022 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2023 { 2024 lpl->lpl_lgrpid = lgrp->lgrp_id; 2025 lpl->lpl_loadavg = 0; 2026 if (lpl == lpl_leaf) 2027 lpl->lpl_ncpu = 1; 2028 else 2029 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2030 lpl->lpl_nrset = 1; 2031 lpl->lpl_rset[0] = lpl_leaf; 2032 lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0; 2033 lpl->lpl_lgrp = lgrp; 2034 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2035 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2036 } 2037 2038 /* 2039 * Clear an unused lpl 2040 */ 2041 void 2042 lpl_clear(lpl_t *lpl) 2043 { 2044 /* 2045 * Clear out all fields in the lpl except: 2046 * lpl_lgrpid - to facilitate debugging 2047 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size 2048 * 2049 * Note that the lpl's rset and id2rset 
mapping are cleared as well. 2050 */ 2051 lpl->lpl_loadavg = 0; 2052 lpl->lpl_ncpu = 0; 2053 lpl->lpl_lgrp = NULL; 2054 lpl->lpl_parent = NULL; 2055 lpl->lpl_cpus = NULL; 2056 lpl->lpl_nrset = 0; 2057 lpl->lpl_homed_time = 0; 2058 bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz); 2059 bzero(lpl->lpl_id2rset, 2060 sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz); 2061 } 2062 2063 /* 2064 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2065 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2066 * make full use of all of the lgroup topology, but this checks to make sure 2067 * that for the parts that it does use, it has correctly understood the 2068 * relationships that exist. This function returns 2069 * 0 if the topology is correct, and a non-zero error code, for non-debug 2070 * kernels if incorrect. Asserts are spread throughout the code to aid in 2071 * debugging on a DEBUG kernel. 2072 */ 2073 int 2074 lpl_topo_verify(cpupart_t *cpupart) 2075 { 2076 lgrp_t *lgrp; 2077 lpl_t *lpl; 2078 klgrpset_t rset; 2079 klgrpset_t cset; 2080 cpu_t *cpu; 2081 cpu_t *cp_start; 2082 int i; 2083 int j; 2084 int sum; 2085 2086 /* topology can't be incorrect if it doesn't exist */ 2087 if (!lgrp_topo_initialized || !lgrp_initialized) 2088 return (LPL_TOPO_CORRECT); 2089 2090 ASSERT(cpupart != NULL); 2091 2092 for (i = 0; i <= lgrp_alloc_max; i++) { 2093 lgrp = lgrp_table[i]; 2094 lpl = NULL; 2095 /* make sure lpls are allocated */ 2096 ASSERT(cpupart->cp_lgrploads); 2097 if (!cpupart->cp_lgrploads) 2098 return (LPL_TOPO_PART_HAS_NO_LPL); 2099 2100 lpl = &cpupart->cp_lgrploads[i]; 2101 /* make sure our index is good */ 2102 ASSERT(i < cpupart->cp_nlgrploads); 2103 2104 /* if lgroup doesn't exist, make sure lpl is empty */ 2105 if (!LGRP_EXISTS(lgrp)) { 2106 ASSERT(lpl->lpl_ncpu == 0); 2107 if (lpl->lpl_ncpu > 0) { 2108 return (LPL_TOPO_CPUS_NOT_EMPTY); 2109 } else { 2110 continue; 2111 } 2112 } 2113 2114 /* 
verify that lgroup and lpl are identically numbered */ 2115 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2116 2117 /* if lgroup isn't in our partition, make sure lpl is empty */ 2118 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2119 cpupart->cp_lgrpset)) { 2120 ASSERT(lpl->lpl_ncpu == 0); 2121 if (lpl->lpl_ncpu > 0) { 2122 return (LPL_TOPO_CPUS_NOT_EMPTY); 2123 } 2124 /* 2125 * lpl is empty, and lgroup isn't in partition. verify 2126 * that lpl doesn't show up in anyone else's rsets (in 2127 * this partition, anyway) 2128 */ 2129 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2130 lpl_t *i_lpl; /* lpl we're iterating over */ 2131 2132 i_lpl = &cpupart->cp_lgrploads[j]; 2133 2134 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2135 if (lpl_rset_contains(i_lpl, lpl)) { 2136 return (LPL_TOPO_LPL_ORPHANED); 2137 } 2138 } 2139 /* lgroup is empty, and everything is ok. continue */ 2140 continue; 2141 } 2142 2143 2144 /* lgroup is in this partition, now check it against lpl */ 2145 2146 /* do both have matching lgrps? */ 2147 ASSERT(lgrp == lpl->lpl_lgrp); 2148 if (lgrp != lpl->lpl_lgrp) { 2149 return (LPL_TOPO_LGRP_MISMATCH); 2150 } 2151 2152 /* do the parent lgroups exist and do they match? 
*/ 2153 if (lgrp->lgrp_parent) { 2154 ASSERT(lpl->lpl_parent); 2155 ASSERT(lgrp->lgrp_parent->lgrp_id == 2156 lpl->lpl_parent->lpl_lgrpid); 2157 2158 if (!lpl->lpl_parent) { 2159 return (LPL_TOPO_MISSING_PARENT); 2160 } else if (lgrp->lgrp_parent->lgrp_id != 2161 lpl->lpl_parent->lpl_lgrpid) { 2162 return (LPL_TOPO_PARENT_MISMATCH); 2163 } 2164 } 2165 2166 /* only leaf lgroups keep a cpucnt, only check leaves */ 2167 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2168 2169 /* verify that lgrp is also a leaf */ 2170 ASSERT((lgrp->lgrp_childcnt == 0) && 2171 (klgrpset_ismember(lgrp->lgrp_leaves, 2172 lpl->lpl_lgrpid))); 2173 2174 if ((lgrp->lgrp_childcnt > 0) || 2175 (!klgrpset_ismember(lgrp->lgrp_leaves, 2176 lpl->lpl_lgrpid))) { 2177 return (LPL_TOPO_LGRP_NOT_LEAF); 2178 } 2179 2180 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2181 (lpl->lpl_ncpu > 0)); 2182 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2183 (lpl->lpl_ncpu <= 0)) { 2184 return (LPL_TOPO_BAD_CPUCNT); 2185 } 2186 2187 /* 2188 * Check that lpl_ncpu also matches the number of 2189 * cpus in the lpl's linked list. This only exists in 2190 * leaves, but they should always match. 
2191 */ 2192 j = 0; 2193 cpu = cp_start = lpl->lpl_cpus; 2194 while (cpu != NULL) { 2195 j++; 2196 2197 /* check to make sure cpu's lpl is leaf lpl */ 2198 ASSERT(cpu->cpu_lpl == lpl); 2199 if (cpu->cpu_lpl != lpl) { 2200 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2201 } 2202 2203 /* check next cpu */ 2204 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2205 continue; 2206 } else { 2207 cpu = NULL; 2208 } 2209 } 2210 2211 ASSERT(j == lpl->lpl_ncpu); 2212 if (j != lpl->lpl_ncpu) { 2213 return (LPL_TOPO_LPL_BAD_NCPU); 2214 } 2215 2216 /* 2217 * Also, check that leaf lpl is contained in all 2218 * intermediate lpls that name the leaf as a descendant 2219 */ 2220 for (j = 0; j <= lgrp_alloc_max; j++) { 2221 klgrpset_t intersect; 2222 lgrp_t *lgrp_cand; 2223 lpl_t *lpl_cand; 2224 2225 lgrp_cand = lgrp_table[j]; 2226 intersect = klgrpset_intersects( 2227 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2228 cpupart->cp_lgrpset); 2229 2230 if (!LGRP_EXISTS(lgrp_cand) || 2231 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2232 cpupart->cp_lgrpset) || 2233 (intersect == 0)) 2234 continue; 2235 2236 lpl_cand = 2237 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2238 2239 if (klgrpset_ismember(intersect, 2240 lgrp->lgrp_id)) { 2241 ASSERT(lpl_rset_contains(lpl_cand, 2242 lpl)); 2243 2244 if (!lpl_rset_contains(lpl_cand, lpl)) { 2245 return (LPL_TOPO_RSET_MSSNG_LF); 2246 } 2247 } 2248 } 2249 2250 } else { /* non-leaf specific checks */ 2251 2252 /* 2253 * Non-leaf lpls should have lpl_cpus == NULL 2254 * verify that this is so 2255 */ 2256 ASSERT(lpl->lpl_cpus == NULL); 2257 if (lpl->lpl_cpus != NULL) { 2258 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2259 } 2260 2261 /* 2262 * verify that the sum of the cpus in the leaf resources 2263 * is equal to the total ncpu in the intermediate 2264 */ 2265 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2266 sum += lpl->lpl_rset[j]->lpl_ncpu; 2267 } 2268 2269 ASSERT(sum == lpl->lpl_ncpu); 2270 if (sum != lpl->lpl_ncpu) { 2271 return (LPL_TOPO_LPL_BAD_NCPU); 2272 } 2273 } 2274 
2275 /* 2276 * Check the rset of the lpl in question. Make sure that each 2277 * rset contains a subset of the resources in 2278 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2279 * sure that each rset doesn't include resources that are 2280 * outside of that set. (Which would be resources somehow not 2281 * accounted for). 2282 */ 2283 klgrpset_clear(rset); 2284 for (j = 0; j < lpl->lpl_nrset; j++) { 2285 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2286 } 2287 klgrpset_copy(cset, rset); 2288 /* make sure lpl rset matches lgrp rset */ 2289 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2290 /* make sure rset is contained with in partition, too */ 2291 klgrpset_diff(cset, cpupart->cp_lgrpset); 2292 2293 ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset)); 2294 if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) { 2295 return (LPL_TOPO_RSET_MISMATCH); 2296 } 2297 2298 /* 2299 * check to make sure lpl_nrset matches the number of rsets 2300 * contained in the lpl 2301 */ 2302 for (j = 0; j < lpl->lpl_nrset; j++) { 2303 if (lpl->lpl_rset[j] == NULL) 2304 break; 2305 } 2306 2307 ASSERT(j == lpl->lpl_nrset); 2308 if (j != lpl->lpl_nrset) { 2309 return (LPL_TOPO_BAD_RSETCNT); 2310 } 2311 2312 } 2313 return (LPL_TOPO_CORRECT); 2314 } 2315 2316 /* 2317 * Flatten lpl topology to given number of levels. This is presently only 2318 * implemented for a flatten to 2 levels, which will prune out the intermediates 2319 * and home the leaf lpls to the root lpl. 2320 */ 2321 int 2322 lpl_topo_flatten(int levels) 2323 { 2324 int i; 2325 uint_t sum; 2326 lgrp_t *lgrp_cur; 2327 lpl_t *lpl_cur; 2328 lpl_t *lpl_root; 2329 cpupart_t *cp; 2330 2331 if (levels != 2) 2332 return (0); 2333 2334 /* called w/ cpus paused - grab no locks! 
*/ 2335 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2336 !lgrp_initialized); 2337 2338 cp = cp_list_head; 2339 do { 2340 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2341 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2342 2343 for (i = 0; i <= lgrp_alloc_max; i++) { 2344 lgrp_cur = lgrp_table[i]; 2345 lpl_cur = &cp->cp_lgrploads[i]; 2346 2347 if ((lgrp_cur == lgrp_root) || 2348 (!LGRP_EXISTS(lgrp_cur) && 2349 (lpl_cur->lpl_ncpu == 0))) 2350 continue; 2351 2352 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2353 /* 2354 * this should be a deleted intermediate, so 2355 * clear it 2356 */ 2357 lpl_clear(lpl_cur); 2358 } else if ((lpl_cur->lpl_nrset == 1) && 2359 (lpl_cur->lpl_rset[0] == lpl_cur) && 2360 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2361 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2362 /* 2363 * this is a leaf whose parent was deleted, or 2364 * whose parent had their lgrp deleted. (And 2365 * whose parent will soon be deleted). Point 2366 * this guy back to the root lpl. 2367 */ 2368 lpl_cur->lpl_parent = lpl_root; 2369 lpl_rset_add(lpl_root, lpl_cur); 2370 } 2371 2372 } 2373 2374 /* 2375 * Now that we're done, make sure the count on the root lpl is 2376 * correct, and update the hints of the children for the sake of 2377 * thoroughness 2378 */ 2379 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2380 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2381 } 2382 lpl_root->lpl_ncpu = sum; 2383 lpl_child_update(lpl_root, cp); 2384 2385 cp = cp->cp_next; 2386 } while (cp != cp_list_head); 2387 2388 return (levels); 2389 } 2390 2391 /* 2392 * Insert a lpl into the resource hierarchy and create any additional lpls that 2393 * are necessary to represent the varying states of locality for the cpu 2394 * resoruces newly added to the partition. 2395 * 2396 * This routine is clever enough that it can correctly add resources from the 2397 * new leaf into both direct and indirect resource sets in the hierarchy. 
(Ie, 2398 * those for which the lpl is a leaf as opposed to simply a named equally local 2399 * resource). The one special case that needs additional processing is when a 2400 * new intermediate lpl is introduced. Since the main loop only traverses 2401 * looking to add the leaf resource where it does not yet exist, additional work 2402 * is necessary to add other leaf resources that may need to exist in the newly 2403 * created intermediate. This is performed by the second inner loop, and is 2404 * only done when the check for more than one overlapping resource succeeds. 2405 */ 2406 2407 void 2408 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2409 { 2410 int i; 2411 int j; 2412 int rset_num_intersect; 2413 lgrp_t *lgrp_cur; 2414 lpl_t *lpl_cur; 2415 lpl_t *lpl_parent; 2416 lgrp_id_t parent_id; 2417 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2418 2419 for (i = 0; i <= lgrp_alloc_max; i++) { 2420 lgrp_cur = lgrp_table[i]; 2421 2422 /* 2423 * Don't insert if the lgrp isn't there, if the leaf isn't 2424 * contained within the current lgrp, or if the current lgrp has 2425 * no leaves in this partition 2426 */ 2427 2428 if (!LGRP_EXISTS(lgrp_cur) || 2429 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2430 lpl_leaf->lpl_lgrpid) || 2431 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2432 cpupart->cp_lgrpset)) 2433 continue; 2434 2435 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2436 if (lgrp_cur->lgrp_parent != NULL) { 2437 /* if lgrp has a parent, assign it properly */ 2438 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2439 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2440 } else { 2441 /* if not, make sure parent ptr gets set to null */ 2442 lpl_parent = NULL; 2443 } 2444 2445 if (lpl_cur == lpl_leaf) { 2446 /* 2447 * Almost all leaf state was initialized elsewhere. The 2448 * only thing left to do is to set the parent. 
2449 */ 2450 lpl_cur->lpl_parent = lpl_parent; 2451 continue; 2452 } 2453 2454 lpl_clear(lpl_cur); 2455 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2456 2457 lpl_cur->lpl_parent = lpl_parent; 2458 2459 /* does new lpl need to be populated with other resources? */ 2460 rset_intersect = 2461 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2462 cpupart->cp_lgrpset); 2463 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2464 2465 if (rset_num_intersect > 1) { 2466 /* 2467 * If so, figure out what lpls have resources that 2468 * intersect this one, and add them. 2469 */ 2470 for (j = 0; j <= lgrp_alloc_max; j++) { 2471 lgrp_t *lgrp_cand; /* candidate lgrp */ 2472 lpl_t *lpl_cand; /* candidate lpl */ 2473 2474 lgrp_cand = lgrp_table[j]; 2475 if (!LGRP_EXISTS(lgrp_cand) || 2476 !klgrpset_ismember(rset_intersect, 2477 lgrp_cand->lgrp_id)) 2478 continue; 2479 lpl_cand = 2480 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2481 lpl_rset_add(lpl_cur, lpl_cand); 2482 } 2483 } 2484 /* 2485 * This lpl's rset has changed. Update the hint in it's 2486 * children. 2487 */ 2488 lpl_child_update(lpl_cur, cpupart); 2489 } 2490 } 2491 2492 /* 2493 * remove a lpl from the hierarchy of resources, clearing its state when 2494 * finished. If the lpls at the intermediate levels of the hierarchy have no 2495 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2496 * delete them as well. 2497 */ 2498 2499 void 2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2501 { 2502 int i; 2503 lgrp_t *lgrp_cur; 2504 lpl_t *lpl_cur; 2505 klgrpset_t leaf_intersect; /* intersection of leaves */ 2506 2507 for (i = 0; i <= lgrp_alloc_max; i++) { 2508 lgrp_cur = lgrp_table[i]; 2509 2510 /* 2511 * Don't attempt to remove from lgrps that aren't there, that 2512 * don't contain our leaf, or from the leaf itself. 
(We do that 2513 * later) 2514 */ 2515 2516 if (!LGRP_EXISTS(lgrp_cur)) 2517 continue; 2518 2519 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2520 2521 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2522 lpl_leaf->lpl_lgrpid) || 2523 (lpl_cur == lpl_leaf)) { 2524 continue; 2525 } 2526 2527 /* 2528 * This is a slightly sleazy simplification in that we have 2529 * already marked the cp_lgrpset as no longer containing the 2530 * leaf we've deleted. Any lpls that pass the above checks 2531 * based upon lgrp membership but not necessarily cpu-part 2532 * membership also get cleared by the checks below. Currently 2533 * this is harmless, as the lpls should be empty anyway. 2534 * 2535 * In particular, we want to preserve lpls that have additional 2536 * leaf resources, even though we don't yet have a processor 2537 * architecture that represents resources this way. 2538 */ 2539 2540 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2541 cpupart->cp_lgrpset); 2542 2543 lpl_rset_del(lpl_cur, lpl_leaf); 2544 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2545 lpl_clear(lpl_cur); 2546 } else { 2547 /* 2548 * Update this lpl's children 2549 */ 2550 lpl_child_update(lpl_cur, cpupart); 2551 } 2552 } 2553 lpl_clear(lpl_leaf); 2554 } 2555 2556 /* 2557 * add a cpu to a partition in terms of lgrp load avg bookeeping 2558 * 2559 * The lpl (cpu partition load average information) is now arranged in a 2560 * hierarchical fashion whereby resources that are closest, ie. most local, to 2561 * the cpu in question are considered to be leaves in a tree of resources. 2562 * There are two general cases for cpu additon: 2563 * 2564 * 1. A lpl structure that contains resources already in the hierarchy tree. 
2565 * In this case, all of the associated lpl relationships have been defined, and 2566 * all that is necessary is that we link the new cpu into the per-lpl list of 2567 * cpus, and increment the ncpu count of all places where this cpu resource will 2568 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2569 * pushing is accomplished by this routine. 2570 * 2571 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2572 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2573 * construct the hierarchy of state necessary to name it's more distant 2574 * resources, if they should exist. The leaf structure is initialized by this 2575 * routine, as is the cpu-partition state for the lgrp membership. This routine 2576 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2577 * and builds all of the "ancestoral" state necessary to identify resources at 2578 * differing levels of locality. 2579 */ 2580 void 2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2582 { 2583 cpupart_t *cpupart; 2584 lgrp_t *lgrp_leaf; 2585 lpl_t *lpl_leaf; 2586 2587 /* called sometimes w/ cpus paused - grab no locks */ 2588 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2589 2590 cpupart = cp->cpu_part; 2591 lgrp_leaf = lgrp_table[lgrpid]; 2592 2593 /* don't add non-existent lgrp */ 2594 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2595 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2596 cp->cpu_lpl = lpl_leaf; 2597 2598 /* only leaf lpls contain cpus */ 2599 2600 if (lpl_leaf->lpl_ncpu++ == 0) { 2601 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2602 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2603 lpl_leaf_insert(lpl_leaf, cpupart); 2604 } else { 2605 /* 2606 * the lpl should already exist in the parent, so just update 2607 * the count of available CPUs 2608 */ 2609 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2610 } 2611 2612 /* link cpu into list of cpus in lpl */ 2613 2614 if (lpl_leaf->lpl_cpus) { 2615 
cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2616 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2617 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2618 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2619 } else { 2620 /* 2621 * We increment ncpu immediately after we create a new leaf 2622 * lpl, so assert that ncpu == 1 for the case where we don't 2623 * have any cpu pointers yet. 2624 */ 2625 ASSERT(lpl_leaf->lpl_ncpu == 1); 2626 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2627 } 2628 2629 } 2630 2631 2632 /* 2633 * remove a cpu from a partition in terms of lgrp load avg bookeeping 2634 * 2635 * The lpl (cpu partition load average information) is now arranged in a 2636 * hierarchical fashion whereby resources that are closest, ie. most local, to 2637 * the cpu in question are considered to be leaves in a tree of resources. 2638 * There are two removal cases in question: 2639 * 2640 * 1. Removal of the resource in the leaf leaves other resources remaining in 2641 * that leaf. (Another cpu still exists at this level of locality). In this 2642 * case, the count of available cpus is decremented in all assocated lpls by 2643 * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned 2644 * from the per-cpu lpl list. 2645 * 2646 * 2. Removal of the resource results in the lpl containing no resources. (It's 2647 * empty) In this case, all of what has occurred for the first step must take 2648 * place; however, additionally we must remove the lpl structure itself, prune 2649 * out any stranded lpls that do not directly name a leaf resource, and mark the 2650 * cpu partition in question as no longer containing resources from the lgrp of 2651 * the lpl that has been delted. Cpu-partition changes are handled by this 2652 * method, but the lpl_leaf_remove function deals with the details of pruning 2653 * out the empty lpl and any of its orphaned direct ancestors. 
2654 */ 2655 void 2656 lgrp_part_del_cpu(cpu_t *cp) 2657 { 2658 lpl_t *lpl; 2659 lpl_t *leaf_lpl; 2660 lgrp_t *lgrp_leaf; 2661 2662 /* called sometimes w/ cpus paused - grab no locks */ 2663 2664 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2665 2666 lpl = leaf_lpl = cp->cpu_lpl; 2667 lgrp_leaf = leaf_lpl->lpl_lgrp; 2668 2669 /* don't delete a leaf that isn't there */ 2670 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2671 2672 /* no double-deletes */ 2673 ASSERT(lpl->lpl_ncpu); 2674 if (--lpl->lpl_ncpu == 0) { 2675 /* 2676 * This was the last cpu in this lgroup for this partition, 2677 * clear its bit in the partition's lgroup bitmask 2678 */ 2679 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2680 2681 /* eliminate remaning lpl link pointers in cpu, lpl */ 2682 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2683 2684 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2685 } else { 2686 2687 /* unlink cpu from lists of cpus in lpl */ 2688 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2689 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2690 if (lpl->lpl_cpus == cp) { 2691 lpl->lpl_cpus = cp->cpu_next_lpl; 2692 } 2693 2694 /* 2695 * Update the cpu count in the lpls associated with parent 2696 * lgroups. 2697 */ 2698 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2699 2700 } 2701 /* clear cpu's lpl ptr when we're all done */ 2702 cp->cpu_lpl = NULL; 2703 } 2704 2705 /* 2706 * Recompute load average for the specified partition/lgrp fragment. 2707 * 2708 * We rely on the fact that this routine is called from the clock thread 2709 * at a point before the clock thread can block (i.e. before its first 2710 * lock request). Since the clock thread can not be preempted (since it 2711 * runs at highest priority), we know that cpu partitions can not change 2712 * (since doing so would require either the repartition requester or the 2713 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2714 * without grabbing cpu_lock. 
2715 */ 2716 void 2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2718 { 2719 uint_t ncpu; 2720 int64_t old, new, f; 2721 2722 /* 2723 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2724 */ 2725 static short expval[] = { 2726 0, 3196, 1618, 1083, 2727 814, 652, 543, 466, 2728 408, 363, 326, 297, 2729 272, 251, 233, 218, 2730 204, 192, 181, 172, 2731 163, 155, 148, 142, 2732 136, 130, 125, 121, 2733 116, 112, 109, 105 2734 }; 2735 2736 /* ASSERT (called from clock level) */ 2737 2738 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2739 ((ncpu = lpl->lpl_ncpu) == 0)) { 2740 return; 2741 } 2742 2743 for (;;) { 2744 2745 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2746 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2747 else 2748 f = expval[ncpu]; 2749 2750 /* 2751 * Modify the load average atomically to avoid losing 2752 * anticipatory load updates (see lgrp_move_thread()). 2753 */ 2754 if (ageflag) { 2755 /* 2756 * We're supposed to both update and age the load. 2757 * This happens 10 times/sec. per cpu. We do a 2758 * little hoop-jumping to avoid integer overflow. 2759 */ 2760 int64_t q, r; 2761 2762 do { 2763 old = new = lpl->lpl_loadavg; 2764 q = (old >> 16) << 7; 2765 r = (old & 0xffff) << 7; 2766 new += ((long long)(nrcpus - q) * f - 2767 ((r * f) >> 16)) >> 7; 2768 2769 /* 2770 * Check for overflow 2771 */ 2772 if (new > LGRP_LOADAVG_MAX) 2773 new = LGRP_LOADAVG_MAX; 2774 else if (new < 0) 2775 new = 0; 2776 } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg, 2777 old, new) != old); 2778 } else { 2779 /* 2780 * We're supposed to update the load, but not age it. 2781 * This option is used to update the load (which either 2782 * has already been aged in this 1/10 sec. interval or 2783 * soon will be) to account for a remotely executing 2784 * thread. 
2785 */ 2786 do { 2787 old = new = lpl->lpl_loadavg; 2788 new += f; 2789 /* 2790 * Check for overflow 2791 * Underflow not possible here 2792 */ 2793 if (new < old) 2794 new = LGRP_LOADAVG_MAX; 2795 } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg, 2796 old, new) != old); 2797 } 2798 2799 /* 2800 * Do the same for this lpl's parent 2801 */ 2802 if ((lpl = lpl->lpl_parent) == NULL) 2803 break; 2804 ncpu = lpl->lpl_ncpu; 2805 } 2806 } 2807 2808 /* 2809 * Initialize lpl topology in the target based on topology currently present in 2810 * lpl_bootstrap. 2811 * 2812 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2813 * initialize cp_default list of lpls. Up to this point all topology operations 2814 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2815 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2816 * `target' points to the list of lpls in cp_default and `size' is the size of 2817 * this list. 2818 * 2819 * This function walks the lpl topology in lpl_bootstrap and does for things: 2820 * 2821 * 1) Copies all fields from lpl_bootstrap to the target. 2822 * 2823 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2824 * 2825 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2826 * instead of lpl_bootstrap. 2827 * 2828 * 4) Updates pointers in the resource list of the target to point to the lpls 2829 * in the target list instead of lpl_bootstrap. 2830 * 2831 * After lpl_topo_bootstrap() completes, target contains the same information 2832 * that would be present there if it were used during boot instead of 2833 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2834 * and it is bzeroed. 
2835 */ 2836 void 2837 lpl_topo_bootstrap(lpl_t *target, int size) 2838 { 2839 lpl_t *lpl = lpl_bootstrap; 2840 lpl_t *target_lpl = target; 2841 lpl_t **rset; 2842 int *id2rset; 2843 int sz; 2844 int howmany; 2845 int id; 2846 int i; 2847 2848 /* 2849 * The only target that should be passed here is cp_default lpl list. 2850 */ 2851 ASSERT(target == cp_default.cp_lgrploads); 2852 ASSERT(size == cp_default.cp_nlgrploads); 2853 ASSERT(!lgrp_topo_initialized); 2854 ASSERT(ncpus == 1); 2855 2856 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2857 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2858 /* 2859 * Copy all fields from lpl, except for the rset, 2860 * lgrp id <=> rset mapping storage, 2861 * and amount of storage 2862 */ 2863 rset = target_lpl->lpl_rset; 2864 id2rset = target_lpl->lpl_id2rset; 2865 sz = target_lpl->lpl_rset_sz; 2866 2867 *target_lpl = *lpl; 2868 2869 target_lpl->lpl_rset_sz = sz; 2870 target_lpl->lpl_rset = rset; 2871 target_lpl->lpl_id2rset = id2rset; 2872 2873 /* 2874 * Substitute CPU0 lpl pointer with one relative to target. 2875 */ 2876 if (lpl->lpl_cpus == CPU) { 2877 ASSERT(CPU->cpu_lpl == lpl); 2878 CPU->cpu_lpl = target_lpl; 2879 } 2880 2881 /* 2882 * Substitute parent information with parent relative to target. 
2883 */ 2884 if (lpl->lpl_parent != NULL) 2885 target_lpl->lpl_parent = (lpl_t *) 2886 (((uintptr_t)lpl->lpl_parent - 2887 (uintptr_t)lpl_bootstrap) + 2888 (uintptr_t)target); 2889 2890 /* 2891 * Walk over resource set substituting pointers relative to 2892 * lpl_bootstrap's rset to pointers relative to target's 2893 */ 2894 ASSERT(lpl->lpl_nrset <= 1); 2895 2896 for (id = 0; id < lpl->lpl_nrset; id++) { 2897 if (lpl->lpl_rset[id] != NULL) { 2898 target_lpl->lpl_rset[id] = (lpl_t *) 2899 (((uintptr_t)lpl->lpl_rset[id] - 2900 (uintptr_t)lpl_bootstrap) + 2901 (uintptr_t)target); 2902 } 2903 target_lpl->lpl_id2rset[id] = 2904 lpl->lpl_id2rset[id]; 2905 } 2906 } 2907 2908 /* 2909 * Clean up the bootstrap lpls since we have switched over to the 2910 * actual lpl array in the default cpu partition. 2911 * 2912 * We still need to keep one empty lpl around for newly starting 2913 * slave CPUs to reference should they need to make it through the 2914 * dispatcher prior to their lgrp/lpl initialization. 2915 * 2916 * The lpl related dispatcher code has been designed to work properly 2917 * (and without extra checks) for this special case of a zero'ed 2918 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl 2919 * with lgrpid 0 and an empty resource set. Iteration over the rset 2920 * array by the dispatcher is also NULL terminated for this reason. 2921 * 2922 * This provides the desired behaviour for an uninitialized CPU. 2923 * It shouldn't see any other CPU to either dispatch to or steal 2924 * from until it is properly initialized. 
2925 */ 2926 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2927 bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset)); 2928 bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset)); 2929 2930 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; 2931 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; 2932 } 2933 2934 /* 2935 * If the lowest load among the lgroups a process' threads are currently 2936 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2937 * expanding the process to a new lgroup. 2938 */ 2939 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2940 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2941 2942 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2943 ((lgrp_expand_proc_thresh) / (ncpu)) 2944 2945 /* 2946 * A process will be expanded to a new lgroup only if the difference between 2947 * the lowest load on the lgroups the process' thread's are currently spread 2948 * across and the lowest load on the other lgroups in the process' partition 2949 * is greater than lgrp_expand_proc_diff. 2950 */ 2951 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2952 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2953 2954 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2955 ((lgrp_expand_proc_diff) / (ncpu)) 2956 2957 /* 2958 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2959 * be present due to impreciseness of the load average decay algorithm. 2960 * 2961 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2962 * tolerance is scaled by the number of cpus in the lgroup just like 2963 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2964 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2965 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 
2966 */ 2967 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2968 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2969 ((lgrp_loadavg_tolerance) / ncpu) 2970 2971 /* 2972 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2973 * average is above this threshold 2974 */ 2975 uint32_t lgrp_load_thresh = UINT32_MAX; 2976 2977 /* 2978 * lgrp_choose() will try to skip any lgroups with less memory 2979 * than this free when choosing a home lgroup 2980 */ 2981 pgcnt_t lgrp_mem_free_thresh = 0; 2982 2983 /* 2984 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2985 * one based on one of the following policies: 2986 * - Random selection 2987 * - Pseudo round robin placement 2988 * - Longest time since a thread was last placed 2989 */ 2990 #define LGRP_CHOOSE_RANDOM 1 2991 #define LGRP_CHOOSE_RR 2 2992 #define LGRP_CHOOSE_TIME 3 2993 2994 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2995 2996 /* 2997 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 2998 * be bound to a CPU or processor set. 2999 * 3000 * Arguments: 3001 * t The thread 3002 * cpupart The partition the thread belongs to. 3003 * 3004 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3005 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3006 * partitions changing out from under us and assumes that given thread is 3007 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3008 * disabled, so don't grab any locks because we should never block under 3009 * those conditions. 
3010 */ 3011 lpl_t * 3012 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 3013 { 3014 lgrp_load_t bestload, bestrload; 3015 int lgrpid_offset, lgrp_count; 3016 lgrp_id_t lgrpid, lgrpid_start; 3017 lpl_t *lpl, *bestlpl, *bestrlpl; 3018 klgrpset_t lgrpset; 3019 proc_t *p; 3020 3021 ASSERT(t != NULL); 3022 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3023 THREAD_LOCK_HELD(t)); 3024 ASSERT(cpupart != NULL); 3025 3026 p = t->t_procp; 3027 3028 /* A process should always be in an active partition */ 3029 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3030 3031 bestlpl = bestrlpl = NULL; 3032 bestload = bestrload = LGRP_LOADAVG_MAX; 3033 lgrpset = cpupart->cp_lgrpset; 3034 3035 switch (lgrp_choose_policy) { 3036 case LGRP_CHOOSE_RR: 3037 lgrpid = cpupart->cp_lgrp_hint; 3038 do { 3039 if (++lgrpid > lgrp_alloc_max) 3040 lgrpid = 0; 3041 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3042 3043 break; 3044 default: 3045 case LGRP_CHOOSE_TIME: 3046 case LGRP_CHOOSE_RANDOM: 3047 klgrpset_nlgrps(lgrpset, lgrp_count); 3048 lgrpid_offset = 3049 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3050 for (lgrpid = 0; ; lgrpid++) { 3051 if (klgrpset_ismember(lgrpset, lgrpid)) { 3052 if (--lgrpid_offset == 0) 3053 break; 3054 } 3055 } 3056 break; 3057 } 3058 3059 lgrpid_start = lgrpid; 3060 3061 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3062 lgrp_id_t, cpupart->cp_lgrp_hint); 3063 3064 /* 3065 * Use lgroup affinities (if any) to choose best lgroup 3066 * 3067 * NOTE: Assumes that thread is protected from going away and its 3068 * lgroup affinities won't change (ie. 
p_lock, or 3069 * thread_lock() being held and/or CPUs paused) 3070 */ 3071 if (t->t_lgrp_affinity) { 3072 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 3073 if (lpl != NULL) 3074 return (lpl); 3075 } 3076 3077 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3078 3079 do { 3080 pgcnt_t npgs; 3081 3082 /* 3083 * Skip any lgroups outside of thread's pset 3084 */ 3085 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3086 if (++lgrpid > lgrp_alloc_max) 3087 lgrpid = 0; /* wrap the search */ 3088 continue; 3089 } 3090 3091 /* 3092 * Skip any non-leaf lgroups 3093 */ 3094 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3095 continue; 3096 3097 /* 3098 * Skip any lgroups without enough free memory 3099 * (when threshold set to nonzero positive value) 3100 */ 3101 if (lgrp_mem_free_thresh > 0) { 3102 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3103 if (npgs < lgrp_mem_free_thresh) { 3104 if (++lgrpid > lgrp_alloc_max) 3105 lgrpid = 0; /* wrap the search */ 3106 continue; 3107 } 3108 } 3109 3110 lpl = &cpupart->cp_lgrploads[lgrpid]; 3111 if (klgrpset_isempty(p->p_lgrpset) || 3112 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3113 /* 3114 * Either this is a new process or the process already 3115 * has threads on this lgrp, so this is a preferred 3116 * lgroup for the thread. 3117 */ 3118 if (bestlpl == NULL || 3119 lpl_pick(lpl, bestlpl)) { 3120 bestload = lpl->lpl_loadavg; 3121 bestlpl = lpl; 3122 } 3123 } else { 3124 /* 3125 * The process doesn't have any threads on this lgrp, 3126 * but we're willing to consider this lgrp if the load 3127 * difference is big enough to justify splitting up 3128 * the process' threads. 
3129 */ 3130 if (bestrlpl == NULL || 3131 lpl_pick(lpl, bestrlpl)) { 3132 bestrload = lpl->lpl_loadavg; 3133 bestrlpl = lpl; 3134 } 3135 } 3136 if (++lgrpid > lgrp_alloc_max) 3137 lgrpid = 0; /* wrap the search */ 3138 } while (lgrpid != lgrpid_start); 3139 3140 /* 3141 * Return root lgroup if threshold isn't set to maximum value and 3142 * lowest lgroup load average more than a certain threshold 3143 */ 3144 if (lgrp_load_thresh != UINT32_MAX && 3145 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3146 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3147 3148 /* 3149 * If all the lgroups over which the thread's process is spread are 3150 * heavily loaded, or otherwise undesirable, we'll consider placing 3151 * the thread on one of the other leaf lgroups in the thread's 3152 * partition. 3153 */ 3154 if ((bestlpl == NULL) || 3155 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3156 (bestrload < bestload) && /* paranoid about wraparound */ 3157 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3158 bestload))) { 3159 bestlpl = bestrlpl; 3160 } 3161 3162 if (bestlpl == NULL) { 3163 /* 3164 * No lgroup looked particularly good, but we still 3165 * have to pick something. Go with the randomly selected 3166 * legal lgroup we started with above. 3167 */ 3168 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3169 } 3170 3171 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3172 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3173 3174 ASSERT(bestlpl->lpl_ncpu > 0); 3175 return (bestlpl); 3176 } 3177 3178 /* 3179 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3180 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 
3181 */ 3182 static int 3183 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3184 { 3185 lgrp_load_t l1, l2; 3186 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3187 3188 l1 = lpl1->lpl_loadavg; 3189 l2 = lpl2->lpl_loadavg; 3190 3191 if ((l1 + tolerance < l2) && (l1 < l2)) { 3192 /* lpl1 is significantly less loaded than lpl2 */ 3193 return (1); 3194 } 3195 3196 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3197 l1 + tolerance >= l2 && l1 < l2 && 3198 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3199 /* 3200 * lpl1's load is within the tolerance of lpl2. We're 3201 * willing to consider it be to better however if 3202 * it has been longer since we last homed a thread there 3203 */ 3204 return (1); 3205 } 3206 3207 return (0); 3208 } 3209 3210 /* 3211 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a 3212 * process that uses text replication changed home lgrp. This info is used by 3213 * segvn asyncronous thread to detect if it needs to recheck what lgrps 3214 * should be used for text replication. 3215 */ 3216 static uint64_t lgrp_trthr_moves = 0; 3217 3218 uint64_t 3219 lgrp_get_trthr_migrations(void) 3220 { 3221 return (lgrp_trthr_moves); 3222 } 3223 3224 void 3225 lgrp_update_trthr_migrations(uint64_t incr) 3226 { 3227 atomic_add_64(&lgrp_trthr_moves, incr); 3228 } 3229 3230 /* 3231 * An LWP is expected to be assigned to an lgroup for at least this long 3232 * for its anticipatory load to be justified. NOTE that this value should 3233 * not be set extremely huge (say, larger than 100 years), to avoid problems 3234 * with overflow in the calculation that uses it. 3235 */ 3236 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3237 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3238 3239 /* 3240 * Routine to change a thread's lgroup affiliation. This routine updates 3241 * the thread's kthread_t struct and its process' proc_t struct to note the 3242 * thread's new lgroup affiliation, and its lgroup affinities. 
3243 * 3244 * Note that this is the only routine that modifies a thread's t_lpl field, 3245 * and that adds in or removes anticipatory load. 3246 * 3247 * If the thread is exiting, newlpl is NULL. 3248 * 3249 * Locking: 3250 * The following lock must be held on entry: 3251 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3252 * doesn't get removed from t's partition 3253 * 3254 * This routine is not allowed to grab any locks, since it may be called 3255 * with cpus paused (such as from cpu_offline). 3256 */ 3257 void 3258 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3259 { 3260 proc_t *p; 3261 lpl_t *lpl, *oldlpl; 3262 lgrp_id_t oldid; 3263 kthread_t *tp; 3264 uint_t ncpu; 3265 lgrp_load_t old, new; 3266 3267 ASSERT(t); 3268 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3269 THREAD_LOCK_HELD(t)); 3270 3271 /* 3272 * If not changing lpls, just return 3273 */ 3274 if ((oldlpl = t->t_lpl) == newlpl) 3275 return; 3276 3277 /* 3278 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3279 * associated with process 0 rather than with its original process). 3280 */ 3281 if (t->t_proc_flag & TP_LWPEXIT) { 3282 if (newlpl != NULL) { 3283 t->t_lpl = newlpl; 3284 } 3285 return; 3286 } 3287 3288 p = ttoproc(t); 3289 3290 /* 3291 * If the thread had a previous lgroup, update its process' p_lgrpset 3292 * to account for it being moved from its old lgroup. 3293 */ 3294 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3295 (p->p_tlist != NULL)) { 3296 oldid = oldlpl->lpl_lgrpid; 3297 3298 if (newlpl != NULL) 3299 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3300 3301 if ((do_lgrpset_delete) && 3302 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3303 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3304 /* 3305 * Check if a thread other than the thread 3306 * that's moving is assigned to the same 3307 * lgroup as the thread that's moving. 
Note 3308 * that we have to compare lgroup IDs, rather 3309 * than simply comparing t_lpl's, since the 3310 * threads may belong to different partitions 3311 * but be assigned to the same lgroup. 3312 */ 3313 ASSERT(tp->t_lpl != NULL); 3314 3315 if ((tp != t) && 3316 (tp->t_lpl->lpl_lgrpid == oldid)) { 3317 /* 3318 * Another thread is assigned to the 3319 * same lgroup as the thread that's 3320 * moving, p_lgrpset doesn't change. 3321 */ 3322 break; 3323 } else if (tp == p->p_tlist) { 3324 /* 3325 * No other thread is assigned to the 3326 * same lgroup as the exiting thread, 3327 * clear the lgroup's bit in p_lgrpset. 3328 */ 3329 klgrpset_del(p->p_lgrpset, oldid); 3330 break; 3331 } 3332 } 3333 } 3334 3335 /* 3336 * If this thread was assigned to its old lgroup for such a 3337 * short amount of time that the anticipatory load that was 3338 * added on its behalf has aged very little, remove that 3339 * anticipatory load. 3340 */ 3341 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3342 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3343 lpl = oldlpl; 3344 for (;;) { 3345 do { 3346 old = new = lpl->lpl_loadavg; 3347 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3348 if (new > old) { 3349 /* 3350 * this can happen if the load 3351 * average was aged since we 3352 * added in the anticipatory 3353 * load 3354 */ 3355 new = 0; 3356 } 3357 } while (atomic_cas_32( 3358 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3359 new) != old); 3360 3361 lpl = lpl->lpl_parent; 3362 if (lpl == NULL) 3363 break; 3364 3365 ncpu = lpl->lpl_ncpu; 3366 ASSERT(ncpu > 0); 3367 } 3368 } 3369 } 3370 /* 3371 * If the thread has a new lgroup (i.e. it's not exiting), update its 3372 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3373 * to its new lgroup to account for its move to its new lgroup. 
3374 */ 3375 if (newlpl != NULL) { 3376 /* 3377 * This thread is moving to a new lgroup 3378 */ 3379 t->t_lpl = newlpl; 3380 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) { 3381 p->p_t1_lgrpid = newlpl->lpl_lgrpid; 3382 membar_producer(); 3383 if (p->p_tr_lgrpid != LGRP_NONE && 3384 p->p_tr_lgrpid != p->p_t1_lgrpid) { 3385 lgrp_update_trthr_migrations(1); 3386 } 3387 } 3388 3389 /* 3390 * Reflect move in load average of new lgroup 3391 * unless it is root lgroup 3392 */ 3393 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3394 return; 3395 3396 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3397 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3398 } 3399 3400 /* 3401 * It'll take some time for the load on the new lgroup 3402 * to reflect this thread's placement on it. We'd 3403 * like not, however, to have all threads between now 3404 * and then also piling on to this lgroup. To avoid 3405 * this pileup, we anticipate the load this thread 3406 * will generate on its new lgroup. The goal is to 3407 * make the lgroup's load appear as though the thread 3408 * had been there all along. We're very conservative 3409 * in calculating this anticipatory load, we assume 3410 * the worst case case (100% CPU-bound thread). This 3411 * may be modified in the future to be more accurate. 
3412 */ 3413 lpl = newlpl; 3414 for (;;) { 3415 ncpu = lpl->lpl_ncpu; 3416 ASSERT(ncpu > 0); 3417 do { 3418 old = new = lpl->lpl_loadavg; 3419 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3420 /* 3421 * Check for overflow 3422 * Underflow not possible here 3423 */ 3424 if (new < old) 3425 new = UINT32_MAX; 3426 } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg, 3427 old, new) != old); 3428 3429 lpl = lpl->lpl_parent; 3430 if (lpl == NULL) 3431 break; 3432 } 3433 t->t_anttime = gethrtime(); 3434 } 3435 } 3436 3437 /* 3438 * Return lgroup memory allocation policy given advice from madvise(3C) 3439 */ 3440 lgrp_mem_policy_t 3441 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3442 { 3443 switch (advice) { 3444 case MADV_ACCESS_LWP: 3445 return (LGRP_MEM_POLICY_NEXT); 3446 case MADV_ACCESS_MANY: 3447 return (LGRP_MEM_POLICY_RANDOM); 3448 default: 3449 return (lgrp_mem_policy_default(size, type)); 3450 } 3451 } 3452 3453 /* 3454 * Figure out default policy 3455 */ 3456 lgrp_mem_policy_t 3457 lgrp_mem_policy_default(size_t size, int type) 3458 { 3459 cpupart_t *cp; 3460 lgrp_mem_policy_t policy; 3461 size_t pset_mem_size; 3462 3463 /* 3464 * Randomly allocate memory across lgroups for shared memory 3465 * beyond a certain threshold 3466 */ 3467 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3468 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3469 /* 3470 * Get total memory size of current thread's pset 3471 */ 3472 kpreempt_disable(); 3473 cp = curthread->t_cpupart; 3474 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3475 kpreempt_enable(); 3476 3477 /* 3478 * Choose policy to randomly allocate memory across 3479 * lgroups in pset if it will fit and is not default 3480 * partition. Otherwise, allocate memory randomly 3481 * across machine. 
3482 */ 3483 if (lgrp_mem_pset_aware && size < pset_mem_size) 3484 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3485 else 3486 policy = LGRP_MEM_POLICY_RANDOM; 3487 } else 3488 /* 3489 * Apply default policy for private memory and 3490 * shared memory under the respective random 3491 * threshold. 3492 */ 3493 policy = lgrp_mem_default_policy; 3494 3495 return (policy); 3496 } 3497 3498 /* 3499 * Get memory allocation policy for this segment 3500 */ 3501 lgrp_mem_policy_info_t * 3502 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3503 { 3504 lgrp_mem_policy_info_t *policy_info; 3505 extern struct seg_ops segspt_ops; 3506 extern struct seg_ops segspt_shmops; 3507 3508 /* 3509 * This is for binary compatibility to protect against third party 3510 * segment drivers which haven't recompiled to allow for 3511 * SEGOP_GETPOLICY() 3512 */ 3513 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3514 seg->s_ops != &segspt_shmops) 3515 return (NULL); 3516 3517 policy_info = NULL; 3518 if (seg->s_ops->getpolicy != NULL) 3519 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3520 3521 return (policy_info); 3522 } 3523 3524 /* 3525 * Set policy for allocating private memory given desired policy, policy info, 3526 * size in bytes of memory that policy is being applied. 3527 * Return 0 if policy wasn't set already and 1 if policy was set already 3528 */ 3529 int 3530 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3531 lgrp_mem_policy_info_t *policy_info, size_t size) 3532 { 3533 3534 ASSERT(policy_info != NULL); 3535 3536 if (policy == LGRP_MEM_POLICY_DEFAULT) 3537 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3538 3539 /* 3540 * Policy set already? 
3541 */ 3542 if (policy == policy_info->mem_policy) 3543 return (1); 3544 3545 /* 3546 * Set policy 3547 */ 3548 policy_info->mem_policy = policy; 3549 policy_info->mem_lgrpid = LGRP_NONE; 3550 3551 return (0); 3552 } 3553 3554 3555 /* 3556 * Get shared memory allocation policy with given tree and offset 3557 */ 3558 lgrp_mem_policy_info_t * 3559 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3560 u_offset_t vn_off) 3561 { 3562 u_offset_t off; 3563 lgrp_mem_policy_info_t *policy_info; 3564 lgrp_shm_policy_seg_t *policy_seg; 3565 lgrp_shm_locality_t *shm_locality; 3566 avl_tree_t *tree; 3567 avl_index_t where; 3568 3569 /* 3570 * Get policy segment tree from anon_map or vnode and use specified 3571 * anon index or vnode offset as offset 3572 * 3573 * Assume that no lock needs to be held on anon_map or vnode, since 3574 * they should be protected by their reference count which must be 3575 * nonzero for an existing segment 3576 */ 3577 if (amp) { 3578 ASSERT(amp->refcnt != 0); 3579 shm_locality = amp->locality; 3580 if (shm_locality == NULL) 3581 return (NULL); 3582 tree = shm_locality->loc_tree; 3583 off = ptob(anon_index); 3584 } else if (vp) { 3585 shm_locality = vp->v_locality; 3586 if (shm_locality == NULL) 3587 return (NULL); 3588 ASSERT(shm_locality->loc_count != 0); 3589 tree = shm_locality->loc_tree; 3590 off = vn_off; 3591 } 3592 3593 if (tree == NULL) 3594 return (NULL); 3595 3596 /* 3597 * Lookup policy segment for offset into shared object and return 3598 * policy info 3599 */ 3600 rw_enter(&shm_locality->loc_lock, RW_READER); 3601 policy_info = NULL; 3602 policy_seg = avl_find(tree, &off, &where); 3603 if (policy_seg) 3604 policy_info = &policy_seg->shm_policy; 3605 rw_exit(&shm_locality->loc_lock); 3606 3607 return (policy_info); 3608 } 3609 3610 /* 3611 * Default memory allocation policy for kernel segmap pages 3612 */ 3613 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3614 3615 /* 3616 * Return 
lgroup to use for allocating memory 3617 * given the segment and address 3618 * 3619 * There isn't any mutual exclusion that exists between calls 3620 * to this routine and DR, so this routine and whomever calls it 3621 * should be mindful of the possibility that the lgrp returned 3622 * may be deleted. If this happens, dereferences of the lgrp 3623 * pointer will still be safe, but the resources in the lgrp will 3624 * be gone, and LGRP_EXISTS() will no longer be true. 3625 */ 3626 lgrp_t * 3627 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3628 { 3629 int i; 3630 lgrp_t *lgrp; 3631 klgrpset_t lgrpset; 3632 int lgrps_spanned; 3633 unsigned long off; 3634 lgrp_mem_policy_t policy; 3635 lgrp_mem_policy_info_t *policy_info; 3636 ushort_t random; 3637 int stat = 0; 3638 extern struct seg *segkmap; 3639 3640 /* 3641 * Just return null if the lgrp framework hasn't finished 3642 * initializing or if this is a UMA machine. 3643 */ 3644 if (nlgrps == 1 || !lgrp_initialized) 3645 return (lgrp_root); 3646 3647 /* 3648 * Get memory allocation policy for this segment 3649 */ 3650 policy = lgrp_mem_default_policy; 3651 if (seg != NULL) { 3652 if (seg->s_as == &kas) { 3653 if (seg == segkmap) 3654 policy = lgrp_segmap_default_policy; 3655 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3656 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3657 policy = LGRP_MEM_POLICY_RANDOM; 3658 } else { 3659 policy_info = lgrp_mem_policy_get(seg, vaddr); 3660 if (policy_info != NULL) { 3661 policy = policy_info->mem_policy; 3662 if (policy == LGRP_MEM_POLICY_NEXT_SEG) { 3663 lgrp_id_t id = policy_info->mem_lgrpid; 3664 ASSERT(id != LGRP_NONE); 3665 ASSERT(id < NLGRPS_MAX); 3666 lgrp = lgrp_table[id]; 3667 if (!LGRP_EXISTS(lgrp)) { 3668 policy = LGRP_MEM_POLICY_NEXT; 3669 } else { 3670 lgrp_stat_add(id, 3671 LGRP_NUM_NEXT_SEG, 1); 3672 return (lgrp); 3673 } 3674 } 3675 } 3676 } 3677 } 3678 lgrpset = 0; 3679 3680 /* 3681 * Initialize lgroup to home by default 3682 */ 3683 lgrp = 
lgrp_home_lgrp(); 3684 3685 /* 3686 * When homing threads on root lgrp, override default memory 3687 * allocation policies with root lgroup memory allocation policy 3688 */ 3689 if (lgrp == lgrp_root) 3690 policy = lgrp_mem_policy_root; 3691 3692 /* 3693 * Implement policy 3694 */ 3695 switch (policy) { 3696 case LGRP_MEM_POLICY_NEXT_CPU: 3697 3698 /* 3699 * Return lgroup of current CPU which faulted on memory 3700 * If the CPU isn't currently in an lgrp, then opt to 3701 * allocate from the root. 3702 * 3703 * Kernel preemption needs to be disabled here to prevent 3704 * the current CPU from going away before lgrp is found. 3705 */ 3706 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3707 lgrp = lgrp_root; 3708 } else { 3709 kpreempt_disable(); 3710 lgrp = lgrp_cpu_to_lgrp(CPU); 3711 kpreempt_enable(); 3712 } 3713 break; 3714 3715 case LGRP_MEM_POLICY_NEXT: 3716 case LGRP_MEM_POLICY_DEFAULT: 3717 default: 3718 3719 /* 3720 * Just return current thread's home lgroup 3721 * for default policy (next touch) 3722 * If the thread is homed to the root, 3723 * then the default policy is random across lgroups. 3724 * Fallthrough to the random case. 3725 */ 3726 if (lgrp != lgrp_root) { 3727 if (policy == LGRP_MEM_POLICY_NEXT) 3728 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3729 else 3730 lgrp_stat_add(lgrp->lgrp_id, 3731 LGRP_NUM_DEFAULT, 1); 3732 break; 3733 } 3734 /* FALLTHROUGH */ 3735 case LGRP_MEM_POLICY_RANDOM: 3736 3737 /* 3738 * Return a random leaf lgroup with memory 3739 */ 3740 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3741 /* 3742 * Count how many lgroups are spanned 3743 */ 3744 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3745 3746 /* 3747 * There may be no memnodes in the root lgroup during DR copy 3748 * rename on a system with only two boards (memnodes) 3749 * configured. In this case just return the root lgrp. 
3750 */ 3751 if (lgrps_spanned == 0) { 3752 lgrp = lgrp_root; 3753 break; 3754 } 3755 3756 /* 3757 * Pick a random offset within lgroups spanned 3758 * and return lgroup at that offset 3759 */ 3760 random = (ushort_t)gethrtime() >> 4; 3761 off = random % lgrps_spanned; 3762 ASSERT(off <= lgrp_alloc_max); 3763 3764 for (i = 0; i <= lgrp_alloc_max; i++) { 3765 if (!klgrpset_ismember(lgrpset, i)) 3766 continue; 3767 if (off) 3768 off--; 3769 else { 3770 lgrp = lgrp_table[i]; 3771 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3772 1); 3773 break; 3774 } 3775 } 3776 break; 3777 3778 case LGRP_MEM_POLICY_RANDOM_PROC: 3779 3780 /* 3781 * Grab copy of bitmask of lgroups spanned by 3782 * this process 3783 */ 3784 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3785 stat = LGRP_NUM_RANDOM_PROC; 3786 3787 /* FALLTHROUGH */ 3788 case LGRP_MEM_POLICY_RANDOM_PSET: 3789 3790 if (!stat) 3791 stat = LGRP_NUM_RANDOM_PSET; 3792 3793 if (klgrpset_isempty(lgrpset)) { 3794 /* 3795 * Grab copy of bitmask of lgroups spanned by 3796 * this processor set 3797 */ 3798 kpreempt_disable(); 3799 klgrpset_copy(lgrpset, 3800 curthread->t_cpupart->cp_lgrpset); 3801 kpreempt_enable(); 3802 } 3803 3804 /* 3805 * Count how many lgroups are spanned 3806 */ 3807 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3808 ASSERT(lgrps_spanned <= nlgrps); 3809 3810 /* 3811 * Probably lgrps_spanned should be always non-zero, but to be 3812 * on the safe side we return lgrp_root if it is empty. 
3813 */ 3814 if (lgrps_spanned == 0) { 3815 lgrp = lgrp_root; 3816 break; 3817 } 3818 3819 /* 3820 * Pick a random offset within lgroups spanned 3821 * and return lgroup at that offset 3822 */ 3823 random = (ushort_t)gethrtime() >> 4; 3824 off = random % lgrps_spanned; 3825 ASSERT(off <= lgrp_alloc_max); 3826 3827 for (i = 0; i <= lgrp_alloc_max; i++) { 3828 if (!klgrpset_ismember(lgrpset, i)) 3829 continue; 3830 if (off) 3831 off--; 3832 else { 3833 lgrp = lgrp_table[i]; 3834 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3835 1); 3836 break; 3837 } 3838 } 3839 break; 3840 3841 case LGRP_MEM_POLICY_ROUNDROBIN: 3842 3843 /* 3844 * Use offset within segment to determine 3845 * offset from home lgroup to choose for 3846 * next lgroup to allocate memory from 3847 */ 3848 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3849 (lgrp_alloc_max + 1); 3850 3851 kpreempt_disable(); 3852 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3853 i = lgrp->lgrp_id; 3854 kpreempt_enable(); 3855 3856 while (off > 0) { 3857 i = (i + 1) % (lgrp_alloc_max + 1); 3858 lgrp = lgrp_table[i]; 3859 if (klgrpset_ismember(lgrpset, i)) 3860 off--; 3861 } 3862 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3863 3864 break; 3865 } 3866 3867 ASSERT(lgrp != NULL); 3868 return (lgrp); 3869 } 3870 3871 /* 3872 * Return the number of pages in an lgroup 3873 * 3874 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3875 * could cause tests that rely on the numat driver to fail.... 
3876 */ 3877 pgcnt_t 3878 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3879 { 3880 lgrp_t *lgrp; 3881 3882 lgrp = lgrp_table[lgrpid]; 3883 if (!LGRP_EXISTS(lgrp) || 3884 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3885 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3886 return (0); 3887 3888 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3889 } 3890 3891 /* 3892 * Initialize lgroup shared memory allocation policy support 3893 */ 3894 void 3895 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3896 { 3897 lgrp_shm_locality_t *shm_locality; 3898 3899 /* 3900 * Initialize locality field in anon_map 3901 * Don't need any locks because this is called when anon_map is 3902 * allocated, but not used anywhere yet. 3903 */ 3904 if (amp) { 3905 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3906 if (amp->locality == NULL) { 3907 /* 3908 * Allocate and initialize shared memory locality info 3909 * and set anon_map locality pointer to it 3910 * Drop lock across kmem_alloc(KM_SLEEP) 3911 */ 3912 ANON_LOCK_EXIT(&->a_rwlock); 3913 shm_locality = kmem_alloc(sizeof (*shm_locality), 3914 KM_SLEEP); 3915 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3916 NULL); 3917 shm_locality->loc_count = 1; /* not used for amp */ 3918 shm_locality->loc_tree = NULL; 3919 3920 /* 3921 * Reacquire lock and check to see whether anyone beat 3922 * us to initializing the locality info 3923 */ 3924 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3925 if (amp->locality != NULL) { 3926 rw_destroy(&shm_locality->loc_lock); 3927 kmem_free(shm_locality, 3928 sizeof (*shm_locality)); 3929 } else 3930 amp->locality = shm_locality; 3931 } 3932 ANON_LOCK_EXIT(&->a_rwlock); 3933 return; 3934 } 3935 3936 /* 3937 * Allocate shared vnode policy info if vnode is not locality aware yet 3938 */ 3939 mutex_enter(&vp->v_lock); 3940 if ((vp->v_flag & V_LOCALITY) == 0) { 3941 /* 3942 * Allocate and initialize shared memory locality info 3943 */ 3944 mutex_exit(&vp->v_lock); 3945 
shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3946 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3947 shm_locality->loc_count = 1; 3948 shm_locality->loc_tree = NULL; 3949 3950 /* 3951 * Point vnode locality field at shared vnode policy info 3952 * and set locality aware flag in vnode 3953 */ 3954 mutex_enter(&vp->v_lock); 3955 if ((vp->v_flag & V_LOCALITY) == 0) { 3956 vp->v_locality = shm_locality; 3957 vp->v_flag |= V_LOCALITY; 3958 } else { 3959 /* 3960 * Lost race so free locality info and increment count. 3961 */ 3962 rw_destroy(&shm_locality->loc_lock); 3963 kmem_free(shm_locality, sizeof (*shm_locality)); 3964 shm_locality = vp->v_locality; 3965 shm_locality->loc_count++; 3966 } 3967 mutex_exit(&vp->v_lock); 3968 3969 return; 3970 } 3971 3972 /* 3973 * Increment reference count of number of segments mapping this vnode 3974 * shared 3975 */ 3976 shm_locality = vp->v_locality; 3977 shm_locality->loc_count++; 3978 mutex_exit(&vp->v_lock); 3979 } 3980 3981 /* 3982 * Destroy the given shared memory policy segment tree 3983 */ 3984 void 3985 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3986 { 3987 lgrp_shm_policy_seg_t *cur; 3988 lgrp_shm_policy_seg_t *next; 3989 3990 if (tree == NULL) 3991 return; 3992 3993 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3994 while (cur != NULL) { 3995 next = AVL_NEXT(tree, cur); 3996 avl_remove(tree, cur); 3997 kmem_free(cur, sizeof (*cur)); 3998 cur = next; 3999 } 4000 kmem_free(tree, sizeof (avl_tree_t)); 4001 } 4002 4003 /* 4004 * Uninitialize lgroup shared memory allocation policy support 4005 */ 4006 void 4007 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 4008 { 4009 lgrp_shm_locality_t *shm_locality; 4010 4011 /* 4012 * For anon_map, deallocate shared memory policy tree and 4013 * zero locality field 4014 * Don't need any locks because anon_map is being freed 4015 */ 4016 if (amp) { 4017 if (amp->locality == NULL) 4018 return; 4019 shm_locality = amp->locality; 4020 
shm_locality->loc_count = 0; /* not really used for amp */ 4021 rw_destroy(&shm_locality->loc_lock); 4022 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4023 kmem_free(shm_locality, sizeof (*shm_locality)); 4024 amp->locality = 0; 4025 return; 4026 } 4027 4028 /* 4029 * For vnode, decrement reference count of segments mapping this vnode 4030 * shared and delete locality info if reference count drops to 0 4031 */ 4032 mutex_enter(&vp->v_lock); 4033 shm_locality = vp->v_locality; 4034 shm_locality->loc_count--; 4035 4036 if (shm_locality->loc_count == 0) { 4037 rw_destroy(&shm_locality->loc_lock); 4038 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4039 kmem_free(shm_locality, sizeof (*shm_locality)); 4040 vp->v_locality = 0; 4041 vp->v_flag &= ~V_LOCALITY; 4042 } 4043 mutex_exit(&vp->v_lock); 4044 } 4045 4046 /* 4047 * Compare two shared memory policy segments 4048 * Used by AVL tree code for searching 4049 */ 4050 int 4051 lgrp_shm_policy_compar(const void *x, const void *y) 4052 { 4053 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 4054 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 4055 4056 if (a->shm_off < b->shm_off) 4057 return (-1); 4058 if (a->shm_off >= b->shm_off + b->shm_size) 4059 return (1); 4060 return (0); 4061 } 4062 4063 /* 4064 * Concatenate seg1 with seg2 and remove seg2 4065 */ 4066 static int 4067 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 4068 lgrp_shm_policy_seg_t *seg2) 4069 { 4070 if (!seg1 || !seg2 || 4071 seg1->shm_off + seg1->shm_size != seg2->shm_off || 4072 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 4073 return (-1); 4074 4075 seg1->shm_size += seg2->shm_size; 4076 avl_remove(tree, seg2); 4077 kmem_free(seg2, sizeof (*seg2)); 4078 return (0); 4079 } 4080 4081 /* 4082 * Split segment at given offset and return rightmost (uppermost) segment 4083 * Assumes that there are no overlapping segments 4084 */ 4085 static lgrp_shm_policy_seg_t * 4086 
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4087 u_offset_t off) 4088 { 4089 lgrp_shm_policy_seg_t *newseg; 4090 avl_index_t where; 4091 4092 ASSERT(seg != NULL); 4093 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4094 4095 if (!seg || off < seg->shm_off || off > seg->shm_off + 4096 seg->shm_size) 4097 return (NULL); 4098 4099 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4100 return (seg); 4101 4102 /* 4103 * Adjust size of left segment and allocate new (right) segment 4104 */ 4105 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4106 newseg->shm_policy = seg->shm_policy; 4107 newseg->shm_off = off; 4108 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4109 seg->shm_size = off - seg->shm_off; 4110 4111 /* 4112 * Find where to insert new segment in AVL tree and insert it 4113 */ 4114 (void) avl_find(tree, &off, &where); 4115 avl_insert(tree, newseg, where); 4116 4117 return (newseg); 4118 } 4119 4120 /* 4121 * Set shared memory allocation policy on specified shared object at given 4122 * offset and length 4123 * 4124 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4125 * -1 if can't set policy. 
4126 */ 4127 int 4128 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, 4129 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) 4130 { 4131 u_offset_t eoff; 4132 lgrp_shm_policy_seg_t *next; 4133 lgrp_shm_policy_seg_t *newseg; 4134 u_offset_t off; 4135 u_offset_t oldeoff; 4136 lgrp_shm_policy_seg_t *prev; 4137 int retval; 4138 lgrp_shm_policy_seg_t *seg; 4139 lgrp_shm_locality_t *shm_locality; 4140 avl_tree_t *tree; 4141 avl_index_t where; 4142 4143 ASSERT(amp || vp); 4144 ASSERT((len & PAGEOFFSET) == 0); 4145 4146 if (len == 0) 4147 return (-1); 4148 4149 retval = 0; 4150 4151 /* 4152 * Get locality info and starting offset into shared object 4153 * Try anon map first and then vnode 4154 * Assume that no locks need to be held on anon_map or vnode, since 4155 * it should be protected by its reference count which must be nonzero 4156 * for an existing segment. 4157 */ 4158 if (amp) { 4159 /* 4160 * Get policy info from anon_map 4161 * 4162 */ 4163 ASSERT(amp->refcnt != 0); 4164 if (amp->locality == NULL) 4165 lgrp_shm_policy_init(amp, NULL); 4166 shm_locality = amp->locality; 4167 off = ptob(anon_index); 4168 } else if (vp) { 4169 /* 4170 * Get policy info from vnode 4171 */ 4172 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) 4173 lgrp_shm_policy_init(NULL, vp); 4174 shm_locality = vp->v_locality; 4175 ASSERT(shm_locality->loc_count != 0); 4176 off = vn_off; 4177 } else 4178 return (-1); 4179 4180 ASSERT((off & PAGEOFFSET) == 0); 4181 4182 /* 4183 * Figure out default policy 4184 */ 4185 if (policy == LGRP_MEM_POLICY_DEFAULT) 4186 policy = lgrp_mem_policy_default(len, MAP_SHARED); 4187 4188 /* 4189 * Create AVL tree if there isn't one yet 4190 * and set locality field to point at it 4191 */ 4192 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4193 tree = shm_locality->loc_tree; 4194 if (!tree) { 4195 rw_exit(&shm_locality->loc_lock); 4196 4197 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 4198 4199 
rw_enter(&shm_locality->loc_lock, RW_WRITER); 4200 if (shm_locality->loc_tree == NULL) { 4201 avl_create(tree, lgrp_shm_policy_compar, 4202 sizeof (lgrp_shm_policy_seg_t), 4203 offsetof(lgrp_shm_policy_seg_t, shm_tree)); 4204 shm_locality->loc_tree = tree; 4205 } else { 4206 /* 4207 * Another thread managed to set up the tree 4208 * before we could. Free the tree we allocated 4209 * and use the one that's already there. 4210 */ 4211 kmem_free(tree, sizeof (*tree)); 4212 tree = shm_locality->loc_tree; 4213 } 4214 } 4215 4216 /* 4217 * Set policy 4218 * 4219 * Need to maintain hold on writer's lock to keep tree from 4220 * changing out from under us 4221 */ 4222 while (len != 0) { 4223 /* 4224 * Find policy segment for specified offset into shared object 4225 */ 4226 seg = avl_find(tree, &off, &where); 4227 4228 /* 4229 * Didn't find any existing segment that contains specified 4230 * offset, so allocate new segment, insert it, and concatenate 4231 * with adjacent segments if possible 4232 */ 4233 if (seg == NULL) { 4234 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), 4235 KM_SLEEP); 4236 newseg->shm_policy.mem_policy = policy; 4237 newseg->shm_policy.mem_lgrpid = LGRP_NONE; 4238 newseg->shm_off = off; 4239 avl_insert(tree, newseg, where); 4240 4241 /* 4242 * Check to see whether new segment overlaps with next 4243 * one, set length of new segment accordingly, and 4244 * calculate remaining length and next offset 4245 */ 4246 seg = AVL_NEXT(tree, newseg); 4247 if (seg == NULL || off + len <= seg->shm_off) { 4248 newseg->shm_size = len; 4249 len = 0; 4250 } else { 4251 newseg->shm_size = seg->shm_off - off; 4252 off = seg->shm_off; 4253 len -= newseg->shm_size; 4254 } 4255 4256 /* 4257 * Try to concatenate new segment with next and 4258 * previous ones, since they might have the same policy 4259 * now. Grab previous and next segments first because 4260 * they will change on concatenation. 
4261 */ 4262 prev = AVL_PREV(tree, newseg); 4263 next = AVL_NEXT(tree, newseg); 4264 (void) lgrp_shm_policy_concat(tree, newseg, next); 4265 (void) lgrp_shm_policy_concat(tree, prev, newseg); 4266 4267 continue; 4268 } 4269 4270 eoff = off + len; 4271 oldeoff = seg->shm_off + seg->shm_size; 4272 4273 /* 4274 * Policy set already? 4275 */ 4276 if (policy == seg->shm_policy.mem_policy) { 4277 /* 4278 * Nothing left to do if offset and length 4279 * fall within this segment 4280 */ 4281 if (eoff <= oldeoff) { 4282 retval = 1; 4283 break; 4284 } else { 4285 len = eoff - oldeoff; 4286 off = oldeoff; 4287 continue; 4288 } 4289 } 4290 4291 /* 4292 * Specified offset and length match existing segment exactly 4293 */ 4294 if (off == seg->shm_off && len == seg->shm_size) { 4295 /* 4296 * Set policy and update current length 4297 */ 4298 seg->shm_policy.mem_policy = policy; 4299 seg->shm_policy.mem_lgrpid = LGRP_NONE; 4300 len = 0; 4301 4302 /* 4303 * Try concatenating new segment with previous and next 4304 * segments, since they might have the same policy now. 4305 * Grab previous and next segments first because they 4306 * will change on concatenation. 
4307 */ 4308 prev = AVL_PREV(tree, seg); 4309 next = AVL_NEXT(tree, seg); 4310 (void) lgrp_shm_policy_concat(tree, seg, next); 4311 (void) lgrp_shm_policy_concat(tree, prev, seg); 4312 } else { 4313 /* 4314 * Specified offset and length only apply to part of 4315 * existing segment 4316 */ 4317 4318 /* 4319 * New segment starts in middle of old one, so split 4320 * new one off near beginning of old one 4321 */ 4322 newseg = NULL; 4323 if (off > seg->shm_off) { 4324 newseg = lgrp_shm_policy_split(tree, seg, off); 4325 4326 /* 4327 * New segment ends where old one did, so try 4328 * to concatenate with next segment 4329 */ 4330 if (eoff == oldeoff) { 4331 newseg->shm_policy.mem_policy = policy; 4332 newseg->shm_policy.mem_lgrpid = 4333 LGRP_NONE; 4334 (void) lgrp_shm_policy_concat(tree, 4335 newseg, AVL_NEXT(tree, newseg)); 4336 break; 4337 } 4338 } 4339 4340 /* 4341 * New segment ends before old one, so split off end of 4342 * old one 4343 */ 4344 if (eoff < oldeoff) { 4345 if (newseg) { 4346 (void) lgrp_shm_policy_split(tree, 4347 newseg, eoff); 4348 newseg->shm_policy.mem_policy = policy; 4349 newseg->shm_policy.mem_lgrpid = 4350 LGRP_NONE; 4351 } else { 4352 (void) lgrp_shm_policy_split(tree, seg, 4353 eoff); 4354 seg->shm_policy.mem_policy = policy; 4355 seg->shm_policy.mem_lgrpid = LGRP_NONE; 4356 } 4357 4358 if (off == seg->shm_off) 4359 (void) lgrp_shm_policy_concat(tree, 4360 AVL_PREV(tree, seg), seg); 4361 break; 4362 } 4363 4364 /* 4365 * Calculate remaining length and next offset 4366 */ 4367 len = eoff - oldeoff; 4368 off = oldeoff; 4369 } 4370 } 4371 4372 rw_exit(&shm_locality->loc_lock); 4373 return (retval); 4374 } 4375 4376 /* 4377 * Return the best memnode from which to allocate memory given 4378 * an lgroup. 4379 * 4380 * "c" is for cookie, which is good enough for me. 4381 * It references a cookie struct that should be zero'ed to initialize. 4382 * The cookie should live on the caller's stack. 
4383 * 4384 * The routine returns -1 when: 4385 * - traverse is 0, and all the memnodes in "lgrp" have been returned. 4386 * - traverse is 1, and all the memnodes in the system have been 4387 * returned. 4388 */ 4389 int 4390 lgrp_memnode_choose(lgrp_mnode_cookie_t *c) 4391 { 4392 lgrp_t *lp = c->lmc_lgrp; 4393 mnodeset_t nodes = c->lmc_nodes; 4394 int cnt = c->lmc_cnt; 4395 int offset, mnode; 4396 4397 extern int max_mem_nodes; 4398 4399 /* 4400 * If the set is empty, and the caller is willing, traverse 4401 * up the hierarchy until we find a non-empty set. 4402 */ 4403 while (nodes == (mnodeset_t)0 || cnt <= 0) { 4404 if (c->lmc_scope == LGRP_SRCH_LOCAL || 4405 ((lp = lp->lgrp_parent) == NULL)) 4406 return (-1); 4407 4408 nodes = lp->lgrp_mnodes & ~(c->lmc_tried); 4409 cnt = lp->lgrp_nmnodes - c->lmc_ntried; 4410 } 4411 4412 /* 4413 * Select a memnode by picking one at a "random" offset. 4414 * Because of DR, memnodes can come and go at any time. 4415 * This code must be able to cope with the possibility 4416 * that the nodes count "cnt" is inconsistent with respect 4417 * to the number of elements actually in "nodes", and 4418 * therefore that the offset chosen could be greater than 4419 * the number of elements in the set (some memnodes may 4420 * have dissapeared just before cnt was read). 4421 * If this happens, the search simply wraps back to the 4422 * beginning of the set. 4423 */ 4424 ASSERT(nodes != (mnodeset_t)0 && cnt > 0); 4425 offset = c->lmc_rand % cnt; 4426 do { 4427 for (mnode = 0; mnode < max_mem_nodes; mnode++) 4428 if (nodes & ((mnodeset_t)1 << mnode)) 4429 if (!offset--) 4430 break; 4431 } while (mnode >= max_mem_nodes); 4432 4433 /* Found a node. Store state before returning. */ 4434 c->lmc_lgrp = lp; 4435 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); 4436 c->lmc_cnt = cnt - 1; 4437 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); 4438 c->lmc_ntried++; 4439 4440 return (mnode); 4441 } 4442