/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups, where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
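
/*
 * Illustrative sketch of the hierarchy walk described in the block comment
 * above: starting from a leaf lgroup, each step to the parent widens the
 * search to the next (larger) level of locality until the root is reached.
 * This is a hypothetical helper for illustration only; it is not used
 * elsewhere and assumes nothing beyond the lgrp_t fields declared in
 * <sys/lgrp.h>.
 */
static lgrp_t *
lgrp_walk_to_root_sketch(lgrp_t *leaf)
{
	lgrp_t	*lgrp = leaf;

	/* Keep moving to the parent until there is none (the root lgroup) */
	while (lgrp != NULL && lgrp->lgrp_parent != NULL)
		lgrp = lgrp->lgrp_parent;

	return (lgrp);
}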

lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is
 * brought on-line when cp_default is initialized by
 * cpupart_initialize_default(). Configuring CPU0 may create a two-level
 * topology with root and one leaf node containing CPU0. This topology is
 * initially constructed in a special statically allocated 2-element lpl list
 * lpl_bootstrap_list and later cloned to cp_default when cp_default is
 * initialized. The lpl_bootstrap_list is used for all lpl operations until
 * cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer that needs a default lpl should use lpl_bootstrap, which is a
 * pointer to the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;
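
/*
 * Illustrative sketch (hypothetical helper, not used elsewhere): count the
 * lgroups currently present in lgrp_table[].  The 0..lgrp_alloc_max bounds
 * and the LGRP_EXISTS() test are the same idiom used throughout this file
 * whenever the table is scanned.
 */
static int
lgrp_count_sketch(void)
{
	int	i;
	int	count = 0;

	for (i = 0; i <= lgrp_alloc_max; i++) {
		if (LGRP_EXISTS(lgrp_table[i]))
			count++;
	}

	return (count);
}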

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t	lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
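
/*
 * Illustrative sketch (hypothetical, simplified): how the size thresholds
 * and defaults above could be combined to pick a placement policy for a
 * mapping.  The real policy selection elsewhere also takes madvise() advice,
 * processor set awareness and per-segment policies into account, so this is
 * only a sketch of the threshold logic.
 */
static lgrp_mem_policy_t
lgrp_policy_for_size_sketch(size_t size, int shared)
{
	size_t	thresh;

	thresh = shared ? lgrp_shm_random_thresh : lgrp_privm_random_thresh;

	return ((size >= thresh) ? LGRP_MEM_POLICY_RANDOM :
	    lgrp_mem_default_policy);
}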


/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

static void	lgrp_root_init(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 */

#define	LPL_TOPO_CORRECT		0
#define	LPL_TOPO_PART_HAS_NO_LPL	-1
#define	LPL_TOPO_CPUS_NOT_EMPTY		-2
#define	LPL_TOPO_LGRP_MISMATCH		-3
#define	LPL_TOPO_MISSING_PARENT		-4
#define	LPL_TOPO_PARENT_MISMATCH	-5
#define	LPL_TOPO_BAD_CPUCNT		-6
#define	LPL_TOPO_RSET_MISMATCH		-7
#define	LPL_TOPO_LPL_ORPHANED		-8
#define	LPL_TOPO_LPL_BAD_NCPU		-9
#define	LPL_TOPO_RSET_MSSNG_LF		-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL	-11
#define	LPL_TOPO_BOGUS_HINT		-12
#define	LPL_TOPO_NONLEAF_HAS_CPUS	-13
#define	LPL_TOPO_LGRP_NOT_LEAF		-14
#define	LPL_TOPO_BAD_RSETCNT		-15
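
/*
 * Illustrative sketch (hypothetical wrapper, not used elsewhere): how the
 * verifier return codes above are consumed.  lgrp_config() below applies the
 * same pattern after every event that changes the lpl topology.
 */
static void
lpl_topo_check_sketch(cpupart_t *cpart)
{
	int	rc;

	rc = lpl_topo_verify(cpart);
	if (rc != LPL_TOPO_CORRECT)
		panic("lpl_topo_verify failed: %d", rc);
}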

/*
 * Return whether lgroup optimizations should be enabled on this system
 */
int
lgrp_optimizations(void)
{
	/*
	 * System must have more than 2 lgroups to enable lgroup optimizations
	 *
	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
	 * with one child lgroup containing all the resources. A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	if (nlgrps > 2)
		return (1);

	return (0);
}

/*
 * Build full lgroup topology
 */
static void
lgrp_root_init(void)
{
	lgrp_handle_t	hand;
	int		i;
	lgrp_id_t	id;

	/*
	 * Create the "root" lgroup
	 */
	ASSERT(nlgrps == 0);
	id = nlgrps++;

	lgrp_root = &lroot;

	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;

	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;
	cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 */
void
lgrp_init(void)
{
	/*
	 * Initialize the platform
	 */
	lgrp_plat_init();

	/*
	 * Set max number of lgroups supported on this platform which must be
	 * less than the max number of lgroups supported by the common lgroup
	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
	 */
	nlgrpsmax = lgrp_plat_max_lgrps();
	ASSERT(nlgrpsmax <= NLGRPS_MAX);
}

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
void
lgrp_setup(void)
{
	/*
	 * Setup the root lgroup
	 */
	lgrp_root_init();

	/*
	 * Add cpu0 to an lgroup
	 */
	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}

/*
 * Lgroup initialization is split in two parts. The first part
 * (lgrp_main_init()) is called right before start_other_cpus() in main. The
 * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
 * when all CPUs are brought online and all distance information is available.
 *
 * When lgrp_main_init() is complete it sets lgrp_initialized.
 * lgrp_main_mp_init() sets lgrp_topo_initialized.
 */

/*
 * True when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;
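
/*
 * Illustrative sketch (hypothetical helper, not used elsewhere): code that
 * depends on a fully constructed lgroup topology gates on the two flags
 * above, just as lpl_topo_verify() does before attempting any verification.
 */
static int
lgrp_topo_ready_sketch(void)
{
	return (lgrp_initialized && lgrp_topo_initialized);
}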

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	extern void	pg_cpu0_reinit();

	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
	    (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Notify the PG subsystem that the CPU's lgrp
		 * association has changed
		 */
		pg_cpu0_reinit();

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUs are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Change latency of lgroup with specified lgroup platform handle (if one is
 * given) or change all lgroups with old latency to new latency
 */
void
lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
    u_longlong_t newtime)
{
	lgrp_t		*lgrp;
	int		i;

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];

		if (!LGRP_EXISTS(lgrp))
			continue;

		if ((hand == LGRP_NULL_HANDLE &&
		    lgrp->lgrp_latency == oldtime) ||
		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
			lgrp->lgrp_latency = (int)newtime;
	}
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
524 */ 525 void 526 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 527 { 528 klgrpset_t changed; 529 cpu_t *cp; 530 lgrp_id_t id; 531 int rc; 532 533 switch (event) { 534 /* 535 * The following (re)configuration events are common code 536 * initiated. lgrp_plat_config() is called here to inform the 537 * platform of the reconfiguration event. 538 */ 539 case LGRP_CONFIG_CPU_ADD: 540 cp = (cpu_t *)resource; 541 542 /* 543 * Initialize the new CPU's lgrp related next/prev 544 * links, and give it a bootstrap lpl so that it can 545 * survive should it need to enter the dispatcher. 546 */ 547 cp->cpu_next_lpl = cp; 548 cp->cpu_prev_lpl = cp; 549 cp->cpu_next_lgrp = cp; 550 cp->cpu_prev_lgrp = cp; 551 cp->cpu_lpl = lpl_bootstrap; 552 553 lgrp_plat_config(event, resource); 554 atomic_add_32(&lgrp_gen, 1); 555 556 break; 557 case LGRP_CONFIG_CPU_DEL: 558 lgrp_plat_config(event, resource); 559 atomic_add_32(&lgrp_gen, 1); 560 561 break; 562 case LGRP_CONFIG_CPU_ONLINE: 563 cp = (cpu_t *)resource; 564 lgrp_cpu_init(cp); 565 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 566 rc = lpl_topo_verify(cp->cpu_part); 567 if (rc != LPL_TOPO_CORRECT) { 568 panic("lpl_topo_verify failed: %d", rc); 569 } 570 lgrp_plat_config(event, resource); 571 atomic_add_32(&lgrp_gen, 1); 572 573 break; 574 case LGRP_CONFIG_CPU_OFFLINE: 575 cp = (cpu_t *)resource; 576 id = cp->cpu_lpl->lpl_lgrpid; 577 lgrp_part_del_cpu(cp); 578 lgrp_cpu_fini(cp, id); 579 rc = lpl_topo_verify(cp->cpu_part); 580 if (rc != LPL_TOPO_CORRECT) { 581 panic("lpl_topo_verify failed: %d", rc); 582 } 583 lgrp_plat_config(event, resource); 584 atomic_add_32(&lgrp_gen, 1); 585 586 break; 587 case LGRP_CONFIG_CPUPART_ADD: 588 cp = (cpu_t *)resource; 589 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 590 rc = lpl_topo_verify(cp->cpu_part); 591 if (rc != LPL_TOPO_CORRECT) { 592 panic("lpl_topo_verify failed: %d", rc); 593 } 594 lgrp_plat_config(event, resource); 595 596 break; 597 case LGRP_CONFIG_CPUPART_DEL: 598 cp = (cpu_t *)resource; 599 lgrp_part_del_cpu((cpu_t *)resource); 600 rc = lpl_topo_verify(cp->cpu_part); 601 if (rc != LPL_TOPO_CORRECT) { 602 panic("lpl_topo_verify failed: %d", rc); 603 } 604 lgrp_plat_config(event, resource); 605 606 break; 607 /* 608 * The following events are initiated by the memnode 609 * subsystem. 
610 */ 611 case LGRP_CONFIG_MEM_ADD: 612 lgrp_mem_init((int)resource, where, B_FALSE); 613 atomic_add_32(&lgrp_gen, 1); 614 615 break; 616 case LGRP_CONFIG_MEM_DEL: 617 lgrp_mem_fini((int)resource, where, B_FALSE); 618 atomic_add_32(&lgrp_gen, 1); 619 620 break; 621 case LGRP_CONFIG_MEM_RENAME: { 622 lgrp_config_mem_rename_t *ren_arg = 623 (lgrp_config_mem_rename_t *)where; 624 625 lgrp_mem_rename((int)resource, 626 ren_arg->lmem_rename_from, 627 ren_arg->lmem_rename_to); 628 atomic_add_32(&lgrp_gen, 1); 629 630 break; 631 } 632 case LGRP_CONFIG_GEN_UPDATE: 633 atomic_add_32(&lgrp_gen, 1); 634 635 break; 636 case LGRP_CONFIG_FLATTEN: 637 if (where == 0) 638 lgrp_topo_levels = (int)resource; 639 else 640 (void) lgrp_topo_flatten(resource, 641 lgrp_table, lgrp_alloc_max, &changed); 642 643 break; 644 /* 645 * Update any lgroups with old latency to new latency 646 */ 647 case LGRP_CONFIG_LAT_CHANGE_ALL: 648 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 649 (u_longlong_t)where); 650 651 break; 652 /* 653 * Update lgroup with specified lgroup platform handle to have 654 * new latency 655 */ 656 case LGRP_CONFIG_LAT_CHANGE: 657 lgrp_latency_change((lgrp_handle_t)resource, 0, 658 (u_longlong_t)where); 659 660 break; 661 case LGRP_CONFIG_NOP: 662 663 break; 664 default: 665 break; 666 } 667 668 } 669 670 /* 671 * Called to add lgrp info into cpu structure from cpu_add_unit; 672 * do not assume cpu is in cpu[] yet! 673 * 674 * CPUs are brought online with all other CPUs paused so we can't 675 * allocate memory or we could deadlock the system, so we rely on 676 * the platform to statically allocate as much space as we need 677 * for the lgrp structs and stats. 678 */ 679 static void 680 lgrp_cpu_init(struct cpu *cp) 681 { 682 klgrpset_t changed; 683 int count; 684 lgrp_handle_t hand; 685 int first_cpu; 686 lgrp_t *my_lgrp; 687 lgrp_id_t lgrpid; 688 struct cpu *cptr; 689 690 /* 691 * This is the first time through if the resource set 692 * for the root lgroup is empty. After cpu0 has been 693 * initially added to an lgroup, the root's CPU resource 694 * set can never be empty, since the system's last CPU 695 * cannot be offlined. 696 */ 697 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 698 /* 699 * First time through. 700 */ 701 first_cpu = 1; 702 } else { 703 /* 704 * If cpu0 needs to move lgroups, we may come 705 * through here again, at which time cpu_lock won't 706 * be held, and lgrp_initialized will be false. 707 */ 708 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 709 ASSERT(cp->cpu_part != NULL); 710 first_cpu = 0; 711 } 712 713 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 714 my_lgrp = lgrp_hand_to_lgrp(hand); 715 716 if (my_lgrp == NULL) { 717 /* 718 * Create new lgrp and add it to lgroup topology 719 */ 720 my_lgrp = lgrp_create(); 721 my_lgrp->lgrp_plathand = hand; 722 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 723 lgrpid = my_lgrp->lgrp_id; 724 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 725 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 726 727 count = 0; 728 klgrpset_clear(changed); 729 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 730 &changed); 731 /* 732 * May have added new intermediate lgroups, so need to add 733 * resources other than CPUs which are added below 734 */ 735 (void) lgrp_mnode_update(changed, NULL); 736 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 737 > 0) { 738 /* 739 * Leaf lgroup was created, but latency wasn't available 740 * then. 
So, set latency for it and fill in rest of lgroup 741 * topology now that we know how far it is from other leaf 742 * lgroups. 743 */ 744 lgrpid = my_lgrp->lgrp_id; 745 klgrpset_clear(changed); 746 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 747 lgrpid)) 748 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 749 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 750 &changed); 751 752 /* 753 * May have added new intermediate lgroups, so need to add 754 * resources other than CPUs which are added below 755 */ 756 (void) lgrp_mnode_update(changed, NULL); 757 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 758 my_lgrp->lgrp_id)) { 759 int i; 760 761 /* 762 * Update existing lgroup and lgroups containing it with CPU 763 * resource 764 */ 765 lgrpid = my_lgrp->lgrp_id; 766 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 767 for (i = 0; i <= lgrp_alloc_max; i++) { 768 lgrp_t *lgrp; 769 770 lgrp = lgrp_table[i]; 771 if (!LGRP_EXISTS(lgrp) || 772 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 773 continue; 774 775 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 776 } 777 } 778 779 lgrpid = my_lgrp->lgrp_id; 780 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 781 782 /* 783 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 784 * end up in lpl for lgroup 0 whether it is supposed to be in there or 785 * not since none of lgroup IDs in the lpl's have been set yet. 786 */ 787 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 788 cp->cpu_lpl->lpl_lgrpid = lgrpid; 789 790 /* 791 * link the CPU into the lgrp's CPU list 792 */ 793 if (my_lgrp->lgrp_cpucnt == 0) { 794 my_lgrp->lgrp_cpu = cp; 795 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 796 } else { 797 cptr = my_lgrp->lgrp_cpu; 798 cp->cpu_next_lgrp = cptr; 799 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 800 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 801 cptr->cpu_prev_lgrp = cp; 802 } 803 my_lgrp->lgrp_cpucnt++; 804 } 805 806 lgrp_t * 807 lgrp_create(void) 808 { 809 lgrp_t *my_lgrp; 810 lgrp_id_t lgrpid; 811 int i; 812 813 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 814 815 /* 816 * Find an open slot in the lgroup table and recycle unused lgroup 817 * left there if any 818 */ 819 my_lgrp = NULL; 820 if (lgrp_alloc_hint == -1) 821 /* 822 * Allocate from end when hint not set yet because no lgroups 823 * have been deleted yet 824 */ 825 lgrpid = nlgrps++; 826 else { 827 /* 828 * Start looking for next open slot from hint and leave hint 829 * at slot allocated 830 */ 831 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 832 my_lgrp = lgrp_table[i]; 833 if (!LGRP_EXISTS(my_lgrp)) { 834 lgrpid = i; 835 nlgrps++; 836 break; 837 } 838 } 839 lgrp_alloc_hint = lgrpid; 840 } 841 842 /* 843 * Keep track of max lgroup ID allocated so far to cut down on searches 844 */ 845 if (lgrpid > lgrp_alloc_max) 846 lgrp_alloc_max = lgrpid; 847 848 /* 849 * Need to allocate new lgroup if next open slot didn't have one 850 * for recycling 851 */ 852 if (my_lgrp == NULL) 853 my_lgrp = lgrp_plat_alloc(lgrpid); 854 855 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 856 panic("Too many lgrps for platform (%d)", nlgrps); 857 858 my_lgrp->lgrp_id = lgrpid; 859 my_lgrp->lgrp_latency = 0; 860 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 861 my_lgrp->lgrp_parent = NULL; 862 my_lgrp->lgrp_childcnt = 0; 863 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 864 my_lgrp->lgrp_nmnodes = 0; 865 klgrpset_clear(my_lgrp->lgrp_children); 866 klgrpset_clear(my_lgrp->lgrp_leaves); 867 for (i = 0; i < LGRP_RSRC_COUNT; 
i++) 868 klgrpset_clear(my_lgrp->lgrp_set[i]); 869 870 my_lgrp->lgrp_cpu = NULL; 871 my_lgrp->lgrp_cpucnt = 0; 872 873 if (my_lgrp->lgrp_kstat != NULL) 874 lgrp_kstat_reset(lgrpid); 875 876 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 877 878 return (my_lgrp); 879 } 880 881 void 882 lgrp_destroy(lgrp_t *lgrp) 883 { 884 int i; 885 886 /* 887 * Unless this lgroup is being destroyed on behalf of 888 * the boot CPU, cpu_lock must be held 889 */ 890 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 891 892 if (nlgrps == 1) 893 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 894 895 if (!LGRP_EXISTS(lgrp)) 896 return; 897 898 /* 899 * Set hint to lgroup being deleted and try to keep lower numbered 900 * hints to facilitate finding empty slots 901 */ 902 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 903 lgrp_alloc_hint = lgrp->lgrp_id; 904 905 /* 906 * Mark this lgroup to be recycled by setting its lgroup ID to 907 * LGRP_NONE and clear relevant fields 908 */ 909 lgrp->lgrp_id = LGRP_NONE; 910 lgrp->lgrp_latency = 0; 911 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 912 lgrp->lgrp_parent = NULL; 913 lgrp->lgrp_childcnt = 0; 914 915 klgrpset_clear(lgrp->lgrp_children); 916 klgrpset_clear(lgrp->lgrp_leaves); 917 for (i = 0; i < LGRP_RSRC_COUNT; i++) 918 klgrpset_clear(lgrp->lgrp_set[i]); 919 920 lgrp->lgrp_mnodes = (mnodeset_t)0; 921 lgrp->lgrp_nmnodes = 0; 922 923 lgrp->lgrp_cpu = NULL; 924 lgrp->lgrp_cpucnt = 0; 925 926 nlgrps--; 927 } 928 929 /* 930 * Initialize kstat data. Called from lgrp intialization code. 931 */ 932 static void 933 lgrp_kstat_init(void) 934 { 935 lgrp_stat_t stat; 936 937 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 938 939 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 940 kstat_named_init(&lgrp_kstat_data[stat], 941 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 942 } 943 944 /* 945 * initialize an lgrp's kstats if needed 946 * called with cpu_lock held but not with cpus paused. 947 * we don't tear these down now because we don't know about 948 * memory leaving the lgrp yet... 949 */ 950 951 void 952 lgrp_kstat_create(cpu_t *cp) 953 { 954 kstat_t *lgrp_kstat; 955 lgrp_id_t lgrpid; 956 lgrp_t *my_lgrp; 957 958 ASSERT(MUTEX_HELD(&cpu_lock)); 959 960 lgrpid = cp->cpu_lpl->lpl_lgrpid; 961 my_lgrp = lgrp_table[lgrpid]; 962 963 if (my_lgrp->lgrp_kstat != NULL) 964 return; /* already initialized */ 965 966 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 967 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 968 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 969 970 if (lgrp_kstat != NULL) { 971 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 972 lgrp_kstat->ks_private = my_lgrp; 973 lgrp_kstat->ks_data = &lgrp_kstat_data; 974 lgrp_kstat->ks_update = lgrp_kstat_extract; 975 my_lgrp->lgrp_kstat = lgrp_kstat; 976 kstat_install(lgrp_kstat); 977 } 978 } 979 980 /* 981 * this will do something when we manage to remove now unused lgrps 982 */ 983 984 /* ARGSUSED */ 985 void 986 lgrp_kstat_destroy(cpu_t *cp) 987 { 988 ASSERT(MUTEX_HELD(&cpu_lock)); 989 } 990 991 /* 992 * Called when a CPU is off-lined. 993 */ 994 static void 995 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 996 { 997 lgrp_t *my_lgrp; 998 struct cpu *prev; 999 struct cpu *next; 1000 1001 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 1002 1003 prev = cp->cpu_prev_lgrp; 1004 next = cp->cpu_next_lgrp; 1005 1006 prev->cpu_next_lgrp = next; 1007 next->cpu_prev_lgrp = prev; 1008 1009 /* 1010 * just because I'm paranoid doesn't mean... 
1011 */ 1012 1013 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1014 1015 my_lgrp = lgrp_table[lgrpid]; 1016 my_lgrp->lgrp_cpucnt--; 1017 1018 /* 1019 * Removing last CPU in lgroup, so update lgroup topology 1020 */ 1021 if (my_lgrp->lgrp_cpucnt == 0) { 1022 klgrpset_t changed; 1023 int count; 1024 int i; 1025 1026 my_lgrp->lgrp_cpu = NULL; 1027 1028 /* 1029 * Remove this lgroup from its lgroup CPU resources and remove 1030 * lgroup from lgroup topology if it doesn't have any more 1031 * resources in it now 1032 */ 1033 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1034 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1035 count = 0; 1036 klgrpset_clear(changed); 1037 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1038 lgrp_alloc_max + 1, &changed); 1039 return; 1040 } 1041 1042 /* 1043 * This lgroup isn't empty, so just remove it from CPU 1044 * resources of any lgroups that contain it as such 1045 */ 1046 for (i = 0; i <= lgrp_alloc_max; i++) { 1047 lgrp_t *lgrp; 1048 1049 lgrp = lgrp_table[i]; 1050 if (!LGRP_EXISTS(lgrp) || 1051 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1052 lgrpid)) 1053 continue; 1054 1055 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1056 } 1057 return; 1058 } 1059 1060 if (my_lgrp->lgrp_cpu == cp) 1061 my_lgrp->lgrp_cpu = next; 1062 1063 } 1064 1065 /* 1066 * Update memory nodes in target lgroups and return ones that get changed 1067 */ 1068 int 1069 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1070 { 1071 int count; 1072 int i; 1073 int j; 1074 lgrp_t *lgrp; 1075 lgrp_t *lgrp_rsrc; 1076 1077 count = 0; 1078 if (changed) 1079 klgrpset_clear(*changed); 1080 1081 if (klgrpset_isempty(target)) 1082 return (0); 1083 1084 /* 1085 * Find each lgroup in target lgroups 1086 */ 1087 for (i = 0; i <= lgrp_alloc_max; i++) { 1088 /* 1089 * Skip any lgroups that don't exist or aren't in target group 1090 */ 1091 lgrp = lgrp_table[i]; 1092 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1093 continue; 1094 } 1095 1096 /* 1097 * Initialize memnodes for intermediate lgroups to 0 1098 * and update them from scratch since they may have completely 1099 * changed 1100 */ 1101 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1102 lgrp->lgrp_mnodes = (mnodeset_t)0; 1103 lgrp->lgrp_nmnodes = 0; 1104 } 1105 1106 /* 1107 * Update memory nodes of of target lgroup with memory nodes 1108 * from each lgroup in its lgroup memory resource set 1109 */ 1110 for (j = 0; j <= lgrp_alloc_max; j++) { 1111 int k; 1112 1113 /* 1114 * Skip any lgroups that don't exist or aren't in 1115 * memory resources of target lgroup 1116 */ 1117 lgrp_rsrc = lgrp_table[j]; 1118 if (!LGRP_EXISTS(lgrp_rsrc) || 1119 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1120 j)) 1121 continue; 1122 1123 /* 1124 * Update target lgroup's memnodes to include memnodes 1125 * of this lgroup 1126 */ 1127 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1128 mnodeset_t mnode_mask; 1129 1130 mnode_mask = (mnodeset_t)1 << k; 1131 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1132 !(lgrp->lgrp_mnodes & mnode_mask)) { 1133 lgrp->lgrp_mnodes |= mnode_mask; 1134 lgrp->lgrp_nmnodes++; 1135 } 1136 } 1137 count++; 1138 if (changed) 1139 klgrpset_add(*changed, lgrp->lgrp_id); 1140 } 1141 } 1142 1143 return (count); 1144 } 1145 1146 /* 1147 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1148 * is moved from one board to another. The "from" and "to" arguments specify the 1149 * source and the destination of the move. 
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology, which changes as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in
 * the target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held, which prevents lgrp_mem_init() from
 * re-inserting the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
1236 * 1237 * NOTE: in special case of copy-rename of the only remaining mnode, 1238 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1239 * recognize this case and continue as usual, but skip the update to 1240 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1241 * in topology, temporarily introduced by lgrp_mem_fini(). 1242 */ 1243 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1244 lgrp_root->lgrp_mnodes & mnodes_mask) { 1245 if (drop_lock) 1246 mutex_exit(&cpu_lock); 1247 return; 1248 } 1249 1250 /* 1251 * Update lgroup topology with new memory resources, keeping track of 1252 * which lgroups change 1253 */ 1254 count = 0; 1255 klgrpset_clear(changed); 1256 my_lgrp = lgrp_hand_to_lgrp(hand); 1257 if (my_lgrp == NULL) { 1258 /* new lgrp */ 1259 my_lgrp = lgrp_create(); 1260 lgrpid = my_lgrp->lgrp_id; 1261 my_lgrp->lgrp_plathand = hand; 1262 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1263 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1264 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1265 1266 if (need_synch) 1267 pause_cpus(NULL); 1268 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1269 &changed); 1270 if (need_synch) 1271 start_cpus(); 1272 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1273 > 0) { 1274 /* 1275 * Leaf lgroup was created, but latency wasn't available 1276 * then. So, set latency for it and fill in rest of lgroup 1277 * topology now that we know how far it is from other leaf 1278 * lgroups. 1279 */ 1280 klgrpset_clear(changed); 1281 lgrpid = my_lgrp->lgrp_id; 1282 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1283 lgrpid)) 1284 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1285 if (need_synch) 1286 pause_cpus(NULL); 1287 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1288 &changed); 1289 if (need_synch) 1290 start_cpus(); 1291 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1292 my_lgrp->lgrp_id)) { 1293 /* 1294 * Add new lgroup memory resource to existing lgroup 1295 */ 1296 lgrpid = my_lgrp->lgrp_id; 1297 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1298 klgrpset_add(changed, lgrpid); 1299 count++; 1300 for (i = 0; i <= lgrp_alloc_max; i++) { 1301 lgrp_t *lgrp; 1302 1303 lgrp = lgrp_table[i]; 1304 if (!LGRP_EXISTS(lgrp) || 1305 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1306 continue; 1307 1308 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1309 klgrpset_add(changed, lgrp->lgrp_id); 1310 count++; 1311 } 1312 } 1313 1314 /* 1315 * Add memory node to lgroup and remove lgroup from ones that need 1316 * to be updated 1317 */ 1318 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1319 my_lgrp->lgrp_mnodes |= mnodes_mask; 1320 my_lgrp->lgrp_nmnodes++; 1321 } 1322 klgrpset_del(changed, lgrpid); 1323 1324 /* 1325 * Update memory node information for all lgroups that changed and 1326 * contain new memory node as a resource 1327 */ 1328 if (count) 1329 (void) lgrp_mnode_update(changed, NULL); 1330 1331 if (drop_lock) 1332 mutex_exit(&cpu_lock); 1333 } 1334 1335 /* 1336 * Called to indicate that the lgroup associated with the platform 1337 * handle "hand" no longer contains given memory node 1338 * 1339 * LOCKING for this routine is a bit tricky. Usually it is called without 1340 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1341 * callers. During DR of the board containing the caged memory it may be called 1342 * with cpu_lock already held and CPUs paused. 
1343 * 1344 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1345 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1346 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1347 * the same mnode back into the topology. See lgrp_mem_rename() and 1348 * lgrp_mem_init() for additional details. 1349 */ 1350 void 1351 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1352 { 1353 klgrpset_t changed; 1354 int count; 1355 int i; 1356 lgrp_t *my_lgrp; 1357 lgrp_id_t lgrpid; 1358 mnodeset_t mnodes_mask; 1359 boolean_t drop_lock = B_FALSE; 1360 boolean_t need_synch = B_FALSE; 1361 1362 /* 1363 * Grab CPU lock (if we haven't already) 1364 */ 1365 if (!MUTEX_HELD(&cpu_lock)) { 1366 mutex_enter(&cpu_lock); 1367 drop_lock = B_TRUE; 1368 } 1369 1370 /* 1371 * This routine may be called from a context where we already 1372 * hold cpu_lock and have already paused cpus. 1373 */ 1374 if (!cpus_paused()) 1375 need_synch = B_TRUE; 1376 1377 my_lgrp = lgrp_hand_to_lgrp(hand); 1378 1379 /* 1380 * The lgrp *must* be pre-existing 1381 */ 1382 ASSERT(my_lgrp != NULL); 1383 1384 /* 1385 * Delete memory node from lgroups which contain it 1386 */ 1387 mnodes_mask = ((mnodeset_t)1 << mnode); 1388 for (i = 0; i <= lgrp_alloc_max; i++) { 1389 lgrp_t *lgrp = lgrp_table[i]; 1390 /* 1391 * Skip any non-existent lgroups and any lgroups that don't 1392 * contain leaf lgroup of memory as a memory resource 1393 */ 1394 if (!LGRP_EXISTS(lgrp) || 1395 !(lgrp->lgrp_mnodes & mnodes_mask)) 1396 continue; 1397 1398 /* 1399 * Avoid removing the last mnode from the root in the DR 1400 * copy-rename case. See lgrp_mem_rename() for details. 1401 */ 1402 if (is_copy_rename && 1403 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1404 continue; 1405 1406 /* 1407 * Remove memory node from lgroup. 1408 */ 1409 lgrp->lgrp_mnodes &= ~mnodes_mask; 1410 lgrp->lgrp_nmnodes--; 1411 ASSERT(lgrp->lgrp_nmnodes >= 0); 1412 } 1413 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1414 1415 /* 1416 * Don't need to update lgroup topology if this lgroup still has memory. 1417 * 1418 * In the special case of DR copy-rename with the only mnode being 1419 * removed, the lgrp_mnodes for the root is always non-zero, but we 1420 * still need to update the lgroup topology. 
1421 */ 1422 if ((my_lgrp->lgrp_nmnodes > 0) && 1423 !(is_copy_rename && 1424 (my_lgrp == lgrp_root) && 1425 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1426 if (drop_lock) 1427 mutex_exit(&cpu_lock); 1428 return; 1429 } 1430 1431 /* 1432 * This lgroup does not contain any memory now 1433 */ 1434 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1435 1436 /* 1437 * Remove this lgroup from lgroup topology if it does not contain any 1438 * resources now 1439 */ 1440 lgrpid = my_lgrp->lgrp_id; 1441 count = 0; 1442 klgrpset_clear(changed); 1443 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1444 /* 1445 * Delete lgroup when no more resources 1446 */ 1447 if (need_synch) 1448 pause_cpus(NULL); 1449 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1450 lgrp_alloc_max + 1, &changed); 1451 ASSERT(count > 0); 1452 if (need_synch) 1453 start_cpus(); 1454 } else { 1455 /* 1456 * Remove lgroup from memory resources of any lgroups that 1457 * contain it as such 1458 */ 1459 for (i = 0; i <= lgrp_alloc_max; i++) { 1460 lgrp_t *lgrp; 1461 1462 lgrp = lgrp_table[i]; 1463 if (!LGRP_EXISTS(lgrp) || 1464 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1465 lgrpid)) 1466 continue; 1467 1468 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1469 } 1470 } 1471 if (drop_lock) 1472 mutex_exit(&cpu_lock); 1473 } 1474 1475 /* 1476 * Return lgroup with given platform handle 1477 */ 1478 lgrp_t * 1479 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1480 { 1481 int i; 1482 lgrp_t *lgrp; 1483 1484 if (hand == LGRP_NULL_HANDLE) 1485 return (NULL); 1486 1487 for (i = 0; i <= lgrp_alloc_max; i++) { 1488 lgrp = lgrp_table[i]; 1489 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1490 return (lgrp); 1491 } 1492 return (NULL); 1493 } 1494 1495 /* 1496 * Return the home lgroup of the current thread. 1497 * We must do this with kernel preemption disabled, since we don't want our 1498 * thread to be re-homed while we're poking around with its lpl, and the lpl 1499 * should never be NULL. 1500 * 1501 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1502 * is enabled because of DR. Callers can use disable kernel preemption 1503 * around this call to guarantee that the lgroup will be valid beyond this 1504 * routine, since kernel preemption can be recursive. 1505 */ 1506 lgrp_t * 1507 lgrp_home_lgrp(void) 1508 { 1509 lgrp_t *lgrp; 1510 lpl_t *lpl; 1511 1512 kpreempt_disable(); 1513 1514 lpl = curthread->t_lpl; 1515 ASSERT(lpl != NULL); 1516 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1517 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1518 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1519 1520 kpreempt_enable(); 1521 1522 return (lgrp); 1523 } 1524 1525 /* 1526 * Return ID of home lgroup for given thread 1527 * (See comments for lgrp_home_lgrp() for special care and handling 1528 * instructions) 1529 */ 1530 lgrp_id_t 1531 lgrp_home_id(kthread_t *t) 1532 { 1533 lgrp_id_t lgrp; 1534 lpl_t *lpl; 1535 1536 ASSERT(t != NULL); 1537 /* 1538 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1539 * cannot since the HAT layer can call into this routine to 1540 * determine the locality for its data structures in the context 1541 * of a page fault. 
1542 */ 1543 1544 kpreempt_disable(); 1545 1546 lpl = t->t_lpl; 1547 ASSERT(lpl != NULL); 1548 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1549 lgrp = lpl->lpl_lgrpid; 1550 1551 kpreempt_enable(); 1552 1553 return (lgrp); 1554 } 1555 1556 /* 1557 * Return lgroup containing the physical memory for the given page frame number 1558 */ 1559 lgrp_t * 1560 lgrp_pfn_to_lgrp(pfn_t pfn) 1561 { 1562 lgrp_handle_t hand; 1563 int i; 1564 lgrp_t *lgrp; 1565 1566 hand = lgrp_plat_pfn_to_hand(pfn); 1567 if (hand != LGRP_NULL_HANDLE) 1568 for (i = 0; i <= lgrp_alloc_max; i++) { 1569 lgrp = lgrp_table[i]; 1570 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1571 return (lgrp); 1572 } 1573 return (NULL); 1574 } 1575 1576 /* 1577 * Return lgroup containing the physical memory for the given page frame number 1578 */ 1579 lgrp_t * 1580 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1581 { 1582 lgrp_handle_t hand; 1583 int i; 1584 lgrp_t *lgrp; 1585 pfn_t pfn; 1586 1587 pfn = btop(physaddr); 1588 hand = lgrp_plat_pfn_to_hand(pfn); 1589 if (hand != LGRP_NULL_HANDLE) 1590 for (i = 0; i <= lgrp_alloc_max; i++) { 1591 lgrp = lgrp_table[i]; 1592 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1593 return (lgrp); 1594 } 1595 return (NULL); 1596 } 1597 1598 /* 1599 * Return the leaf lgroup containing the given CPU 1600 * 1601 * The caller needs to take precautions necessary to prevent 1602 * "cpu", and it's lpl from going away across a call to this function. 1603 * hint: kpreempt_disable()/kpreempt_enable() 1604 */ 1605 static lgrp_t * 1606 lgrp_cpu_to_lgrp(cpu_t *cpu) 1607 { 1608 return (cpu->cpu_lpl->lpl_lgrp); 1609 } 1610 1611 /* 1612 * Return the sum of the partition loads in an lgrp divided by 1613 * the number of CPUs in the lgrp. This is our best approximation 1614 * of an 'lgroup load average' for a useful per-lgroup kstat. 1615 */ 1616 static uint64_t 1617 lgrp_sum_loadavgs(lgrp_t *lgrp) 1618 { 1619 cpu_t *cpu; 1620 int ncpu; 1621 uint64_t loads = 0; 1622 1623 mutex_enter(&cpu_lock); 1624 1625 cpu = lgrp->lgrp_cpu; 1626 ncpu = lgrp->lgrp_cpucnt; 1627 1628 if (cpu == NULL || ncpu == 0) { 1629 mutex_exit(&cpu_lock); 1630 return (0ull); 1631 } 1632 1633 do { 1634 loads += cpu->cpu_lpl->lpl_loadavg; 1635 cpu = cpu->cpu_next_lgrp; 1636 } while (cpu != lgrp->lgrp_cpu); 1637 1638 mutex_exit(&cpu_lock); 1639 1640 return (loads / ncpu); 1641 } 1642 1643 void 1644 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1645 { 1646 struct lgrp_stats *pstats; 1647 1648 /* 1649 * Verify that the caller isn't trying to add to 1650 * a statistic for an lgroup that has gone away 1651 */ 1652 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1653 return; 1654 1655 pstats = &lgrp_stats[lgrpid]; 1656 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1657 } 1658 1659 int64_t 1660 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1661 { 1662 uint64_t val; 1663 struct lgrp_stats *pstats; 1664 1665 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1666 return ((int64_t)0); 1667 1668 pstats = &lgrp_stats[lgrpid]; 1669 LGRP_STAT_READ(pstats, stat, val); 1670 return (val); 1671 } 1672 1673 /* 1674 * Reset all kstats for lgrp specified by its lgrpid. 
1675 */ 1676 static void 1677 lgrp_kstat_reset(lgrp_id_t lgrpid) 1678 { 1679 lgrp_stat_t stat; 1680 1681 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1682 return; 1683 1684 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1685 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1686 } 1687 } 1688 1689 /* 1690 * Collect all per-lgrp statistics for the lgrp associated with this 1691 * kstat, and store them in the ks_data array. 1692 * 1693 * The superuser can reset all the running counter statistics for an 1694 * lgrp by writing to any of the lgrp's stats. 1695 */ 1696 static int 1697 lgrp_kstat_extract(kstat_t *ksp, int rw) 1698 { 1699 lgrp_stat_t stat; 1700 struct kstat_named *ksd; 1701 lgrp_t *lgrp; 1702 lgrp_id_t lgrpid; 1703 1704 lgrp = (lgrp_t *)ksp->ks_private; 1705 1706 ksd = (struct kstat_named *)ksp->ks_data; 1707 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1708 1709 lgrpid = lgrp->lgrp_id; 1710 1711 if (lgrpid == LGRP_NONE) { 1712 /* 1713 * Return all zeroes as stats for freed lgrp. 1714 */ 1715 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1716 ksd[stat].value.i64 = 0; 1717 } 1718 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1719 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1720 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1721 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1722 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1723 } else if (rw != KSTAT_WRITE) { 1724 /* 1725 * Handle counter stats 1726 */ 1727 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1728 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1729 } 1730 1731 /* 1732 * Handle kernel data snapshot stats 1733 */ 1734 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1735 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1736 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1737 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1738 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1739 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1740 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1741 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1742 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1743 lgrp_loadavg_max_effect; 1744 } else { 1745 lgrp_kstat_reset(lgrpid); 1746 } 1747 1748 return (0); 1749 } 1750 1751 int 1752 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1753 { 1754 cpu_t *cp; 1755 1756 mutex_enter(&cpu_lock); 1757 1758 if ((cp = cpu_get(id)) == NULL) { 1759 mutex_exit(&cpu_lock); 1760 return (EINVAL); 1761 } 1762 1763 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1764 mutex_exit(&cpu_lock); 1765 return (EINVAL); 1766 } 1767 1768 ASSERT(cp->cpu_lpl != NULL); 1769 1770 *lp = cp->cpu_lpl->lpl_lgrpid; 1771 1772 mutex_exit(&cpu_lock); 1773 1774 return (0); 1775 } 1776 1777 int 1778 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1779 { 1780 cpu_t *cp; 1781 1782 mutex_enter(&cpu_lock); 1783 1784 if ((cp = cpu_get(id)) == NULL) { 1785 mutex_exit(&cpu_lock); 1786 return (EINVAL); 1787 } 1788 1789 ASSERT(cp->cpu_lpl != NULL); 1790 1791 *lp = cp->cpu_lpl->lpl_loadavg; 1792 1793 mutex_exit(&cpu_lock); 1794 1795 return (0); 1796 } 1797 1798 /* 1799 * Add a resource named by lpl_leaf to rset of lpl_target 1800 * 1801 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1802 * resource. It is adjusted here, as this is presently the only place that we 1803 * can be certain a resource addition has succeeded. 1804 * 1805 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1806 * list in order until it reaches a NULL. 
(This list is required to be NULL 1807 * terminated, too). This is done so that we can mark start pos + 1, so that 1808 * each lpl is traversed sequentially, but in a different order. We hope this 1809 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 1810 */ 1811 1812 void 1813 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1814 { 1815 int i; 1816 int entry_slot = 0; 1817 1818 /* return if leaf is already present */ 1819 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1820 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1821 return; 1822 } 1823 1824 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1825 lpl_leaf->lpl_lgrpid) { 1826 break; 1827 } 1828 } 1829 1830 /* insert leaf, update counts */ 1831 entry_slot = i; 1832 i = lpl_target->lpl_nrset++; 1833 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1834 panic("More leaf lgrps in system than are supported!\n"); 1835 } 1836 1837 /* 1838 * Start at the end of the rset array and work backwards towards the 1839 * slot into which the new lpl will be inserted. This effectively 1840 * preserves the current ordering by scooting everybody over one entry, 1841 * and placing the new entry into the space created. 1842 */ 1843 1844 while (i-- > entry_slot) { 1845 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1846 } 1847 1848 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1849 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1850 } 1851 1852 /* 1853 * Update each of lpl_parent's children with a proper hint and 1854 * a reference to their parent. 1855 * The lgrp topology is used as the reference since it is fully 1856 * consistent and correct at this point. 1857 * 1858 * Each child's hint will reference an element in lpl_parent's 1859 * rset that designates where the child should start searching 1860 * for CPU resources. The hint selected is the highest order leaf present 1861 * in the child's lineage. 1862 * 1863 * This should be called after any potential change in lpl_parent's 1864 * rset. 1865 */ 1866 static void 1867 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1868 { 1869 klgrpset_t children, leaves; 1870 lpl_t *lpl; 1871 int hint; 1872 int i, j; 1873 1874 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1875 if (klgrpset_isempty(children)) 1876 return; /* nothing to do */ 1877 1878 for (i = 0; i <= lgrp_alloc_max; i++) { 1879 if (klgrpset_ismember(children, i)) { 1880 1881 /* 1882 * Given the set of leaves in this child's lineage, 1883 * find the highest order leaf present in the parent's 1884 * rset. Select this as the hint for the child. 1885 */ 1886 leaves = lgrp_table[i]->lgrp_leaves; 1887 hint = 0; 1888 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1889 lpl = lpl_parent->lpl_rset[j]; 1890 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1891 hint = j; 1892 } 1893 cp->cp_lgrploads[i].lpl_hint = hint; 1894 1895 /* 1896 * (Re)set the parent. It may be incorrect if 1897 * lpl_parent is new in the topology. 1898 */ 1899 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1900 } 1901 } 1902 } 1903 1904 /* 1905 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1906 * 1907 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1908 * resource. The values are adjusted here, as this is the only place that we can 1909 * be certain a resource was successfully deleted. 
1910 */ 1911 void 1912 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1913 { 1914 int i; 1915 1916 /* find leaf in intermediate node */ 1917 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1918 if (lpl_target->lpl_rset[i] == lpl_leaf) 1919 break; 1920 } 1921 1922 /* return if leaf not found */ 1923 if (lpl_target->lpl_rset[i] != lpl_leaf) 1924 return; 1925 1926 /* prune leaf, compress array */ 1927 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1928 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1929 lpl_target->lpl_ncpu--; 1930 do { 1931 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1932 } while (i++ < lpl_target->lpl_nrset); 1933 } 1934 1935 /* 1936 * Check to see if the resource set of the target lpl contains the 1937 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1938 */ 1939 1940 int 1941 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1942 { 1943 int i; 1944 1945 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1946 if (lpl_target->lpl_rset[i] == lpl_leaf) 1947 return (1); 1948 } 1949 1950 return (0); 1951 } 1952 1953 /* 1954 * Called when we change cpu lpl membership. This increments or decrements the 1955 * per-cpu counter in every lpl in which our leaf appears. 1956 */ 1957 void 1958 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1959 { 1960 cpupart_t *cpupart; 1961 lgrp_t *lgrp_leaf; 1962 lgrp_t *lgrp_cur; 1963 lpl_t *lpl_leaf; 1964 lpl_t *lpl_cur; 1965 int i; 1966 1967 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1968 1969 cpupart = cp->cpu_part; 1970 lpl_leaf = cp->cpu_lpl; 1971 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1972 1973 for (i = 0; i <= lgrp_alloc_max; i++) { 1974 lgrp_cur = lgrp_table[i]; 1975 1976 /* 1977 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 1978 * for the cpu in question, or if the current lgrp and leaf 1979 * don't share the same resources. 1980 */ 1981 1982 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 1983 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 1984 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 1985 continue; 1986 1987 1988 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 1989 1990 if (lpl_cur->lpl_nrset > 0) { 1991 if (act == LPL_INCREMENT) { 1992 lpl_cur->lpl_ncpu++; 1993 } else if (act == LPL_DECREMENT) { 1994 lpl_cur->lpl_ncpu--; 1995 } 1996 } 1997 } 1998 } 1999 2000 /* 2001 * Initialize lpl with given resources and specified lgrp 2002 */ 2003 2004 void 2005 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2006 { 2007 lpl->lpl_lgrpid = lgrp->lgrp_id; 2008 lpl->lpl_loadavg = 0; 2009 if (lpl == lpl_leaf) 2010 lpl->lpl_ncpu = 1; 2011 else 2012 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2013 lpl->lpl_nrset = 1; 2014 lpl->lpl_rset[0] = lpl_leaf; 2015 lpl->lpl_lgrp = lgrp; 2016 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2017 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2018 } 2019 2020 /* 2021 * Clear an unused lpl 2022 */ 2023 2024 void 2025 lpl_clear(lpl_t *lpl) 2026 { 2027 lgrp_id_t lid; 2028 2029 /* save lid for debugging purposes */ 2030 lid = lpl->lpl_lgrpid; 2031 bzero(lpl, sizeof (lpl_t)); 2032 lpl->lpl_lgrpid = lid; 2033 } 2034 2035 /* 2036 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2037 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2038 * make full use of all of the lgroup topology, but this checks to make sure 2039 * that for the parts that it does use, it has correctly understood the 2040 * relationships that exist. 
This function returns 2041 * 0 if the topology is correct, and a non-zero error code, for non-debug 2042 * kernels if incorrect. Asserts are spread throughout the code to aid in 2043 * debugging on a DEBUG kernel. 2044 */ 2045 int 2046 lpl_topo_verify(cpupart_t *cpupart) 2047 { 2048 lgrp_t *lgrp; 2049 lpl_t *lpl; 2050 klgrpset_t rset; 2051 klgrpset_t cset; 2052 cpu_t *cpu; 2053 cpu_t *cp_start; 2054 int i; 2055 int j; 2056 int sum; 2057 2058 /* topology can't be incorrect if it doesn't exist */ 2059 if (!lgrp_topo_initialized || !lgrp_initialized) 2060 return (LPL_TOPO_CORRECT); 2061 2062 ASSERT(cpupart != NULL); 2063 2064 for (i = 0; i <= lgrp_alloc_max; i++) { 2065 lgrp = lgrp_table[i]; 2066 lpl = NULL; 2067 /* make sure lpls are allocated */ 2068 ASSERT(cpupart->cp_lgrploads); 2069 if (!cpupart->cp_lgrploads) 2070 return (LPL_TOPO_PART_HAS_NO_LPL); 2071 2072 lpl = &cpupart->cp_lgrploads[i]; 2073 /* make sure our index is good */ 2074 ASSERT(i < cpupart->cp_nlgrploads); 2075 2076 /* if lgroup doesn't exist, make sure lpl is empty */ 2077 if (!LGRP_EXISTS(lgrp)) { 2078 ASSERT(lpl->lpl_ncpu == 0); 2079 if (lpl->lpl_ncpu > 0) { 2080 return (LPL_TOPO_CPUS_NOT_EMPTY); 2081 } else { 2082 continue; 2083 } 2084 } 2085 2086 /* verify that lgroup and lpl are identically numbered */ 2087 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2088 2089 /* if lgroup isn't in our partition, make sure lpl is empty */ 2090 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2091 cpupart->cp_lgrpset)) { 2092 ASSERT(lpl->lpl_ncpu == 0); 2093 if (lpl->lpl_ncpu > 0) { 2094 return (LPL_TOPO_CPUS_NOT_EMPTY); 2095 } 2096 /* 2097 * lpl is empty, and lgroup isn't in partition. verify 2098 * that lpl doesn't show up in anyone else's rsets (in 2099 * this partition, anyway) 2100 */ 2101 2102 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2103 lpl_t *i_lpl; /* lpl we're iterating over */ 2104 2105 i_lpl = &cpupart->cp_lgrploads[j]; 2106 2107 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2108 if (lpl_rset_contains(i_lpl, lpl)) { 2109 return (LPL_TOPO_LPL_ORPHANED); 2110 } 2111 } 2112 /* lgroup is empty, and everything is ok. continue */ 2113 continue; 2114 } 2115 2116 2117 /* lgroup is in this partition, now check it against lpl */ 2118 2119 /* do both have matching lgrps? */ 2120 ASSERT(lgrp == lpl->lpl_lgrp); 2121 if (lgrp != lpl->lpl_lgrp) { 2122 return (LPL_TOPO_LGRP_MISMATCH); 2123 } 2124 2125 /* do the parent lgroups exist and do they match? */ 2126 if (lgrp->lgrp_parent) { 2127 ASSERT(lpl->lpl_parent); 2128 ASSERT(lgrp->lgrp_parent->lgrp_id == 2129 lpl->lpl_parent->lpl_lgrpid); 2130 2131 if (!lpl->lpl_parent) { 2132 return (LPL_TOPO_MISSING_PARENT); 2133 } else if (lgrp->lgrp_parent->lgrp_id != 2134 lpl->lpl_parent->lpl_lgrpid) { 2135 return (LPL_TOPO_PARENT_MISMATCH); 2136 } 2137 } 2138 2139 /* only leaf lgroups keep a cpucnt, only check leaves */ 2140 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2141 2142 /* verify that lgrp is also a leaf */ 2143 ASSERT((lgrp->lgrp_childcnt == 0) && 2144 (klgrpset_ismember(lgrp->lgrp_leaves, 2145 lpl->lpl_lgrpid))); 2146 2147 if ((lgrp->lgrp_childcnt > 0) || 2148 (!klgrpset_ismember(lgrp->lgrp_leaves, 2149 lpl->lpl_lgrpid))) { 2150 return (LPL_TOPO_LGRP_NOT_LEAF); 2151 } 2152 2153 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2154 (lpl->lpl_ncpu > 0)); 2155 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2156 (lpl->lpl_ncpu <= 0)) { 2157 return (LPL_TOPO_BAD_CPUCNT); 2158 } 2159 2160 /* 2161 * Check that lpl_ncpu also matches the number of 2162 * cpus in the lpl's linked list. 
This only exists in 2163 * leaves, but they should always match. 2164 */ 2165 j = 0; 2166 cpu = cp_start = lpl->lpl_cpus; 2167 while (cpu != NULL) { 2168 j++; 2169 2170 /* check to make sure cpu's lpl is leaf lpl */ 2171 ASSERT(cpu->cpu_lpl == lpl); 2172 if (cpu->cpu_lpl != lpl) { 2173 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2174 } 2175 2176 /* check next cpu */ 2177 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2178 continue; 2179 } else { 2180 cpu = NULL; 2181 } 2182 } 2183 2184 ASSERT(j == lpl->lpl_ncpu); 2185 if (j != lpl->lpl_ncpu) { 2186 return (LPL_TOPO_LPL_BAD_NCPU); 2187 } 2188 2189 /* 2190 * Also, check that leaf lpl is contained in all 2191 * intermediate lpls that name the leaf as a descendant 2192 */ 2193 2194 for (j = 0; j <= lgrp_alloc_max; j++) { 2195 klgrpset_t intersect; 2196 lgrp_t *lgrp_cand; 2197 lpl_t *lpl_cand; 2198 2199 lgrp_cand = lgrp_table[j]; 2200 intersect = klgrpset_intersects( 2201 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2202 cpupart->cp_lgrpset); 2203 2204 if (!LGRP_EXISTS(lgrp_cand) || 2205 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2206 cpupart->cp_lgrpset) || 2207 (intersect == 0)) 2208 continue; 2209 2210 lpl_cand = 2211 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2212 2213 if (klgrpset_ismember(intersect, 2214 lgrp->lgrp_id)) { 2215 ASSERT(lpl_rset_contains(lpl_cand, 2216 lpl)); 2217 2218 if (!lpl_rset_contains(lpl_cand, lpl)) { 2219 return (LPL_TOPO_RSET_MSSNG_LF); 2220 } 2221 } 2222 } 2223 2224 } else { /* non-leaf specific checks */ 2225 2226 /* 2227 * Non-leaf lpls should have lpl_cpus == NULL 2228 * verify that this is so 2229 */ 2230 ASSERT(lpl->lpl_cpus == NULL); 2231 if (lpl->lpl_cpus != NULL) { 2232 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2233 } 2234 2235 /* 2236 * verify that the sum of the cpus in the leaf resources 2237 * is equal to the total ncpu in the intermediate 2238 */ 2239 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2240 sum += lpl->lpl_rset[j]->lpl_ncpu; 2241 } 2242 2243 ASSERT(sum == lpl->lpl_ncpu); 2244 if (sum != lpl->lpl_ncpu) { 2245 return (LPL_TOPO_LPL_BAD_NCPU); 2246 } 2247 } 2248 2249 /* 2250 * check on lpl_hint. Don't check root, since it has no parent. 2251 */ 2252 if (lpl->lpl_parent != NULL) { 2253 int hint; 2254 lpl_t *hint_lpl; 2255 2256 /* make sure hint is within limits of nrset */ 2257 hint = lpl->lpl_hint; 2258 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2259 if (lpl->lpl_parent->lpl_nrset < hint) { 2260 return (LPL_TOPO_BOGUS_HINT); 2261 } 2262 2263 /* make sure hint points to valid lpl */ 2264 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2265 ASSERT(hint_lpl->lpl_ncpu > 0); 2266 if (hint_lpl->lpl_ncpu <= 0) { 2267 return (LPL_TOPO_BOGUS_HINT); 2268 } 2269 } 2270 2271 /* 2272 * Check the rset of the lpl in question. Make sure that each 2273 * rset contains a subset of the resources in 2274 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2275 * sure that each rset doesn't include resources that are 2276 * outside of that set. (Which would be resources somehow not 2277 * accounted for). 
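 *
 * The check below does this by collecting the lgroup IDs of the rset
 * into a klgrpset and diffing that set against both
 * lgrp_set[LGRP_RSRC_CPU] and cp_lgrpset; both differences must come up
 * empty.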
2278 */ 2279 2280 klgrpset_clear(rset); 2281 for (j = 0; j < lpl->lpl_nrset; j++) { 2282 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2283 } 2284 klgrpset_copy(cset, rset); 2285 /* make sure lpl rset matches lgrp rset */ 2286 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2287 /* make sure rset is contained with in partition, too */ 2288 klgrpset_diff(cset, cpupart->cp_lgrpset); 2289 2290 ASSERT(klgrpset_isempty(rset) && 2291 klgrpset_isempty(cset)); 2292 if (!klgrpset_isempty(rset) || 2293 !klgrpset_isempty(cset)) { 2294 return (LPL_TOPO_RSET_MISMATCH); 2295 } 2296 2297 /* 2298 * check to make sure lpl_nrset matches the number of rsets 2299 * contained in the lpl 2300 */ 2301 2302 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2303 j++); 2304 2305 ASSERT(j == lpl->lpl_nrset); 2306 if (j != lpl->lpl_nrset) { 2307 return (LPL_TOPO_BAD_RSETCNT); 2308 } 2309 2310 } 2311 return (LPL_TOPO_CORRECT); 2312 } 2313 2314 /* 2315 * Flatten lpl topology to given number of levels. This is presently only 2316 * implemented for a flatten to 2 levels, which will prune out the intermediates 2317 * and home the leaf lpls to the root lpl. 2318 */ 2319 int 2320 lpl_topo_flatten(int levels) 2321 { 2322 int i; 2323 uint_t sum; 2324 lgrp_t *lgrp_cur; 2325 lpl_t *lpl_cur; 2326 lpl_t *lpl_root; 2327 cpupart_t *cp; 2328 2329 if (levels != 2) 2330 return (0); 2331 2332 /* called w/ cpus paused - grab no locks! */ 2333 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2334 !lgrp_initialized); 2335 2336 cp = cp_list_head; 2337 do { 2338 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2339 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2340 2341 for (i = 0; i <= lgrp_alloc_max; i++) { 2342 lgrp_cur = lgrp_table[i]; 2343 lpl_cur = &cp->cp_lgrploads[i]; 2344 2345 if ((lgrp_cur == lgrp_root) || 2346 (!LGRP_EXISTS(lgrp_cur) && 2347 (lpl_cur->lpl_ncpu == 0))) 2348 continue; 2349 2350 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2351 /* 2352 * this should be a deleted intermediate, so 2353 * clear it 2354 */ 2355 lpl_clear(lpl_cur); 2356 } else if ((lpl_cur->lpl_nrset == 1) && 2357 (lpl_cur->lpl_rset[0] == lpl_cur) && 2358 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2359 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2360 /* 2361 * this is a leaf whose parent was deleted, or 2362 * whose parent had their lgrp deleted. (And 2363 * whose parent will soon be deleted). Point 2364 * this guy back to the root lpl. 2365 */ 2366 lpl_cur->lpl_parent = lpl_root; 2367 lpl_rset_add(lpl_root, lpl_cur); 2368 } 2369 2370 } 2371 2372 /* 2373 * Now that we're done, make sure the count on the root lpl is 2374 * correct, and update the hints of the children for the sake of 2375 * thoroughness 2376 */ 2377 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2378 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2379 } 2380 lpl_root->lpl_ncpu = sum; 2381 lpl_child_update(lpl_root, cp); 2382 2383 cp = cp->cp_next; 2384 } while (cp != cp_list_head); 2385 2386 return (levels); 2387 } 2388 2389 /* 2390 * Insert a lpl into the resource hierarchy and create any additional lpls that 2391 * are necessary to represent the varying states of locality for the cpu 2392 * resoruces newly added to the partition. 2393 * 2394 * This routine is clever enough that it can correctly add resources from the 2395 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2396 * those for which the lpl is a leaf as opposed to simply a named equally local 2397 * resource). 
The one special case that needs additional processing is when a 2398 * new intermediate lpl is introduced. Since the main loop only traverses 2399 * looking to add the leaf resource where it does not yet exist, additional work 2400 * is necessary to add other leaf resources that may need to exist in the newly 2401 * created intermediate. This is performed by the second inner loop, and is 2402 * only done when the check for more than one overlapping resource succeeds. 2403 */ 2404 2405 void 2406 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2407 { 2408 int i; 2409 int j; 2410 int hint; 2411 int rset_num_intersect; 2412 lgrp_t *lgrp_cur; 2413 lpl_t *lpl_cur; 2414 lpl_t *lpl_parent; 2415 lgrp_id_t parent_id; 2416 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2417 2418 for (i = 0; i <= lgrp_alloc_max; i++) { 2419 lgrp_cur = lgrp_table[i]; 2420 2421 /* 2422 * Don't insert if the lgrp isn't there, if the leaf isn't 2423 * contained within the current lgrp, or if the current lgrp has 2424 * no leaves in this partition 2425 */ 2426 2427 if (!LGRP_EXISTS(lgrp_cur) || 2428 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2429 lpl_leaf->lpl_lgrpid) || 2430 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2431 cpupart->cp_lgrpset)) 2432 continue; 2433 2434 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2435 if (lgrp_cur->lgrp_parent != NULL) { 2436 /* if lgrp has a parent, assign it properly */ 2437 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2438 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2439 } else { 2440 /* if not, make sure parent ptr gets set to null */ 2441 lpl_parent = NULL; 2442 } 2443 2444 if (lpl_cur == lpl_leaf) { 2445 /* 2446 * Almost all leaf state was initialized elsewhere. The 2447 * only thing left to do is to set the parent. 2448 */ 2449 lpl_cur->lpl_parent = lpl_parent; 2450 continue; 2451 } 2452 2453 /* 2454 * Initialize intermediate lpl 2455 * Save this lpl's hint though. Since we're changing this 2456 * lpl's resources, we need to update the hint in this lpl's 2457 * children, but the hint in this lpl is unaffected and 2458 * should be preserved. 2459 */ 2460 hint = lpl_cur->lpl_hint; 2461 2462 lpl_clear(lpl_cur); 2463 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2464 2465 lpl_cur->lpl_hint = hint; 2466 lpl_cur->lpl_parent = lpl_parent; 2467 2468 /* does new lpl need to be populated with other resources? */ 2469 rset_intersect = 2470 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2471 cpupart->cp_lgrpset); 2472 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2473 2474 if (rset_num_intersect > 1) { 2475 /* 2476 * If so, figure out what lpls have resources that 2477 * intersect this one, and add them. 2478 */ 2479 for (j = 0; j <= lgrp_alloc_max; j++) { 2480 lgrp_t *lgrp_cand; /* candidate lgrp */ 2481 lpl_t *lpl_cand; /* candidate lpl */ 2482 2483 lgrp_cand = lgrp_table[j]; 2484 if (!LGRP_EXISTS(lgrp_cand) || 2485 !klgrpset_ismember(rset_intersect, 2486 lgrp_cand->lgrp_id)) 2487 continue; 2488 lpl_cand = 2489 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2490 lpl_rset_add(lpl_cur, lpl_cand); 2491 } 2492 } 2493 /* 2494 * This lpl's rset has changed. Update the hint in it's 2495 * children. 2496 */ 2497 lpl_child_update(lpl_cur, cpupart); 2498 } 2499 } 2500 2501 /* 2502 * remove a lpl from the hierarchy of resources, clearing its state when 2503 * finished. If the lpls at the intermediate levels of the hierarchy have no 2504 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2505 * delete them as well. 
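 *
 * lpl_leaf_remove() is the complement of lpl_leaf_insert() above; e.g.
 * lgrp_part_del_cpu() below calls it once the last CPU of a leaf lgroup
 * leaves the partition.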
2506 */ 2507 2508 void 2509 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2510 { 2511 int i; 2512 lgrp_t *lgrp_cur; 2513 lpl_t *lpl_cur; 2514 klgrpset_t leaf_intersect; /* intersection of leaves */ 2515 2516 for (i = 0; i <= lgrp_alloc_max; i++) { 2517 lgrp_cur = lgrp_table[i]; 2518 2519 /* 2520 * Don't attempt to remove from lgrps that aren't there, that 2521 * don't contain our leaf, or from the leaf itself. (We do that 2522 * later) 2523 */ 2524 2525 if (!LGRP_EXISTS(lgrp_cur)) 2526 continue; 2527 2528 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2529 2530 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2531 lpl_leaf->lpl_lgrpid) || 2532 (lpl_cur == lpl_leaf)) { 2533 continue; 2534 } 2535 2536 /* 2537 * This is a slightly sleazy simplification in that we have 2538 * already marked the cp_lgrpset as no longer containing the 2539 * leaf we've deleted. Any lpls that pass the above checks 2540 * based upon lgrp membership but not necessarily cpu-part 2541 * membership also get cleared by the checks below. Currently 2542 * this is harmless, as the lpls should be empty anyway. 2543 * 2544 * In particular, we want to preserve lpls that have additional 2545 * leaf resources, even though we don't yet have a processor 2546 * architecture that represents resources this way. 2547 */ 2548 2549 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2550 cpupart->cp_lgrpset); 2551 2552 lpl_rset_del(lpl_cur, lpl_leaf); 2553 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2554 lpl_clear(lpl_cur); 2555 } else { 2556 /* 2557 * Update this lpl's children 2558 */ 2559 lpl_child_update(lpl_cur, cpupart); 2560 } 2561 } 2562 lpl_clear(lpl_leaf); 2563 } 2564 2565 /* 2566 * add a cpu to a partition in terms of lgrp load avg bookkeeping 2567 * 2568 * The lpl (cpu partition load average information) is now arranged in a 2569 * hierarchical fashion whereby resources that are closest, ie. most local, to 2570 * the cpu in question are considered to be leaves in a tree of resources. 2571 * There are two general cases for cpu addition: 2572 * 2573 * 1. A lpl structure that contains resources already in the hierarchy tree. 2574 * In this case, all of the associated lpl relationships have been defined, and 2575 * all that is necessary is that we link the new cpu into the per-lpl list of 2576 * cpus, and increment the ncpu count of all places where this cpu resource will 2577 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2578 * pushing is accomplished by this routine. 2579 * 2580 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2581 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2582 * construct the hierarchy of state necessary to name its more distant 2583 * resources, if they should exist. The leaf structure is initialized by this 2584 * routine, as is the cpu-partition state for the lgrp membership. This routine 2585 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2586 * and builds all of the "ancestral" state necessary to identify resources at 2587 * differing levels of locality.
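 *
 * Illustrative sketch of case 2 (first CPU of a leaf lgroup joining the
 * partition), taken from the code below:
 *
 *	lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
 *	klgrpset_add(cpupart->cp_lgrpset, lgrpid);
 *	lpl_leaf_insert(lpl_leaf, cpupart);
 *
 * Case 1, by contrast, reduces to lpl_cpu_adjcnt(LPL_INCREMENT, cp) plus
 * splicing cp into the lpl_leaf->lpl_cpus list.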
2588 */ 2589 void 2590 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2591 { 2592 cpupart_t *cpupart; 2593 lgrp_t *lgrp_leaf; 2594 lpl_t *lpl_leaf; 2595 2596 /* called sometimes w/ cpus paused - grab no locks */ 2597 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2598 2599 cpupart = cp->cpu_part; 2600 lgrp_leaf = lgrp_table[lgrpid]; 2601 2602 /* don't add non-existent lgrp */ 2603 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2604 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2605 cp->cpu_lpl = lpl_leaf; 2606 2607 /* only leaf lpls contain cpus */ 2608 2609 if (lpl_leaf->lpl_ncpu++ == 0) { 2610 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2611 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2612 lpl_leaf_insert(lpl_leaf, cpupart); 2613 } else { 2614 /* 2615 * the lpl should already exist in the parent, so just update 2616 * the count of available CPUs 2617 */ 2618 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2619 } 2620 2621 /* link cpu into list of cpus in lpl */ 2622 2623 if (lpl_leaf->lpl_cpus) { 2624 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2625 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2626 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2627 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2628 } else { 2629 /* 2630 * We increment ncpu immediately after we create a new leaf 2631 * lpl, so assert that ncpu == 1 for the case where we don't 2632 * have any cpu pointers yet. 2633 */ 2634 ASSERT(lpl_leaf->lpl_ncpu == 1); 2635 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2636 } 2637 2638 } 2639 2640 2641 /* 2642 * remove a cpu from a partition in terms of lgrp load avg bookkeeping 2643 * 2644 * The lpl (cpu partition load average information) is now arranged in a 2645 * hierarchical fashion whereby resources that are closest, ie. most local, to 2646 * the cpu in question are considered to be leaves in a tree of resources. 2647 * There are two removal cases in question: 2648 * 2649 * 1. Removal of the resource in the leaf leaves other resources remaining in 2650 * that leaf. (Another cpu still exists at this level of locality). In this 2651 * case, the count of available cpus is decremented in all associated lpls by 2652 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 2653 * from the per-lpl list of cpus. 2654 * 2655 * 2. Removal of the resource results in the lpl containing no resources. (It's 2656 * empty) In this case, all of what has occurred for the first step must take 2657 * place; however, additionally we must remove the lpl structure itself, prune 2658 * out any stranded lpls that do not directly name a leaf resource, and mark the 2659 * cpu partition in question as no longer containing resources from the lgrp of 2660 * the lpl that has been deleted. Cpu-partition changes are handled by this 2661 * method, but the lpl_leaf_remove function deals with the details of pruning 2662 * out the empty lpl and any of its orphaned direct ancestors.
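 *
 * Illustrative sketch of case 2 (last CPU of the leaf going away), taken
 * from the code below:
 *
 *	klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
 *	lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
 *	lpl_leaf_remove(leaf_lpl, cp->cpu_part);
 *
 * Case 1 just unlinks cp from the lpl_cpus list and calls
 * lpl_cpu_adjcnt(LPL_DECREMENT, cp).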
2663 */ 2664 void 2665 lgrp_part_del_cpu(cpu_t *cp) 2666 { 2667 lpl_t *lpl; 2668 lpl_t *leaf_lpl; 2669 lgrp_t *lgrp_leaf; 2670 2671 /* called sometimes w/ cpus paused - grab no locks */ 2672 2673 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2674 2675 lpl = leaf_lpl = cp->cpu_lpl; 2676 lgrp_leaf = leaf_lpl->lpl_lgrp; 2677 2678 /* don't delete a leaf that isn't there */ 2679 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2680 2681 /* no double-deletes */ 2682 ASSERT(lpl->lpl_ncpu); 2683 if (--lpl->lpl_ncpu == 0) { 2684 /* 2685 * This was the last cpu in this lgroup for this partition, 2686 * clear its bit in the partition's lgroup bitmask 2687 */ 2688 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2689 2690 /* eliminate remaning lpl link pointers in cpu, lpl */ 2691 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2692 2693 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2694 } else { 2695 2696 /* unlink cpu from lists of cpus in lpl */ 2697 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2698 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2699 if (lpl->lpl_cpus == cp) { 2700 lpl->lpl_cpus = cp->cpu_next_lpl; 2701 } 2702 2703 /* 2704 * Update the cpu count in the lpls associated with parent 2705 * lgroups. 2706 */ 2707 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2708 2709 } 2710 /* clear cpu's lpl ptr when we're all done */ 2711 cp->cpu_lpl = NULL; 2712 } 2713 2714 /* 2715 * Recompute load average for the specified partition/lgrp fragment. 2716 * 2717 * We rely on the fact that this routine is called from the clock thread 2718 * at a point before the clock thread can block (i.e. before its first 2719 * lock request). Since the clock thread can not be preempted (since it 2720 * runs at highest priority), we know that cpu partitions can not change 2721 * (since doing so would require either the repartition requester or the 2722 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2723 * without grabbing cpu_lock. 2724 */ 2725 void 2726 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2727 { 2728 uint_t ncpu; 2729 int64_t old, new, f; 2730 2731 /* 2732 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2733 */ 2734 static short expval[] = { 2735 0, 3196, 1618, 1083, 2736 814, 652, 543, 466, 2737 408, 363, 326, 297, 2738 272, 251, 233, 218, 2739 204, 192, 181, 172, 2740 163, 155, 148, 142, 2741 136, 130, 125, 121, 2742 116, 112, 109, 105 2743 }; 2744 2745 /* ASSERT (called from clock level) */ 2746 2747 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2748 ((ncpu = lpl->lpl_ncpu) == 0)) { 2749 return; 2750 } 2751 2752 for (;;) { 2753 2754 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2755 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2756 else 2757 f = expval[ncpu]; 2758 2759 /* 2760 * Modify the load average atomically to avoid losing 2761 * anticipatory load updates (see lgrp_move_thread()). 2762 */ 2763 if (ageflag) { 2764 /* 2765 * We're supposed to both update and age the load. 2766 * This happens 10 times/sec. per cpu. We do a 2767 * little hoop-jumping to avoid integer overflow. 
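 *
 * Roughly speaking (ignoring the fixed-point scaling), the update
 * below is an exponentially weighted average: the old load decays by
 * a factor derived from expval[ncpu] while the instantaneous sample
 * passed in (nrcpus) is mixed in. q and r are taken from the upper
 * and lower 16 bits of the old load so that the 64-bit intermediate
 * products cannot overflow.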
2768 */ 2769 int64_t q, r; 2770 2771 do { 2772 old = new = lpl->lpl_loadavg; 2773 q = (old >> 16) << 7; 2774 r = (old & 0xffff) << 7; 2775 new += ((long long)(nrcpus - q) * f - 2776 ((r * f) >> 16)) >> 7; 2777 2778 /* 2779 * Check for overflow 2780 */ 2781 if (new > LGRP_LOADAVG_MAX) 2782 new = LGRP_LOADAVG_MAX; 2783 else if (new < 0) 2784 new = 0; 2785 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2786 new) != old); 2787 } else { 2788 /* 2789 * We're supposed to update the load, but not age it. 2790 * This option is used to update the load (which either 2791 * has already been aged in this 1/10 sec. interval or 2792 * soon will be) to account for a remotely executing 2793 * thread. 2794 */ 2795 do { 2796 old = new = lpl->lpl_loadavg; 2797 new += f; 2798 /* 2799 * Check for overflow 2800 * Underflow not possible here 2801 */ 2802 if (new < old) 2803 new = LGRP_LOADAVG_MAX; 2804 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2805 new) != old); 2806 } 2807 2808 /* 2809 * Do the same for this lpl's parent 2810 */ 2811 if ((lpl = lpl->lpl_parent) == NULL) 2812 break; 2813 ncpu = lpl->lpl_ncpu; 2814 } 2815 } 2816 2817 /* 2818 * Initialize lpl topology in the target based on topology currently present in 2819 * lpl_bootstrap. 2820 * 2821 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2822 * initialize cp_default list of lpls. Up to this point all topology operations 2823 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2824 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2825 * `target' points to the list of lpls in cp_default and `size' is the size of 2826 * this list. 2827 * 2828 * This function walks the lpl topology in lpl_bootstrap and does for things: 2829 * 2830 * 1) Copies all fields from lpl_bootstrap to the target. 2831 * 2832 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2833 * 2834 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2835 * instead of lpl_bootstrap. 2836 * 2837 * 4) Updates pointers in the resource list of the target to point to the lpls 2838 * in the target list instead of lpl_bootstrap. 2839 * 2840 * After lpl_topo_bootstrap() completes, target contains the same information 2841 * that would be present there if it were used during boot instead of 2842 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2843 * and it is bzeroed. 2844 */ 2845 void 2846 lpl_topo_bootstrap(lpl_t *target, int size) 2847 { 2848 lpl_t *lpl = lpl_bootstrap; 2849 lpl_t *target_lpl = target; 2850 int howmany; 2851 int id; 2852 int i; 2853 2854 /* 2855 * The only target that should be passed here is cp_default lpl list. 2856 */ 2857 ASSERT(target == cp_default.cp_lgrploads); 2858 ASSERT(size == cp_default.cp_nlgrploads); 2859 ASSERT(!lgrp_topo_initialized); 2860 ASSERT(ncpus == 1); 2861 2862 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2863 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2864 /* 2865 * Copy all fields from lpl. 2866 */ 2867 2868 *target_lpl = *lpl; 2869 2870 /* 2871 * Substitute CPU0 lpl pointer with one relative to target. 2872 */ 2873 if (lpl->lpl_cpus == CPU) { 2874 ASSERT(CPU->cpu_lpl == lpl); 2875 CPU->cpu_lpl = target_lpl; 2876 } 2877 2878 /* 2879 * Substitute parent information with parent relative to target. 
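 * The rebasing below preserves the parent's byte offset, so whatever
 * index the parent had within lpl_bootstrap_list it gets the same
 * index within target (e.g. a parent at lpl_bootstrap[0] becomes
 * &target[0]).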
2880 */ 2881 if (lpl->lpl_parent != NULL) 2882 target_lpl->lpl_parent = (lpl_t *) 2883 (((uintptr_t)lpl->lpl_parent - 2884 (uintptr_t)lpl_bootstrap) + 2885 (uintptr_t)target); 2886 2887 /* 2888 * Walk over resource set substituting pointers relative to 2889 * lpl_bootstrap to pointers relative to target. 2890 */ 2891 ASSERT(lpl->lpl_nrset <= 1); 2892 2893 for (id = 0; id < lpl->lpl_nrset; id++) { 2894 if (lpl->lpl_rset[id] != NULL) { 2895 target_lpl->lpl_rset[id] = 2896 (lpl_t *) 2897 (((uintptr_t)lpl->lpl_rset[id] - 2898 (uintptr_t)lpl_bootstrap) + 2899 (uintptr_t)target); 2900 } 2901 } 2902 } 2903 2904 /* 2905 * Topology information in lpl_bootstrap is no longer needed. 2906 */ 2907 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2908 } 2909 2910 /* 2911 * If the lowest load among the lgroups a process' threads are currently 2912 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2913 * expanding the process to a new lgroup. 2914 */ 2915 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2916 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2917 2918 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2919 ((lgrp_expand_proc_thresh) / (ncpu)) 2920 2921 /* 2922 * A process will be expanded to a new lgroup only if the difference between 2923 * the lowest load on the lgroups the process' thread's are currently spread 2924 * across and the lowest load on the other lgroups in the process' partition 2925 * is greater than lgrp_expand_proc_diff. 2926 */ 2927 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2928 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2929 2930 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2931 ((lgrp_expand_proc_diff) / (ncpu)) 2932 2933 /* 2934 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2935 * be present due to impreciseness of the load average decay algorithm. 2936 * 2937 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2938 * tolerance is scaled by the number of cpus in the lgroup just like 2939 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2940 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2941 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2942 */ 2943 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2944 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2945 ((lgrp_loadavg_tolerance) / ncpu) 2946 2947 /* 2948 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2949 * average is above this threshold 2950 */ 2951 uint32_t lgrp_load_thresh = UINT32_MAX; 2952 2953 /* 2954 * lgrp_choose() will try to skip any lgroups with less memory 2955 * than this free when choosing a home lgroup 2956 */ 2957 pgcnt_t lgrp_mem_free_thresh = 0; 2958 2959 /* 2960 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2961 * one based on one of the following policies: 2962 * - Random selection 2963 * - Pseudo round robin placement 2964 * - Longest time since a thread was last placed 2965 */ 2966 #define LGRP_CHOOSE_RANDOM 1 2967 #define LGRP_CHOOSE_RR 2 2968 #define LGRP_CHOOSE_TIME 3 2969 2970 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2971 2972 /* 2973 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 2974 * be bound to a CPU or processor set. 2975 * 2976 * Arguments: 2977 * t The thread 2978 * cpupart The partition the thread belongs to. 
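 * Returns:
 *	The leaf lpl within cpupart chosen as a suitable home for the
 *	thread.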
2979 * 2980 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 2981 * disabled, or thread_lock held (at splhigh) to protect against the CPU 2982 * partitions changing out from under us and assumes that given thread is 2983 * protected. Also, called sometimes w/ cpus paused or kernel preemption 2984 * disabled, so don't grab any locks because we should never block under 2985 * those conditions. 2986 */ 2987 lpl_t * 2988 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 2989 { 2990 lgrp_load_t bestload, bestrload; 2991 int lgrpid_offset, lgrp_count; 2992 lgrp_id_t lgrpid, lgrpid_start; 2993 lpl_t *lpl, *bestlpl, *bestrlpl; 2994 klgrpset_t lgrpset; 2995 proc_t *p; 2996 2997 ASSERT(t != NULL); 2998 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2999 THREAD_LOCK_HELD(t)); 3000 ASSERT(cpupart != NULL); 3001 3002 p = t->t_procp; 3003 3004 /* A process should always be in an active partition */ 3005 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3006 3007 bestlpl = bestrlpl = NULL; 3008 bestload = bestrload = LGRP_LOADAVG_MAX; 3009 lgrpset = cpupart->cp_lgrpset; 3010 3011 switch (lgrp_choose_policy) { 3012 case LGRP_CHOOSE_RR: 3013 lgrpid = cpupart->cp_lgrp_hint; 3014 do { 3015 if (++lgrpid > lgrp_alloc_max) 3016 lgrpid = 0; 3017 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3018 3019 break; 3020 default: 3021 case LGRP_CHOOSE_TIME: 3022 case LGRP_CHOOSE_RANDOM: 3023 klgrpset_nlgrps(lgrpset, lgrp_count); 3024 lgrpid_offset = 3025 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3026 for (lgrpid = 0; ; lgrpid++) { 3027 if (klgrpset_ismember(lgrpset, lgrpid)) { 3028 if (--lgrpid_offset == 0) 3029 break; 3030 } 3031 } 3032 break; 3033 } 3034 3035 lgrpid_start = lgrpid; 3036 3037 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3038 lgrp_id_t, cpupart->cp_lgrp_hint); 3039 3040 /* 3041 * Use lgroup affinities (if any) to choose best lgroup 3042 * 3043 * NOTE: Assumes that thread is protected from going away and its 3044 * lgroup affinities won't change (ie. p_lock, or 3045 * thread_lock() being held and/or CPUs paused) 3046 */ 3047 if (t->t_lgrp_affinity) { 3048 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 3049 if (lpl != NULL) 3050 return (lpl); 3051 } 3052 3053 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3054 3055 do { 3056 pgcnt_t npgs; 3057 3058 /* 3059 * Skip any lgroups outside of thread's pset 3060 */ 3061 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3062 if (++lgrpid > lgrp_alloc_max) 3063 lgrpid = 0; /* wrap the search */ 3064 continue; 3065 } 3066 3067 /* 3068 * Skip any non-leaf lgroups 3069 */ 3070 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3071 continue; 3072 3073 /* 3074 * Skip any lgroups without enough free memory 3075 * (when threshold set to nonzero positive value) 3076 */ 3077 if (lgrp_mem_free_thresh > 0) { 3078 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3079 if (npgs < lgrp_mem_free_thresh) { 3080 if (++lgrpid > lgrp_alloc_max) 3081 lgrpid = 0; /* wrap the search */ 3082 continue; 3083 } 3084 } 3085 3086 lpl = &cpupart->cp_lgrploads[lgrpid]; 3087 if (klgrpset_isempty(p->p_lgrpset) || 3088 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3089 /* 3090 * Either this is a new process or the process already 3091 * has threads on this lgrp, so this is a preferred 3092 * lgroup for the thread. 
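 *
 * (Candidates are compared with lpl_pick(); a non-preferred lgroup
 * tracked in bestrlpl is only adopted later if the load difference
 * clears the LGRP_EXPAND_PROC_THRESH/LGRP_EXPAND_PROC_DIFF tests.)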
3093 */ 3094 if (bestlpl == NULL || 3095 lpl_pick(lpl, bestlpl)) { 3096 bestload = lpl->lpl_loadavg; 3097 bestlpl = lpl; 3098 } 3099 } else { 3100 /* 3101 * The process doesn't have any threads on this lgrp, 3102 * but we're willing to consider this lgrp if the load 3103 * difference is big enough to justify splitting up 3104 * the process' threads. 3105 */ 3106 if (bestrlpl == NULL || 3107 lpl_pick(lpl, bestrlpl)) { 3108 bestrload = lpl->lpl_loadavg; 3109 bestrlpl = lpl; 3110 } 3111 } 3112 if (++lgrpid > lgrp_alloc_max) 3113 lgrpid = 0; /* wrap the search */ 3114 } while (lgrpid != lgrpid_start); 3115 3116 /* 3117 * Return root lgroup if threshold isn't set to maximum value and 3118 * lowest lgroup load average more than a certain threshold 3119 */ 3120 if (lgrp_load_thresh != UINT32_MAX && 3121 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3122 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3123 3124 /* 3125 * If all the lgroups over which the thread's process is spread are 3126 * heavily loaded, or otherwise undesirable, we'll consider placing 3127 * the thread on one of the other leaf lgroups in the thread's 3128 * partition. 3129 */ 3130 if ((bestlpl == NULL) || 3131 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3132 (bestrload < bestload) && /* paranoid about wraparound */ 3133 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3134 bestload))) { 3135 bestlpl = bestrlpl; 3136 } 3137 3138 if (bestlpl == NULL) { 3139 /* 3140 * No lgroup looked particularly good, but we still 3141 * have to pick something. Go with the randomly selected 3142 * legal lgroup we started with above. 3143 */ 3144 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3145 } 3146 3147 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3148 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3149 3150 ASSERT(bestlpl->lpl_ncpu > 0); 3151 return (bestlpl); 3152 } 3153 3154 /* 3155 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3156 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3157 */ 3158 static int 3159 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3160 { 3161 lgrp_load_t l1, l2; 3162 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3163 3164 l1 = lpl1->lpl_loadavg; 3165 l2 = lpl2->lpl_loadavg; 3166 3167 if ((l1 + tolerance < l2) && (l1 < l2)) { 3168 /* lpl1 is significantly less loaded than lpl2 */ 3169 return (1); 3170 } 3171 3172 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3173 l1 + tolerance >= l2 && l1 < l2 && 3174 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3175 /* 3176 * lpl1's load is within the tolerance of lpl2. We're 3177 * willing to consider it be to better however if 3178 * it has been longer since we last homed a thread there 3179 */ 3180 return (1); 3181 } 3182 3183 return (0); 3184 } 3185 3186 /* 3187 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a 3188 * process that uses text replication changed home lgrp. This info is used by 3189 * segvn asyncronous thread to detect if it needs to recheck what lgrps 3190 * should be used for text replication. 3191 */ 3192 static uint64_t lgrp_trthr_moves = 0; 3193 3194 uint64_t 3195 lgrp_get_trthr_migrations(void) 3196 { 3197 return (lgrp_trthr_moves); 3198 } 3199 3200 void 3201 lgrp_update_trthr_migrations(uint64_t incr) 3202 { 3203 atomic_add_64(&lgrp_trthr_moves, incr); 3204 } 3205 3206 /* 3207 * An LWP is expected to be assigned to an lgroup for at least this long 3208 * for its anticipatory load to be justified. 
NOTE that this value should 3209 * not be set extremely huge (say, larger than 100 years), to avoid problems 3210 * with overflow in the calculation that uses it. 3211 */ 3212 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3213 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3214 3215 /* 3216 * Routine to change a thread's lgroup affiliation. This routine updates 3217 * the thread's kthread_t struct and its process' proc_t struct to note the 3218 * thread's new lgroup affiliation, and its lgroup affinities. 3219 * 3220 * Note that this is the only routine that modifies a thread's t_lpl field, 3221 * and that adds in or removes anticipatory load. 3222 * 3223 * If the thread is exiting, newlpl is NULL. 3224 * 3225 * Locking: 3226 * The following lock must be held on entry: 3227 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3228 * doesn't get removed from t's partition 3229 * 3230 * This routine is not allowed to grab any locks, since it may be called 3231 * with cpus paused (such as from cpu_offline). 3232 */ 3233 void 3234 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3235 { 3236 proc_t *p; 3237 lpl_t *lpl, *oldlpl; 3238 lgrp_id_t oldid; 3239 kthread_t *tp; 3240 uint_t ncpu; 3241 lgrp_load_t old, new; 3242 3243 ASSERT(t); 3244 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3245 THREAD_LOCK_HELD(t)); 3246 3247 /* 3248 * If not changing lpls, just return 3249 */ 3250 if ((oldlpl = t->t_lpl) == newlpl) 3251 return; 3252 3253 /* 3254 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3255 * associated with process 0 rather than with its original process). 3256 */ 3257 if (t->t_proc_flag & TP_LWPEXIT) { 3258 if (newlpl != NULL) { 3259 t->t_lpl = newlpl; 3260 } 3261 return; 3262 } 3263 3264 p = ttoproc(t); 3265 3266 /* 3267 * If the thread had a previous lgroup, update its process' p_lgrpset 3268 * to account for it being moved from its old lgroup. 3269 */ 3270 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3271 (p->p_tlist != NULL)) { 3272 oldid = oldlpl->lpl_lgrpid; 3273 3274 if (newlpl != NULL) 3275 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3276 3277 if ((do_lgrpset_delete) && 3278 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3279 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3280 /* 3281 * Check if a thread other than the thread 3282 * that's moving is assigned to the same 3283 * lgroup as the thread that's moving. Note 3284 * that we have to compare lgroup IDs, rather 3285 * than simply comparing t_lpl's, since the 3286 * threads may belong to different partitions 3287 * but be assigned to the same lgroup. 3288 */ 3289 ASSERT(tp->t_lpl != NULL); 3290 3291 if ((tp != t) && 3292 (tp->t_lpl->lpl_lgrpid == oldid)) { 3293 /* 3294 * Another thread is assigned to the 3295 * same lgroup as the thread that's 3296 * moving, p_lgrpset doesn't change. 3297 */ 3298 break; 3299 } else if (tp == p->p_tlist) { 3300 /* 3301 * No other thread is assigned to the 3302 * same lgroup as the exiting thread, 3303 * clear the lgroup's bit in p_lgrpset. 3304 */ 3305 klgrpset_del(p->p_lgrpset, oldid); 3306 break; 3307 } 3308 } 3309 } 3310 3311 /* 3312 * If this thread was assigned to its old lgroup for such a 3313 * short amount of time that the anticipatory load that was 3314 * added on its behalf has aged very little, remove that 3315 * anticipatory load. 
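 *
 * The loop below does this by walking from the old leaf lpl up through
 * its ancestors, atomically (via cas32()) subtracting
 * LGRP_LOADAVG_MAX_EFFECT(ncpu) at each level and clamping at zero if
 * aging has already consumed part of that load.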
3316 */ 3317 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3318 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3319 lpl = oldlpl; 3320 for (;;) { 3321 do { 3322 old = new = lpl->lpl_loadavg; 3323 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3324 if (new > old) { 3325 /* 3326 * this can happen if the load 3327 * average was aged since we 3328 * added in the anticipatory 3329 * load 3330 */ 3331 new = 0; 3332 } 3333 } while (cas32( 3334 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3335 new) != old); 3336 3337 lpl = lpl->lpl_parent; 3338 if (lpl == NULL) 3339 break; 3340 3341 ncpu = lpl->lpl_ncpu; 3342 ASSERT(ncpu > 0); 3343 } 3344 } 3345 } 3346 /* 3347 * If the thread has a new lgroup (i.e. it's not exiting), update its 3348 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3349 * to its new lgroup to account for its move to its new lgroup. 3350 */ 3351 if (newlpl != NULL) { 3352 /* 3353 * This thread is moving to a new lgroup 3354 */ 3355 t->t_lpl = newlpl; 3356 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) { 3357 p->p_t1_lgrpid = newlpl->lpl_lgrpid; 3358 membar_producer(); 3359 if (p->p_tr_lgrpid != LGRP_NONE && 3360 p->p_tr_lgrpid != p->p_t1_lgrpid) { 3361 lgrp_update_trthr_migrations(1); 3362 } 3363 } 3364 3365 /* 3366 * Reflect move in load average of new lgroup 3367 * unless it is root lgroup 3368 */ 3369 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3370 return; 3371 3372 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3373 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3374 } 3375 3376 /* 3377 * It'll take some time for the load on the new lgroup 3378 * to reflect this thread's placement on it. We'd 3379 * like not, however, to have all threads between now 3380 * and then also piling on to this lgroup. To avoid 3381 * this pileup, we anticipate the load this thread 3382 * will generate on its new lgroup. The goal is to 3383 * make the lgroup's load appear as though the thread 3384 * had been there all along. We're very conservative 3385 * in calculating this anticipatory load, we assume 3386 * the worst case case (100% CPU-bound thread). This 3387 * may be modified in the future to be more accurate. 
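 *
 * As with the removal above, the load below is applied with cas32() at
 * every level from the new leaf lpl up through its ancestors, adding
 * LGRP_LOADAVG_MAX_EFFECT(ncpu) per level and saturating at UINT32_MAX
 * on overflow; t_anttime records when this anticipatory load was added.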
3388 */ 3389 lpl = newlpl; 3390 for (;;) { 3391 ncpu = lpl->lpl_ncpu; 3392 ASSERT(ncpu > 0); 3393 do { 3394 old = new = lpl->lpl_loadavg; 3395 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3396 /* 3397 * Check for overflow 3398 * Underflow not possible here 3399 */ 3400 if (new < old) 3401 new = UINT32_MAX; 3402 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3403 new) != old); 3404 3405 lpl = lpl->lpl_parent; 3406 if (lpl == NULL) 3407 break; 3408 } 3409 t->t_anttime = gethrtime(); 3410 } 3411 } 3412 3413 /* 3414 * Return lgroup memory allocation policy given advice from madvise(3C) 3415 */ 3416 lgrp_mem_policy_t 3417 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3418 { 3419 switch (advice) { 3420 case MADV_ACCESS_LWP: 3421 return (LGRP_MEM_POLICY_NEXT); 3422 case MADV_ACCESS_MANY: 3423 return (LGRP_MEM_POLICY_RANDOM); 3424 default: 3425 return (lgrp_mem_policy_default(size, type)); 3426 } 3427 } 3428 3429 /* 3430 * Figure out default policy 3431 */ 3432 lgrp_mem_policy_t 3433 lgrp_mem_policy_default(size_t size, int type) 3434 { 3435 cpupart_t *cp; 3436 lgrp_mem_policy_t policy; 3437 size_t pset_mem_size; 3438 3439 /* 3440 * Randomly allocate memory across lgroups for shared memory 3441 * beyond a certain threshold 3442 */ 3443 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3444 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3445 /* 3446 * Get total memory size of current thread's pset 3447 */ 3448 kpreempt_disable(); 3449 cp = curthread->t_cpupart; 3450 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3451 kpreempt_enable(); 3452 3453 /* 3454 * Choose policy to randomly allocate memory across 3455 * lgroups in pset if it will fit and is not default 3456 * partition. Otherwise, allocate memory randomly 3457 * across machine. 3458 */ 3459 if (lgrp_mem_pset_aware && size < pset_mem_size) 3460 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3461 else 3462 policy = LGRP_MEM_POLICY_RANDOM; 3463 } else 3464 /* 3465 * Apply default policy for private memory and 3466 * shared memory under the respective random 3467 * threshold. 3468 */ 3469 policy = lgrp_mem_default_policy; 3470 3471 return (policy); 3472 } 3473 3474 /* 3475 * Get memory allocation policy for this segment 3476 */ 3477 lgrp_mem_policy_info_t * 3478 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3479 { 3480 lgrp_mem_policy_info_t *policy_info; 3481 extern struct seg_ops segspt_ops; 3482 extern struct seg_ops segspt_shmops; 3483 3484 /* 3485 * This is for binary compatibility to protect against third party 3486 * segment drivers which haven't recompiled to allow for 3487 * SEGOP_GETPOLICY() 3488 */ 3489 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3490 seg->s_ops != &segspt_shmops) 3491 return (NULL); 3492 3493 policy_info = NULL; 3494 if (seg->s_ops->getpolicy != NULL) 3495 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3496 3497 return (policy_info); 3498 } 3499 3500 /* 3501 * Set policy for allocating private memory given desired policy, policy info, 3502 * size in bytes of memory that policy is being applied. 3503 * Return 0 if policy wasn't set already and 1 if policy was set already 3504 */ 3505 int 3506 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3507 lgrp_mem_policy_info_t *policy_info, size_t size) 3508 { 3509 3510 ASSERT(policy_info != NULL); 3511 3512 if (policy == LGRP_MEM_POLICY_DEFAULT) 3513 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3514 3515 /* 3516 * Policy set already? 
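 * If so, leave the existing setting alone and return 1 (see the
 * function comment above).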
3517 */ 3518 if (policy == policy_info->mem_policy) 3519 return (1); 3520 3521 /* 3522 * Set policy 3523 */ 3524 policy_info->mem_policy = policy; 3525 policy_info->mem_lgrpid = LGRP_NONE; 3526 3527 return (0); 3528 } 3529 3530 3531 /* 3532 * Get shared memory allocation policy with given tree and offset 3533 */ 3534 lgrp_mem_policy_info_t * 3535 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3536 u_offset_t vn_off) 3537 { 3538 u_offset_t off; 3539 lgrp_mem_policy_info_t *policy_info; 3540 lgrp_shm_policy_seg_t *policy_seg; 3541 lgrp_shm_locality_t *shm_locality; 3542 avl_tree_t *tree; 3543 avl_index_t where; 3544 3545 /* 3546 * Get policy segment tree from anon_map or vnode and use specified 3547 * anon index or vnode offset as offset 3548 * 3549 * Assume that no lock needs to be held on anon_map or vnode, since 3550 * they should be protected by their reference count which must be 3551 * nonzero for an existing segment 3552 */ 3553 if (amp) { 3554 ASSERT(amp->refcnt != 0); 3555 shm_locality = amp->locality; 3556 if (shm_locality == NULL) 3557 return (NULL); 3558 tree = shm_locality->loc_tree; 3559 off = ptob(anon_index); 3560 } else if (vp) { 3561 shm_locality = vp->v_locality; 3562 if (shm_locality == NULL) 3563 return (NULL); 3564 ASSERT(shm_locality->loc_count != 0); 3565 tree = shm_locality->loc_tree; 3566 off = vn_off; 3567 } 3568 3569 if (tree == NULL) 3570 return (NULL); 3571 3572 /* 3573 * Lookup policy segment for offset into shared object and return 3574 * policy info 3575 */ 3576 rw_enter(&shm_locality->loc_lock, RW_READER); 3577 policy_info = NULL; 3578 policy_seg = avl_find(tree, &off, &where); 3579 if (policy_seg) 3580 policy_info = &policy_seg->shm_policy; 3581 rw_exit(&shm_locality->loc_lock); 3582 3583 return (policy_info); 3584 } 3585 3586 /* 3587 * Default memory allocation policy for kernel segmap pages 3588 */ 3589 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3590 3591 /* 3592 * Return lgroup to use for allocating memory 3593 * given the segment and address 3594 * 3595 * There isn't any mutual exclusion that exists between calls 3596 * to this routine and DR, so this routine and whomever calls it 3597 * should be mindful of the possibility that the lgrp returned 3598 * may be deleted. If this happens, dereferences of the lgrp 3599 * pointer will still be safe, but the resources in the lgrp will 3600 * be gone, and LGRP_EXISTS() will no longer be true. 3601 */ 3602 lgrp_t * 3603 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3604 { 3605 int i; 3606 lgrp_t *lgrp; 3607 klgrpset_t lgrpset; 3608 int lgrps_spanned; 3609 unsigned long off; 3610 lgrp_mem_policy_t policy; 3611 lgrp_mem_policy_info_t *policy_info; 3612 ushort_t random; 3613 int stat = 0; 3614 extern struct seg *segkmap; 3615 3616 /* 3617 * Just return null if the lgrp framework hasn't finished 3618 * initializing or if this is a UMA machine. 
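 * (More precisely, the root lgroup is returned in that case; with
 * nlgrps == 1 the root is the only lgroup there is.)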
3619 */ 3620 if (nlgrps == 1 || !lgrp_initialized) 3621 return (lgrp_root); 3622 3623 /* 3624 * Get memory allocation policy for this segment 3625 */ 3626 policy = lgrp_mem_default_policy; 3627 if (seg != NULL) { 3628 if (seg->s_as == &kas) { 3629 if (seg == segkmap) 3630 policy = lgrp_segmap_default_policy; 3631 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3632 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3633 policy = LGRP_MEM_POLICY_RANDOM; 3634 } else { 3635 policy_info = lgrp_mem_policy_get(seg, vaddr); 3636 if (policy_info != NULL) { 3637 policy = policy_info->mem_policy; 3638 if (policy == LGRP_MEM_POLICY_NEXT_SEG) { 3639 lgrp_id_t id = policy_info->mem_lgrpid; 3640 ASSERT(id != LGRP_NONE); 3641 ASSERT(id < NLGRPS_MAX); 3642 lgrp = lgrp_table[id]; 3643 if (!LGRP_EXISTS(lgrp)) { 3644 policy = LGRP_MEM_POLICY_NEXT; 3645 } else { 3646 lgrp_stat_add(id, 3647 LGRP_NUM_NEXT_SEG, 1); 3648 return (lgrp); 3649 } 3650 } 3651 } 3652 } 3653 } 3654 lgrpset = 0; 3655 3656 /* 3657 * Initialize lgroup to home by default 3658 */ 3659 lgrp = lgrp_home_lgrp(); 3660 3661 /* 3662 * When homing threads on root lgrp, override default memory 3663 * allocation policies with root lgroup memory allocation policy 3664 */ 3665 if (lgrp == lgrp_root) 3666 policy = lgrp_mem_policy_root; 3667 3668 /* 3669 * Implement policy 3670 */ 3671 switch (policy) { 3672 case LGRP_MEM_POLICY_NEXT_CPU: 3673 3674 /* 3675 * Return lgroup of current CPU which faulted on memory 3676 * If the CPU isn't currently in an lgrp, then opt to 3677 * allocate from the root. 3678 * 3679 * Kernel preemption needs to be disabled here to prevent 3680 * the current CPU from going away before lgrp is found. 3681 */ 3682 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3683 lgrp = lgrp_root; 3684 } else { 3685 kpreempt_disable(); 3686 lgrp = lgrp_cpu_to_lgrp(CPU); 3687 kpreempt_enable(); 3688 } 3689 break; 3690 3691 case LGRP_MEM_POLICY_NEXT: 3692 case LGRP_MEM_POLICY_DEFAULT: 3693 default: 3694 3695 /* 3696 * Just return current thread's home lgroup 3697 * for default policy (next touch) 3698 * If the thread is homed to the root, 3699 * then the default policy is random across lgroups. 3700 * Fallthrough to the random case. 3701 */ 3702 if (lgrp != lgrp_root) { 3703 if (policy == LGRP_MEM_POLICY_NEXT) 3704 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3705 else 3706 lgrp_stat_add(lgrp->lgrp_id, 3707 LGRP_NUM_DEFAULT, 1); 3708 break; 3709 } 3710 /* LINTED fallthrough on case statement */ 3711 case LGRP_MEM_POLICY_RANDOM: 3712 3713 /* 3714 * Return a random leaf lgroup with memory 3715 */ 3716 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3717 /* 3718 * Count how many lgroups are spanned 3719 */ 3720 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3721 3722 /* 3723 * There may be no memnodes in the root lgroup during DR copy 3724 * rename on a system with only two boards (memnodes) 3725 * configured. In this case just return the root lgrp. 
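 *
 * Otherwise a pseudo-random offset into the spanned set is derived
 * from gethrtime() (off = ((ushort_t)gethrtime() >> 4) % lgrps_spanned)
 * and the walk below returns the lgroup sitting at that offset.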
3726 */ 3727 if (lgrps_spanned == 0) { 3728 lgrp = lgrp_root; 3729 break; 3730 } 3731 3732 /* 3733 * Pick a random offset within lgroups spanned 3734 * and return lgroup at that offset 3735 */ 3736 random = (ushort_t)gethrtime() >> 4; 3737 off = random % lgrps_spanned; 3738 ASSERT(off <= lgrp_alloc_max); 3739 3740 for (i = 0; i <= lgrp_alloc_max; i++) { 3741 if (!klgrpset_ismember(lgrpset, i)) 3742 continue; 3743 if (off) 3744 off--; 3745 else { 3746 lgrp = lgrp_table[i]; 3747 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3748 1); 3749 break; 3750 } 3751 } 3752 break; 3753 3754 case LGRP_MEM_POLICY_RANDOM_PROC: 3755 3756 /* 3757 * Grab copy of bitmask of lgroups spanned by 3758 * this process 3759 */ 3760 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3761 stat = LGRP_NUM_RANDOM_PROC; 3762 3763 /* LINTED fallthrough on case statement */ 3764 case LGRP_MEM_POLICY_RANDOM_PSET: 3765 3766 if (!stat) 3767 stat = LGRP_NUM_RANDOM_PSET; 3768 3769 if (klgrpset_isempty(lgrpset)) { 3770 /* 3771 * Grab copy of bitmask of lgroups spanned by 3772 * this processor set 3773 */ 3774 kpreempt_disable(); 3775 klgrpset_copy(lgrpset, 3776 curthread->t_cpupart->cp_lgrpset); 3777 kpreempt_enable(); 3778 } 3779 3780 /* 3781 * Count how many lgroups are spanned 3782 */ 3783 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3784 ASSERT(lgrps_spanned <= nlgrps); 3785 3786 /* 3787 * Probably lgrps_spanned should be always non-zero, but to be 3788 * on the safe side we return lgrp_root if it is empty. 3789 */ 3790 if (lgrps_spanned == 0) { 3791 lgrp = lgrp_root; 3792 break; 3793 } 3794 3795 /* 3796 * Pick a random offset within lgroups spanned 3797 * and return lgroup at that offset 3798 */ 3799 random = (ushort_t)gethrtime() >> 4; 3800 off = random % lgrps_spanned; 3801 ASSERT(off <= lgrp_alloc_max); 3802 3803 for (i = 0; i <= lgrp_alloc_max; i++) { 3804 if (!klgrpset_ismember(lgrpset, i)) 3805 continue; 3806 if (off) 3807 off--; 3808 else { 3809 lgrp = lgrp_table[i]; 3810 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3811 1); 3812 break; 3813 } 3814 } 3815 break; 3816 3817 case LGRP_MEM_POLICY_ROUNDROBIN: 3818 3819 /* 3820 * Use offset within segment to determine 3821 * offset from home lgroup to choose for 3822 * next lgroup to allocate memory from 3823 */ 3824 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3825 (lgrp_alloc_max + 1); 3826 3827 kpreempt_disable(); 3828 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3829 i = lgrp->lgrp_id; 3830 kpreempt_enable(); 3831 3832 while (off > 0) { 3833 i = (i + 1) % (lgrp_alloc_max + 1); 3834 lgrp = lgrp_table[i]; 3835 if (klgrpset_ismember(lgrpset, i)) 3836 off--; 3837 } 3838 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3839 3840 break; 3841 } 3842 3843 ASSERT(lgrp != NULL); 3844 return (lgrp); 3845 } 3846 3847 /* 3848 * Return the number of pages in an lgroup 3849 * 3850 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3851 * could cause tests that rely on the numat driver to fail.... 
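 *
 * For example, lgrp_choose() above uses
 * lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE) to enforce
 * lgrp_mem_free_thresh, and lgrp_kstat_extract() reports the
 * INSTALL/AVAIL/FREE variants in each lgroup's kstats.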
3852 */ 3853 pgcnt_t 3854 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3855 { 3856 lgrp_t *lgrp; 3857 3858 lgrp = lgrp_table[lgrpid]; 3859 if (!LGRP_EXISTS(lgrp) || 3860 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3861 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3862 return (0); 3863 3864 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3865 } 3866 3867 /* 3868 * Initialize lgroup shared memory allocation policy support 3869 */ 3870 void 3871 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3872 { 3873 lgrp_shm_locality_t *shm_locality; 3874 3875 /* 3876 * Initialize locality field in anon_map 3877 * Don't need any locks because this is called when anon_map is 3878 * allocated, but not used anywhere yet. 3879 */ 3880 if (amp) { 3881 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3882 if (amp->locality == NULL) { 3883 /* 3884 * Allocate and initialize shared memory locality info 3885 * and set anon_map locality pointer to it 3886 * Drop lock across kmem_alloc(KM_SLEEP) 3887 */ 3888 ANON_LOCK_EXIT(&amp->a_rwlock); 3889 shm_locality = kmem_alloc(sizeof (*shm_locality), 3890 KM_SLEEP); 3891 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3892 NULL); 3893 shm_locality->loc_count = 1; /* not used for amp */ 3894 shm_locality->loc_tree = NULL; 3895 3896 /* 3897 * Reacquire lock and check to see whether anyone beat 3898 * us to initializing the locality info 3899 */ 3900 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3901 if (amp->locality != NULL) { 3902 rw_destroy(&shm_locality->loc_lock); 3903 kmem_free(shm_locality, 3904 sizeof (*shm_locality)); 3905 } else 3906 amp->locality = shm_locality; 3907 } 3908 ANON_LOCK_EXIT(&amp->a_rwlock); 3909 return; 3910 } 3911 3912 /* 3913 * Allocate shared vnode policy info if vnode is not locality aware yet 3914 */ 3915 mutex_enter(&vp->v_lock); 3916 if ((vp->v_flag & V_LOCALITY) == 0) { 3917 /* 3918 * Allocate and initialize shared memory locality info 3919 */ 3920 mutex_exit(&vp->v_lock); 3921 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3922 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3923 shm_locality->loc_count = 1; 3924 shm_locality->loc_tree = NULL; 3925 3926 /* 3927 * Point vnode locality field at shared vnode policy info 3928 * and set locality aware flag in vnode 3929 */ 3930 mutex_enter(&vp->v_lock); 3931 if ((vp->v_flag & V_LOCALITY) == 0) { 3932 vp->v_locality = shm_locality; 3933 vp->v_flag |= V_LOCALITY; 3934 } else { 3935 /* 3936 * Lost race so free locality info and increment count.
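 * The count is still bumped because this caller represents one more
 * segment sharing the vnode, exactly as in the non-racing path at the
 * end of this function.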
3937 */ 3938 rw_destroy(&shm_locality->loc_lock); 3939 kmem_free(shm_locality, sizeof (*shm_locality)); 3940 shm_locality = vp->v_locality; 3941 shm_locality->loc_count++; 3942 } 3943 mutex_exit(&vp->v_lock); 3944 3945 return; 3946 } 3947 3948 /* 3949 * Increment reference count of number of segments mapping this vnode 3950 * shared 3951 */ 3952 shm_locality = vp->v_locality; 3953 shm_locality->loc_count++; 3954 mutex_exit(&vp->v_lock); 3955 } 3956 3957 /* 3958 * Destroy the given shared memory policy segment tree 3959 */ 3960 void 3961 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3962 { 3963 lgrp_shm_policy_seg_t *cur; 3964 lgrp_shm_policy_seg_t *next; 3965 3966 if (tree == NULL) 3967 return; 3968 3969 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3970 while (cur != NULL) { 3971 next = AVL_NEXT(tree, cur); 3972 avl_remove(tree, cur); 3973 kmem_free(cur, sizeof (*cur)); 3974 cur = next; 3975 } 3976 kmem_free(tree, sizeof (avl_tree_t)); 3977 } 3978 3979 /* 3980 * Uninitialize lgroup shared memory allocation policy support 3981 */ 3982 void 3983 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3984 { 3985 lgrp_shm_locality_t *shm_locality; 3986 3987 /* 3988 * For anon_map, deallocate shared memory policy tree and 3989 * zero locality field 3990 * Don't need any locks because anon_map is being freed 3991 */ 3992 if (amp) { 3993 if (amp->locality == NULL) 3994 return; 3995 shm_locality = amp->locality; 3996 shm_locality->loc_count = 0; /* not really used for amp */ 3997 rw_destroy(&shm_locality->loc_lock); 3998 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 3999 kmem_free(shm_locality, sizeof (*shm_locality)); 4000 amp->locality = 0; 4001 return; 4002 } 4003 4004 /* 4005 * For vnode, decrement reference count of segments mapping this vnode 4006 * shared and delete locality info if reference count drops to 0 4007 */ 4008 mutex_enter(&vp->v_lock); 4009 shm_locality = vp->v_locality; 4010 shm_locality->loc_count--; 4011 4012 if (shm_locality->loc_count == 0) { 4013 rw_destroy(&shm_locality->loc_lock); 4014 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4015 kmem_free(shm_locality, sizeof (*shm_locality)); 4016 vp->v_locality = 0; 4017 vp->v_flag &= ~V_LOCALITY; 4018 } 4019 mutex_exit(&vp->v_lock); 4020 } 4021 4022 /* 4023 * Compare two shared memory policy segments 4024 * Used by AVL tree code for searching 4025 */ 4026 int 4027 lgrp_shm_policy_compar(const void *x, const void *y) 4028 { 4029 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 4030 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 4031 4032 if (a->shm_off < b->shm_off) 4033 return (-1); 4034 if (a->shm_off >= b->shm_off + b->shm_size) 4035 return (1); 4036 return (0); 4037 } 4038 4039 /* 4040 * Concatenate seg1 with seg2 and remove seg2 4041 */ 4042 static int 4043 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 4044 lgrp_shm_policy_seg_t *seg2) 4045 { 4046 if (!seg1 || !seg2 || 4047 seg1->shm_off + seg1->shm_size != seg2->shm_off || 4048 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 4049 return (-1); 4050 4051 seg1->shm_size += seg2->shm_size; 4052 avl_remove(tree, seg2); 4053 kmem_free(seg2, sizeof (*seg2)); 4054 return (0); 4055 } 4056 4057 /* 4058 * Split segment at given offset and return rightmost (uppermost) segment 4059 * Assumes that there are no overlapping segments 4060 */ 4061 static lgrp_shm_policy_seg_t * 4062 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4063 u_offset_t off) 4064 { 4065 lgrp_shm_policy_seg_t *newseg; 4066 
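	/*
	 * Worked example for this routine (the offsets are hypothetical):
	 * splitting a segment that covers [0x0000, 0x8000) at off 0x2000
	 * shrinks the original segment to [0x0000, 0x2000) and inserts a
	 * new segment for [0x2000, 0x8000) that inherits the original
	 * segment's policy.
	 */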
avl_index_t where; 4067 4068 ASSERT(seg != NULL); 4069 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4070 4071 if (!seg || off < seg->shm_off || off > seg->shm_off + 4072 seg->shm_size) 4073 return (NULL); 4074 4075 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4076 return (seg); 4077 4078 /* 4079 * Adjust size of left segment and allocate new (right) segment 4080 */ 4081 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4082 newseg->shm_policy = seg->shm_policy; 4083 newseg->shm_off = off; 4084 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4085 seg->shm_size = off - seg->shm_off; 4086 4087 /* 4088 * Find where to insert new segment in AVL tree and insert it 4089 */ 4090 (void) avl_find(tree, &off, &where); 4091 avl_insert(tree, newseg, where); 4092 4093 return (newseg); 4094 } 4095 4096 /* 4097 * Set shared memory allocation policy on specified shared object at given 4098 * offset and length 4099 * 4100 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4101 * -1 if can't set policy. 4102 */ 4103 int 4104 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, 4105 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) 4106 { 4107 u_offset_t eoff; 4108 lgrp_shm_policy_seg_t *next; 4109 lgrp_shm_policy_seg_t *newseg; 4110 u_offset_t off; 4111 u_offset_t oldeoff; 4112 lgrp_shm_policy_seg_t *prev; 4113 int retval; 4114 lgrp_shm_policy_seg_t *seg; 4115 lgrp_shm_locality_t *shm_locality; 4116 avl_tree_t *tree; 4117 avl_index_t where; 4118 4119 ASSERT(amp || vp); 4120 ASSERT((len & PAGEOFFSET) == 0); 4121 4122 if (len == 0) 4123 return (-1); 4124 4125 retval = 0; 4126 4127 /* 4128 * Get locality info and starting offset into shared object 4129 * Try anon map first and then vnode 4130 * Assume that no locks need to be held on anon_map or vnode, since 4131 * it should be protected by its reference count which must be nonzero 4132 * for an existing segment. 4133 */ 4134 if (amp) { 4135 /* 4136 * Get policy info from anon_map 4137 * 4138 */ 4139 ASSERT(amp->refcnt != 0); 4140 if (amp->locality == NULL) 4141 lgrp_shm_policy_init(amp, NULL); 4142 shm_locality = amp->locality; 4143 off = ptob(anon_index); 4144 } else if (vp) { 4145 /* 4146 * Get policy info from vnode 4147 */ 4148 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) 4149 lgrp_shm_policy_init(NULL, vp); 4150 shm_locality = vp->v_locality; 4151 ASSERT(shm_locality->loc_count != 0); 4152 off = vn_off; 4153 } else 4154 return (-1); 4155 4156 ASSERT((off & PAGEOFFSET) == 0); 4157 4158 /* 4159 * Figure out default policy 4160 */ 4161 if (policy == LGRP_MEM_POLICY_DEFAULT) 4162 policy = lgrp_mem_policy_default(len, MAP_SHARED); 4163 4164 /* 4165 * Create AVL tree if there isn't one yet 4166 * and set locality field to point at it 4167 */ 4168 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4169 tree = shm_locality->loc_tree; 4170 if (!tree) { 4171 rw_exit(&shm_locality->loc_lock); 4172 4173 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 4174 4175 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4176 if (shm_locality->loc_tree == NULL) { 4177 avl_create(tree, lgrp_shm_policy_compar, 4178 sizeof (lgrp_shm_policy_seg_t), 4179 offsetof(lgrp_shm_policy_seg_t, shm_tree)); 4180 shm_locality->loc_tree = tree; 4181 } else { 4182 /* 4183 * Another thread managed to set up the tree 4184 * before we could. Free the tree we allocated 4185 * and use the one that's already there. 
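 * (The same drop-the-lock, allocate, then re-check pattern is used for
 * the anon_map and vnode locality structures in lgrp_shm_policy_init()
 * above.)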
4186 */ 4187 kmem_free(tree, sizeof (*tree)); 4188 tree = shm_locality->loc_tree; 4189 } 4190 } 4191 4192 /* 4193 * Set policy 4194 * 4195 * Need to maintain hold on writer's lock to keep tree from 4196 * changing out from under us 4197 */ 4198 while (len != 0) { 4199 /* 4200 * Find policy segment for specified offset into shared object 4201 */ 4202 seg = avl_find(tree, &off, &where); 4203 4204 /* 4205 * Didn't find any existing segment that contains specified 4206 * offset, so allocate new segment, insert it, and concatenate 4207 * with adjacent segments if possible 4208 */ 4209 if (seg == NULL) { 4210 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), 4211 KM_SLEEP); 4212 newseg->shm_policy.mem_policy = policy; 4213 newseg->shm_policy.mem_lgrpid = LGRP_NONE; 4214 newseg->shm_off = off; 4215 avl_insert(tree, newseg, where); 4216 4217 /* 4218 * Check to see whether new segment overlaps with next 4219 * one, set length of new segment accordingly, and 4220 * calculate remaining length and next offset 4221 */ 4222 seg = AVL_NEXT(tree, newseg); 4223 if (seg == NULL || off + len <= seg->shm_off) { 4224 newseg->shm_size = len; 4225 len = 0; 4226 } else { 4227 newseg->shm_size = seg->shm_off - off; 4228 off = seg->shm_off; 4229 len -= newseg->shm_size; 4230 } 4231 4232 /* 4233 * Try to concatenate new segment with next and 4234 * previous ones, since they might have the same policy 4235 * now. Grab previous and next segments first because 4236 * they will change on concatenation. 4237 */ 4238 prev = AVL_PREV(tree, newseg); 4239 next = AVL_NEXT(tree, newseg); 4240 (void) lgrp_shm_policy_concat(tree, newseg, next); 4241 (void) lgrp_shm_policy_concat(tree, prev, newseg); 4242 4243 continue; 4244 } 4245 4246 eoff = off + len; 4247 oldeoff = seg->shm_off + seg->shm_size; 4248 4249 /* 4250 * Policy set already? 4251 */ 4252 if (policy == seg->shm_policy.mem_policy) { 4253 /* 4254 * Nothing left to do if offset and length 4255 * fall within this segment 4256 */ 4257 if (eoff <= oldeoff) { 4258 retval = 1; 4259 break; 4260 } else { 4261 len = eoff - oldeoff; 4262 off = oldeoff; 4263 continue; 4264 } 4265 } 4266 4267 /* 4268 * Specified offset and length match existing segment exactly 4269 */ 4270 if (off == seg->shm_off && len == seg->shm_size) { 4271 /* 4272 * Set policy and update current length 4273 */ 4274 seg->shm_policy.mem_policy = policy; 4275 seg->shm_policy.mem_lgrpid = LGRP_NONE; 4276 len = 0; 4277 4278 /* 4279 * Try concatenating new segment with previous and next 4280 * segments, since they might have the same policy now. 4281 * Grab previous and next segments first because they 4282 * will change on concatenation. 
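 * The order below also matters: "seg" is merged with its successor
 * first and only then absorbed into "prev", since
 * lgrp_shm_policy_concat() frees its second argument when it merges.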
4283 */ 4284 prev = AVL_PREV(tree, seg); 4285 next = AVL_NEXT(tree, seg); 4286 (void) lgrp_shm_policy_concat(tree, seg, next); 4287 (void) lgrp_shm_policy_concat(tree, prev, seg); 4288 } else { 4289 /* 4290 * Specified offset and length only apply to part of 4291 * existing segment 4292 */ 4293 4294 /* 4295 * New segment starts in middle of old one, so split 4296 * new one off near beginning of old one 4297 */ 4298 newseg = NULL; 4299 if (off > seg->shm_off) { 4300 newseg = lgrp_shm_policy_split(tree, seg, off); 4301 4302 /* 4303 * New segment ends where old one did, so try 4304 * to concatenate with next segment 4305 */ 4306 if (eoff == oldeoff) { 4307 newseg->shm_policy.mem_policy = policy; 4308 newseg->shm_policy.mem_lgrpid = 4309 LGRP_NONE; 4310 (void) lgrp_shm_policy_concat(tree, 4311 newseg, AVL_NEXT(tree, newseg)); 4312 break; 4313 } 4314 } 4315 4316 /* 4317 * New segment ends before old one, so split off end of 4318 * old one 4319 */ 4320 if (eoff < oldeoff) { 4321 if (newseg) { 4322 (void) lgrp_shm_policy_split(tree, 4323 newseg, eoff); 4324 newseg->shm_policy.mem_policy = policy; 4325 newseg->shm_policy.mem_lgrpid = 4326 LGRP_NONE; 4327 } else { 4328 (void) lgrp_shm_policy_split(tree, seg, 4329 eoff); 4330 seg->shm_policy.mem_policy = policy; 4331 seg->shm_policy.mem_lgrpid = LGRP_NONE; 4332 } 4333 4334 if (off == seg->shm_off) 4335 (void) lgrp_shm_policy_concat(tree, 4336 AVL_PREV(tree, seg), seg); 4337 break; 4338 } 4339 4340 /* 4341 * Calculate remaining length and next offset 4342 */ 4343 len = eoff - oldeoff; 4344 off = oldeoff; 4345 } 4346 } 4347 4348 rw_exit(&shm_locality->loc_lock); 4349 return (retval); 4350 } 4351 4352 /* 4353 * Return the best memnode from which to allocate memory given 4354 * an lgroup. 4355 * 4356 * "c" is for cookie, which is good enough for me. 4357 * It references a cookie struct that should be zero'ed to initialize. 4358 * The cookie should live on the caller's stack. 4359 * 4360 * The routine returns -1 when: 4361 * - traverse is 0, and all the memnodes in "lgrp" have been returned. 4362 * - traverse is 1, and all the memnodes in the system have been 4363 * returned. 4364 */ 4365 int 4366 lgrp_memnode_choose(lgrp_mnode_cookie_t *c) 4367 { 4368 lgrp_t *lp = c->lmc_lgrp; 4369 mnodeset_t nodes = c->lmc_nodes; 4370 int cnt = c->lmc_cnt; 4371 int offset, mnode; 4372 4373 extern int max_mem_nodes; 4374 4375 /* 4376 * If the set is empty, and the caller is willing, traverse 4377 * up the hierarchy until we find a non-empty set. 4378 */ 4379 while (nodes == (mnodeset_t)0 || cnt <= 0) { 4380 if (c->lmc_scope == LGRP_SRCH_LOCAL || 4381 ((lp = lp->lgrp_parent) == NULL)) 4382 return (-1); 4383 4384 nodes = lp->lgrp_mnodes & ~(c->lmc_tried); 4385 cnt = lp->lgrp_nmnodes - c->lmc_ntried; 4386 } 4387 4388 /* 4389 * Select a memnode by picking one at a "random" offset. 4390 * Because of DR, memnodes can come and go at any time. 4391 * This code must be able to cope with the possibility 4392 * that the nodes count "cnt" is inconsistent with respect 4393 * to the number of elements actually in "nodes", and 4394 * therefore that the offset chosen could be greater than 4395 * the number of elements in the set (some memnodes may 4396 * have disappeared just before cnt was read). 4397 * If this happens, the search simply wraps back to the 4398 * beginning of the set.
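 * For example (the counts here are hypothetical): if cnt was read as 4
 * but only three bits remain set in "nodes", an offset of 3 is
 * decremented once for each set bit, the inner loop in the code below
 * runs off the end with mnode == max_mem_nodes, and the next pass of
 * the do/while loop selects the first memnode still in the set.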
4399 */ 4400 ASSERT(nodes != (mnodeset_t)0 && cnt > 0); 4401 offset = c->lmc_rand % cnt; 4402 do { 4403 for (mnode = 0; mnode < max_mem_nodes; mnode++) 4404 if (nodes & ((mnodeset_t)1 << mnode)) 4405 if (!offset--) 4406 break; 4407 } while (mnode >= max_mem_nodes); 4408 4409 /* Found a node. Store state before returning. */ 4410 c->lmc_lgrp = lp; 4411 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); 4412 c->lmc_cnt = cnt - 1; 4413 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); 4414 c->lmc_ntried++; 4415 4416 return (mnode); 4417 } 4418
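/*
 * Usage sketch for lgrp_memnode_choose() (hypothetical caller code; the
 * cookie may also be set up by an initialization macro in <sys/lgrp.h>,
 * and LGRP_SRCH_HIER is assumed here as the scope that permits walking
 * up the hierarchy, in contrast to LGRP_SRCH_LOCAL above):
 *
 *	lgrp_mnode_cookie_t c;
 *	int mnode;
 *
 *	bzero(&c, sizeof (c));
 *	c.lmc_lgrp = lgrp;
 *	c.lmc_nodes = lgrp->lgrp_mnodes;
 *	c.lmc_cnt = lgrp->lgrp_nmnodes;
 *	c.lmc_scope = LGRP_SRCH_HIER;
 *	c.lmc_rand = (ushort_t)gethrtime() >> 4;
 *
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		... try to allocate from "mnode", stop on success ...
 *	}
 */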