/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */
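/*
 * Illustrative example of the membership test above, using hypothetical
 * values (not taken from any particular machine): suppose every lgroup
 * reports mask == 0x1800000000, so the home bits are PA bits 35 and 36,
 * and suppose an mblock has ra_to_pa == 0x10000000000.  A real address
 * ra == 0x812345678 then maps to PA 0x10812345678; (PA & mask) is
 * 0x800000000, so the page belongs to the lgroup whose match value is
 * 0x800000000.  Its home value, and hence its lgrp_handle_t and
 * mem_node, is 0x800000000 >> 35 == 1.
 */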
int sun4v_mpo_enable = 1;
int sun4v_mpo_debug = 0;
char sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define MAX_MD_LGROUPS 32
static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static int n_lgrpnodes = 0;
static int n_locality_groups = 0;
static int max_locality_groups = 0;

/* Save mblocks from the MD */
#define SMALL_MBLOCKS_COUNT	8
static struct mblock_md *mpo_mblock;
static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static int n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t *mem_stripes;
static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static int mstripesz = 0;
static int n_mem_stripes = 0;
static pfn_t mnode_stride;	/* distance between stripes, start to start */
static int stripe_shift;	/* stride/stripes expressed as a shift */
static pfn_t mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static uint64_t home_mask = 0;
static pfn_t home_mask_pfn = 0;
static int home_mask_shift = 0;
static uint_t home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static int lower_latency = 0;
static int higher_latency = 0;

static pfn_t base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static int valid_pages(md_t *md, mde_cookie_t cpu0);
static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static int fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define MPO_STATUS(args...) {						\
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status);					\
}

/*
 * Routine to read a uint64_t from a given md
 */
static int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 *
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t n_lgroups = 0;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;
	int mblocksz = 0;
	size_t allocsz;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}
	/*
	 * If we have a small number of mblocks we will use the space
	 * that we preallocated. Otherwise, we will dynamically
	 * allocate the space
	 */
	mblocksz = n_mblocks * sizeof (struct mblock_md);
	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);

	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
		mpo_mblock = &small_mpo_mblocks[0];
		mem_stripes = &small_mem_stripes[0];
	} else {
		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
		/* Ensure that we don't request more space than reserved */
		if (allocsz > MPOBUF_SIZE) {
			MPO_STATUS("lgrp_traverse: Insufficient space "
			    "for mblock structures \n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_mblock = (struct mblock_md *)
		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
			MPO_STATUS("lgrp_traverse: Cannot allocate space "
			    "for mblocks \n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
		mpo_heap32_bufsz = MPOBUF_SIZE;

		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
	}

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];
		mpo_mblock[i].mnode_mask = (mnodeset_t)0;

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);
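
	/*
	 * Worked example with hypothetical values: if home_mask is
	 * 0x1800000000 (PA bits 35 and 36) and the page size is 8K
	 * (PAGESHIFT == 13), then home_mask_shift == 35,
	 * home_mask_pfn == 0xc00000, home_mask_pfn_shift == 22, and
	 * mnode_pages == 0x400000 pages (a 32 GB stripe width), giving
	 * four possible home values.
	 */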

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;
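
	/*
	 * Example of the unit conversion: an MD latency of 150000
	 * picoseconds is 150 nanoseconds, which the division by 10000
	 * expresses as 15 in the framework's nano-sec/10 units.
	 */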

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].latency = (uint_t)(-1);
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */

		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
		    "fwd", &nodes);
		if (n_lgroups <= 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
			ret_val = -1;
			goto fail;
		}

		/*
		 * Find the lgroup this cpu belongs to with the lowest latency.
		 * Check all the lgrp nodes connected to this CPU to determine
		 * which has the smallest latency.
		 */

		for (j = 0; j < n_lgroups; j++) {
			for (o = 0; o < n_lgrpnodes; o++) {
				if (nodes[j] == mpo_lgroup[o].node) {
					if (mpo_lgroup[o].latency <
					    mpo_cpu[k].latency) {
						mpo_cpu[k].home =
						    mpo_lgroup[o].addr_match
						    >> home_mask_shift;
						mpo_cpu[k].latency =
						    mpo_lgroup[o].latency;
						mpo_lgroup[o].ncpu++;
					}
				}
			}
		}
		md_free_scan_dag(md, &nodes);
	}

	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int i, rc, ncpu_min;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */
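
	/*
	 * For example, if the smallest lgroup has 4 CPUs, the threshold
	 * below becomes 4 * lgrp_loadavg_max_effect / 2, i.e. the load
	 * equivalent of 2 fully busy CPUs, and lgrp_expand_proc_diff
	 * (half of that threshold) further requires the candidate lgroup
	 * to be correspondingly less loaded before threads spread to it.
	 */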

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	    mblock 0                      mblock 1
 */
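/*
 * To continue that example: with max_locality_groups == 4, the stripe
 * describing mnode 2 within mpo_mblock[1] lives at
 * mem_stripes[2 + 1 * 4], i.e. mem_stripes[6].  If mnode_pages were
 * 0x400000, the stride would be 4 * 0x400000 == 0x1000000 pages, so
 * within an mblock each mem_node owns 0x400000 pages out of every
 * 0x1000000, separated by a gap of 0xc00000 pages belonging to the
 * other three mem_nodes.
 */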

void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, mstripesz);
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);
			mpo_mblock[i].mnode_mask |= (mnodeset_t)1 << mnode;

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
	}
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}
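/*
 * Continuing the hypothetical example above (home_mask_pfn == 0xc00000,
 * home_mask_pfn_shift == 22): a pfn whose PA-adjusted value
 * (pfn + ra_to_pa_pfn) has bit 0x800000 set and bit 0x400000 clear
 * extracts home bits 0b10 and therefore maps to mnode 2.
 */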

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration,
 *	the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if ((mpo_mblock[i].mnode_mask & ((mnodeset_t)1 << mnode)) &&
		    (pfn <= mpo_mblock[i].end_pfn))
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */
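
	/*
	 * For example, suppose a stripe record has offset == 0,
	 * physbase == P, and physmax == P + 2 * mnode_stride +
	 * mnode_pages - 1 (three whole stripe pieces), and the test range
	 * covers it entirely.  Then len rounds up to 3 * mnode_stride, the
	 * exclusive end lands exactly at the start of the trailing gap so
	 * frag == 0, and this iteration adds
	 * (3 * mnode_stride) >> stripe_shift == 3 * mnode_pages pages.
	 */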

	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */
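/*
 * For example, an lgroup whose addr_mask includes bit 0x400 would home
 * memory on 1 KB boundaries, finer than the 8 KB page size, so it is
 * removed here and its memory remains covered by the coarse-grained
 * lgroups that are kept.
 */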
static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}