/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */

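/*
 * Illustrative sketch (not part of the platform code): how the lgroup
 * membership test and home-bit extraction described above could be
 * written.  The MPO_EXAMPLE guard and function names are hypothetical
 * and exist only for illustration.
 */
#ifdef MPO_EXAMPLE
static int
example_ra_in_lgroup(uint64_t ra, uint64_t ra_to_pa, uint64_t mask,
    uint64_t match)
{
        /* (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match */
        return (((ra + ra_to_pa) & mask) == match);
}

static uint64_t
example_home_bits(uint64_t ra, uint64_t ra_to_pa, uint64_t mask)
{
        /* Shift right to drop the 0 bit positions below the mask */
        return (((ra + ra_to_pa) & mask) >> (lowbit(mask) - 1));
}
#endif  /* MPO_EXAMPLE */
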
int sun4v_mpo_enable = 1;
int sun4v_mpo_debug = 0;
char sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define MAX_MD_LGROUPS 32
static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static int n_lgrpnodes = 0;
static int n_locality_groups = 0;
static int max_locality_groups = 0;

/* Save mblocks from the MD */
#define SMALL_MBLOCKS_COUNT 8
static struct mblock_md *mpo_mblock;
static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static int n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t *mem_stripes;
static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static int mstripesz = 0;
static int n_mem_stripes = 0;
static pfn_t mnode_stride;      /* distance between stripes, start to start */
static int stripe_shift;        /* stride/stripes expressed as a shift */
static pfn_t mnode_pages;       /* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static uint64_t home_mask = 0;
static pfn_t home_mask_pfn = 0;
static int home_mask_shift = 0;
static uint_t home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static int lower_latency = 0;
static int higher_latency = 0;

static pfn_t base_ra_to_pa_pfn = 0;     /* ra_to_pa for single mblock memory */

static int valid_pages(md_t *md, mde_cookie_t cpu0);
static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static int fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define MPO_DEBUG(...)
#endif  /* DEBUG */

/* Record status message, viewable from mdb */
#define MPO_STATUS(args...) {                                           \
        (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
        MPO_DEBUG(sun4v_mpo_status);                                    \
}

/*
 * Routine to read a uint64_t from a given md
 */
static int64_t
get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
{
        int err = md_get_prop_val(md, node, propname, val);
        return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
        struct mblock_md *m1 = (struct mblock_md *)a;
        struct mblock_md *m2 = (struct mblock_md *)b;

        if (m1->base < m2->base)
                return (-1);
        else if (m1->base == m2->base)
                return (0);
        else
                return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
        extern void qsort(void *, size_t, size_t,
            int (*)(const void *, const void *));

        qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

static void
mpo_update_tunables(void)
{
        int i, ncpu_min;

        /*
         * lgrp_expand_proc_thresh is the minimum load on the lgroups
         * this process is currently running on before considering
         * expanding threads to another lgroup.
         *
         * lgrp_expand_proc_diff determines how much less the remote lgroup
         * must be loaded before expanding to it.
         *
         * On sun4v CMT processors, threads share a core pipeline, and
         * at less than 100% utilization, best throughput is obtained by
         * spreading threads across more cores, even if some are in a
         * different lgroup.  Spread threads to a new lgroup if the
         * current group is more than 50% loaded.  Because of virtualization,
         * lgroups may have different numbers of CPUs, but the tunables
         * apply to all lgroups, so find the smallest lgroup and compute
         * 50% loading.
         */

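        /*
         * Worked example (illustrative numbers only): if the smallest
         * lgroup has 8 CPUs, the threshold computed below is
         * 8 * lgrp_loadavg_max_effect / 2, i.e. the load equivalent of
         * 4 fully loaded CPUs, which is 50% of that lgroup's capacity.
         */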
        ncpu_min = NCPU;
        for (i = 0; i < n_lgrpnodes; i++) {
                int ncpu = mpo_lgroup[i].ncpu;
                if (ncpu != 0 && ncpu < ncpu_min)
                        ncpu_min = ncpu;
        }
        lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

        /* new home may only be half as loaded as the existing home to use it */
        lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

        lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
}

static mde_cookie_t
cpuid_to_cpunode(md_t *md, int cpuid)
{
        mde_cookie_t rootnode, foundnode, *cpunodes;
        uint64_t cpuid_prop;
        int n_cpunodes, i;

        if (md == NULL)
                return (MDE_INVAL_ELEM_COOKIE);

        rootnode = md_root_node(md);
        if (rootnode == MDE_INVAL_ELEM_COOKIE)
                return (MDE_INVAL_ELEM_COOKIE);

        n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
            "fwd", &cpunodes);
        if (n_cpunodes <= 0 || n_cpunodes > NCPU)
                goto cpuid_fail;

        for (i = 0; i < n_cpunodes; i++) {
                if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
                    &cpuid_prop))
                        break;
                if (cpuid_prop == (uint64_t)cpuid) {
                        foundnode = cpunodes[i];
                        md_free_scan_dag(md, &cpunodes);
                        return (foundnode);
                }
        }
cpuid_fail:
        if (n_cpunodes > 0)
                md_free_scan_dag(md, &cpunodes);
        return (MDE_INVAL_ELEM_COOKIE);
}

static int
mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
{
        mde_cookie_t *nodes;
        uint64_t latency, lowest_latency;
        uint64_t address_match, lowest_address_match;
        int n_lgroups, j, result = 0;

        /* Find lgroup nodes reachable from this cpu */
        n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
            "fwd", &nodes);

        lowest_latency = ~(0UL);

        /* Find the lgroup node with the smallest latency */
        for (j = 0; j < n_lgroups; j++) {
                result = get_int(md, nodes[j], PROP_LG_LATENCY,
                    &latency);
                result |= get_int(md, nodes[j], PROP_LG_MATCH,
                    &address_match);
                if (result != 0) {
                        j = -1;
                        goto to_lgrp_done;
                }
                if (latency < lowest_latency) {
                        lowest_latency = latency;
                        lowest_address_match = address_match;
                }
        }
        for (j = 0; j < n_lgrpnodes; j++) {
                if ((mpo_lgroup[j].latency == lowest_latency) &&
                    (mpo_lgroup[j].addr_match == lowest_address_match))
                        break;
        }
        if (j == n_lgrpnodes)
                j = -1;

to_lgrp_done:
        if (n_lgroups > 0)
                md_free_scan_dag(md, &nodes);
        return (j);
}

/* Called when DR'ing in a CPU */
void
mpo_cpu_add(int cpuid)
{
        md_t *md;
        mde_cookie_t cpunode;

        int i;

        if (n_lgrpnodes <= 0)
                return;

        md = md_get_handle();

        if (md == NULL)
                goto add_fail;

        cpunode = cpuid_to_cpunode(md, cpuid);
        if (cpunode == MDE_INVAL_ELEM_COOKIE)
                goto add_fail;

        i = mpo_cpu_to_lgroup(md, cpunode);
        if (i == -1)
                goto add_fail;

        mpo_cpu[cpuid].lgrp_index = i;
        mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
        mpo_lgroup[i].ncpu++;
        mpo_update_tunables();
        (void) md_fini_handle(md);
        return;
add_fail:
        panic("mpo_cpu_add: Cannot read MD");
}

/* Called when DR'ing out a CPU */
void
mpo_cpu_remove(int cpuid)
{
        int i;

        if (n_lgrpnodes <= 0)
                return;

        i = mpo_cpu[cpuid].lgrp_index;
        mpo_lgroup[i].ncpu--;
        mpo_cpu[cpuid].home = 0;
        mpo_cpu[cpuid].lgrp_index = -1;
        mpo_update_tunables();
}

/*
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static int
lgrp_traverse(md_t *md)
{
        mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
        uint64_t i, j, k, o, n_nodes;
        uint64_t mem_lg_homeset = 0;
        int ret_val = 0;
        int result = 0;
        int n_cpunodes = 0;
        int sub_page_fix;
        int mblocksz = 0;
        size_t allocsz;

        n_nodes = md_node_count(md);

        if (n_nodes <= 0) {
                MPO_STATUS("lgrp_traverse: No nodes in node count\n");
                ret_val = -1;
                goto fail;
        }

        root = md_root_node(md);

        if (root == MDE_INVAL_ELEM_COOKIE) {
                MPO_STATUS("lgrp_traverse: Root node is missing\n");
                ret_val = -1;
                goto fail;
        }

        /*
         * Build the Memory Nodes.  Do this before any possibility of
         * bailing from this routine so we obtain ra_to_pa (needed for page
         * coloring) even when there are no lgroups defined.
         */

        n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
            "fwd", &mblocknodes);

        if (n_mblocks <= 0) {
                MPO_STATUS("lgrp_traverse: No mblock "
                    "nodes detected in Machine Descriptor\n");
                n_mblocks = 0;
                ret_val = -1;
                goto fail;
        }
        /*
         * If we have a small number of mblocks we will use the space
         * that we preallocated.  Otherwise, we will dynamically
         * allocate the space.
         */
        mblocksz = n_mblocks * sizeof (struct mblock_md);
        mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);

        if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
                mpo_mblock = &small_mpo_mblocks[0];
                mem_stripes = &small_mem_stripes[0];
        } else {
                allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
                /* Ensure that we don't request more space than reserved */
                if (allocsz > MPOBUF_SIZE) {
                        MPO_STATUS("lgrp_traverse: Insufficient space "
                            "for mblock structures \n");
                        ret_val = -1;
                        n_mblocks = 0;
                        goto fail;
                }
                mpo_mblock = (struct mblock_md *)
                    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
                if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
                        MPO_STATUS("lgrp_traverse: Cannot allocate space "
                            "for mblocks \n");
                        ret_val = -1;
                        n_mblocks = 0;
                        goto fail;
                }
                mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
                mpo_heap32_bufsz = MPOBUF_SIZE;

                mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
        }
        for (i = 0, j = 0; j < n_mblocks; j++) {
                mpo_mblock[i].node = mblocknodes[j];

                /* Without a base or size value we will fail */
                result = get_int(md, mblocknodes[j], PROP_LG_BASE,
                    &mpo_mblock[i].base);
                if (result < 0) {
                        MPO_STATUS("lgrp_traverse: "
                            "PROP_LG_BASE is missing\n");
                        n_mblocks = 0;
                        ret_val = -1;
                        goto fail;
                }

                result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
                    &mpo_mblock[i].size);
                if (result < 0) {
                        MPO_STATUS("lgrp_traverse: "
                            "PROP_LG_SIZE is missing\n");
                        n_mblocks = 0;
                        ret_val = -1;
                        goto fail;
                }

                result = get_int(md, mblocknodes[j],
                    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

                /* If we don't have an ra_pa_offset, just set it to 0 */
                if (result < 0)
                        mpo_mblock[i].ra_to_pa = 0;

                MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
                    "ra_to_pa = %lx\n", i,
                    mpo_mblock[i].base,
                    mpo_mblock[i].size,
                    mpo_mblock[i].ra_to_pa);

                /* check for unsupportable values of base and size */
                if (mpo_mblock[i].base >
                    mpo_mblock[i].base + mpo_mblock[i].size) {
                        MPO_STATUS("lgrp_traverse: "
                            "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
                            "base = %lx, size = %lx",
                            mpo_mblock[i].base, mpo_mblock[i].size);
                        n_mblocks = 0;
                        ret_val = -1;
                        goto fail;
                }

                /* eliminate size==0 blocks */
                if (mpo_mblock[i].size != 0) {
                        i++;
                }
        }

        if (i == 0) {
                MPO_STATUS("lgrp_traverse: "
                    "No non-empty mblock nodes were found "
                    "in the Machine Descriptor\n");
                n_mblocks = 0;
                ret_val = -1;
                goto fail;
        }
        ASSERT(i <= n_mblocks);
        n_mblocks = i;

        /* Must sort mblocks by address for mem_node_iterator_init() */
        mblock_sort(mpo_mblock, n_mblocks);

        base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

        /* Page coloring hook is required so we can iterate through mnodes */
        if (&page_next_pfn_for_color_cpu == NULL) {
                MPO_STATUS("lgrp_traverse: No page coloring support\n");
                ret_val = -1;
                goto fail;
        }

        /* Global enable for mpo */
        if (sun4v_mpo_enable == 0) {
                MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
                ret_val = -1;
                goto fail;
        }

        n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
            "fwd", &lgrpnodes);

        if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
                MPO_STATUS("lgrp_traverse: No Lgroups\n");
                ret_val = -1;
                goto fail;
        }

        n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

        if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
                MPO_STATUS("lgrp_traverse: No CPU nodes detected "
                    "in MD\n");
                ret_val = -1;
                goto fail;
        }

        MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
        MPO_DEBUG("lgrp_traverse: md: %p\n", md);
        MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
        MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
        MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
        MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

        for (i = 0; i < n_lgrpnodes; i++) {
                mpo_lgroup[i].node = lgrpnodes[i];
                mpo_lgroup[i].id = i;
                mpo_lgroup[i].ncpu = 0;
                result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
                    &mpo_lgroup[i].addr_mask);
                result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
                    &mpo_lgroup[i].addr_match);

                /*
                 * If either the mask or match properties are missing, set to 0
                 */
                if (result < 0) {
                        mpo_lgroup[i].addr_mask = 0;
                        mpo_lgroup[i].addr_match = 0;
                }

                /* Set latency to 0 if property not present */

                result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
                    &mpo_lgroup[i].latency);
                if (result < 0)
                        mpo_lgroup[i].latency = 0;
        }

        /*
         * Sub-page level interleave is not yet supported.  Check for it,
         * and remove sub-page interleaved lgroups from mpo_lgroup and
         * n_lgrpnodes.  If no lgroups are left, return.
         */

        sub_page_fix = fix_interleave();
        if (n_lgrpnodes == 0) {
                ret_val = -1;
                goto fail;
        }

        /* Ensure that all of the addr_mask values are the same */

        for (i = 0; i < n_lgrpnodes; i++) {
                if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
                        MPO_STATUS("lgrp_traverse: "
                            "addr_mask values are not the same\n");
                        ret_val = -1;
                        goto fail;
                }
        }

        /*
         * Ensure that all lgrp nodes see all the mblocks.  However, if
         * sub-page interleave is being fixed, they do not, so skip
         * the check.
         */

        if (sub_page_fix == 0) {
                for (i = 0; i < n_lgrpnodes; i++) {
                        j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
                            PROP_LG_MBLOCK, "fwd", &nodes);
                        md_free_scan_dag(md, &nodes);
                        if (j != n_mblocks) {
                                MPO_STATUS("lgrp_traverse: "
                                    "sub-page interleave is being fixed\n");
                                ret_val = -1;
                                goto fail;
                        }
                }
        }

        /*
         * Use the address mask from the first lgroup node
         * to establish our home_mask.
         */
        home_mask = mpo_lgroup[0].addr_mask;
        home_mask_pfn = btop(home_mask);
        home_mask_shift = lowbit(home_mask) - 1;
        home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
        mnode_pages = btop(1ULL << home_mask_shift);

        /*
         * How many values are possible in home mask?  Assume the mask
         * bits are contiguous.
         */
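        /*
         * Worked example (illustrative numbers only): with
         * home_mask = 0x300000000 (two contiguous bits, 32 and 33),
         * home_mask_shift is 32, mnode_pages is btop(1ULL << 32)
         * (a 4GB stripe), home_mask_pfn_shift is 32 - PAGESHIFT,
         * and max_locality_groups below evaluates to 4.
         */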
        max_locality_groups =
            1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

        /* Now verify the home mask bits are contiguous */

        if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
                MPO_STATUS("lgrp_traverse: "
                    "home mask bits are not contiguous\n");
                ret_val = -1;
                goto fail;
        }

        /* Record all of the home bits */

        for (i = 0; i < n_lgrpnodes; i++) {
                HOMESET_ADD(mem_lg_homeset,
                    mpo_lgroup[i].addr_match >> home_mask_shift);
        }

        /* Count the number of different "home" mem_lg's we've discovered */

        n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

        /* If we have only 1 locality group then we can exit */
        if (n_locality_groups == 1) {
                MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
                ret_val = -1;
                goto fail;
        }

        /*
         * Set the latencies.  A CPU's lgroup is defined by the lowest
         * latency found.  All other memory is considered remote, and the
         * remote latency is represented by the highest latency found.
         * Thus hierarchical lgroups, if any, are approximated by a
         * two level scheme.
         *
         * The Solaris MPO framework by convention wants to see latencies
         * in units of nano-sec/10.  In the MD, the units are defined to be
         * pico-seconds.
         */

        lower_latency = mpo_lgroup[0].latency;
        higher_latency = mpo_lgroup[0].latency;

        for (i = 1; i < n_lgrpnodes; i++) {
                if (mpo_lgroup[i].latency < lower_latency) {
                        lower_latency = mpo_lgroup[i].latency;
                }
                if (mpo_lgroup[i].latency > higher_latency) {
                        higher_latency = mpo_lgroup[i].latency;
                }
        }
        lower_latency /= 10000;
        higher_latency /= 10000;

        /* Clear our CPU data */

        for (i = 0; i < NCPU; i++) {
                mpo_cpu[i].home = 0;
                mpo_cpu[i].lgrp_index = -1;
        }

        /* Build the CPU nodes */
        for (i = 0; i < n_cpunodes; i++) {

                /* Read in the lgroup nodes */
                result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
                if (result < 0) {
                        MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
                        ret_val = -1;
                        goto fail;
                }

                o = mpo_cpu_to_lgroup(md, cpunodes[i]);
                if (o == -1) {
                        ret_val = -1;
                        goto fail;
                }
                mpo_cpu[k].lgrp_index = o;
                mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
                mpo_lgroup[o].ncpu++;
        }
        /* Validate that no large pages cross mnode boundaries. */
        if (valid_pages(md, cpunodes[0]) == 0) {
                ret_val = -1;
                goto fail;
        }

fail:
        /* MD cookies are no longer valid; ensure they are not used again. */
        for (i = 0; i < n_mblocks; i++)
                mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
        for (i = 0; i < n_lgrpnodes; i++)
                mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

        if (n_cpunodes > 0)
                md_free_scan_dag(md, &cpunodes);
        if (n_lgrpnodes > 0)
                md_free_scan_dag(md, &lgrpnodes);
        if (n_mblocks > 0)
                md_free_scan_dag(md, &mblocknodes);
        else
                panic("lgrp_traverse: No memory blocks found");

        if (ret_val == 0)
                MPO_STATUS("MPO feature is enabled.\n");

        return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
        int homeid;
        int count = 0;

        /*
         * Scan the "home" bits of the mem_lgs, count
         * the number that are unique.
         */

        for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
                if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
                        count++;
                }
        }

        MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
            mem_lg_homeset);
        MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

        /* Default must be at least one */
        if (count == 0)
                count = 1;

        return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
        md_t *md;
        int rc;

        /* Get the Machine Descriptor handle */

        md = md_get_handle();

        /* If not, we cannot continue */

        if (md == NULL) {
                panic("cannot access machine descriptor\n");
        } else {
                rc = lgrp_traverse(md);
                (void) md_fini_handle(md);
        }

        /*
         * If we can't process the MD for lgroups then at least let the
         * system try to boot.  Assume we have one lgroup so that
         * when plat_build_mem_nodes is called, it will attempt to init
         * an mnode based on the supplied memory segment.
         */

        if (rc == -1) {
                home_mask_pfn = 0;
                max_locality_groups = 1;
                n_locality_groups = 1;
                return;
        }

        mem_node_pfn_shift = 0;
        mem_node_physalign = 0;

        /* Use lgroup-aware TSB allocations */
        tsb_lgrp_affinity = 1;

        /* Require that a home lgroup have some memory to be chosen */
        lgrp_mem_free_thresh = 1;

        /* Standard home-on-next-touch policy */
        lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

        /* Disable option to choose root lgroup if all leaf lgroups are busy */
        lgrp_load_thresh = UINT32_MAX;

        mpo_update_tunables();
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
        static int slice_count = 0;

        slice_count++;
        MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
            slice_count, basepfn, endpfn);
#endif
        mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
        MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
            "mnode index: %d\n", plathand, mnode);
        plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	  mblock 0	   mblock 1
 */

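/*
 * Illustrative sketch (not part of the platform interface): locating the
 * mem_stripe_t for mnode m within mpo_mblock[i], and the stripe/stride/gap
 * relation described above.  The MPO_EXAMPLE guard and function names are
 * hypothetical and exist only for illustration.
 */
#ifdef MPO_EXAMPLE
static mem_stripe_t *
example_stripe_lookup(int i, int m)
{
        /* One mem_stripe_t per mnode per mblock */
        return (&mem_stripes[m + i * max_locality_groups]);
}

static pfn_t
example_gap_pages(void)
{
        /* gap = stride - stripe width, in pfn units */
        return (mnode_stride - mnode_pages);
}
#endif  /* MPO_EXAMPLE */
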
void
plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
{
        lgrp_handle_t lgrphand, lgrp_start;
        int i, mnode, elem;
        uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
        uint64_t stripe, frag, remove;
        mem_stripe_t *ms;

        /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
        max_mem_nodes = max_locality_groups;

        /* Check for non-MPO sun4v platforms */
        if (n_locality_groups <= 1) {
                ASSERT(n_locality_groups == 1);
                ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
                mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
                for (elem = 0; elem < nelems; list++, elem++) {
                        base = list->addr;
                        len = list->size;

                        mpo_mem_node_add_slice(btop(base),
                            btop(base + len - 1));
                }
                mem_node_pfn_shift = 0;
                mem_node_physalign = 0;

                if (n_mblocks == 1) {
                        n_mem_stripes = 0;
                } else {
                        n_mem_stripes = n_mblocks;
                        bzero(mem_stripes, mstripesz);
                        for (i = 0; i < n_mblocks; i++) {
                                base = mpo_mblock[i].base;
                                end = base + mpo_mblock[i].size;
                                ASSERT(end > base);
                                mem_stripes[i].exists = 1;
                                mpo_mblock[i].base_pfn = btop(base);
                                mpo_mblock[i].end_pfn = btop(end - 1);
                                mem_stripes[i].physbase =
                                    mpo_mblock[i].base_pfn;
                                mem_stripes[i].physmax = mpo_mblock[i].end_pfn;
                        }
                }
                return;
        }

        bzero(mem_stripes, mstripesz);
        stripe = ptob(mnode_pages);
        stride = max_locality_groups * stripe;

        /* Save commonly used values in globals */
        mnode_stride = btop(stride);
        n_mem_stripes = max_locality_groups * n_mblocks;
        stripe_shift = highbit(max_locality_groups) - 1;

        for (i = 0; i < n_mblocks; i++) {
                base = mpo_mblock[i].base;
                end = mpo_mblock[i].base + mpo_mblock[i].size;
                ra_to_pa = mpo_mblock[i].ra_to_pa;
                mpo_mblock[i].base_pfn = btop(base);
                mpo_mblock[i].end_pfn = btop(end - 1);

                /* Find the offset from the prev stripe boundary in PA space. */
                offset = (base + ra_to_pa) & (stripe - 1);

                /* Set the next stripe boundary. */
                stripe_end = base - offset + stripe;

                lgrp_start = (((base + ra_to_pa) & home_mask) >>
                    home_mask_shift);
                lgrphand = lgrp_start;

                /*
                 * Loop over all lgroups covered by the mblock, creating a
                 * stripe for each.  Stop when lgrp_start is visited again.
                 */
                do {
                        /* mblock may not span all lgroups */
                        if (base >= end)
                                break;

                        mnode = lgrphand;
                        ASSERT(mnode < max_mem_nodes);

                        /*
                         * Calculate the size of the fragment that does not
                         * belong to the mnode in the last partial stride.
                         */
                        frag = (end - (base - offset)) & (stride - 1);
                        if (frag == 0) {
                                /* remove the gap */
                                remove = stride - stripe;
                        } else if (frag < stripe) {
                                /* fragment fits in stripe; keep it all */
                                remove = 0;
                        } else {
                                /* fragment is large; trim after whole stripe */
                                remove = frag - stripe;
                        }

                        ms = &mem_stripes[i * max_locality_groups + mnode];
                        ms->physbase = btop(base);
                        ms->physmax = btop(end - 1 - remove);
                        ms->offset = btop(offset);
                        ms->exists = 1;

                        /*
                         * If we have only 1 lgroup and multiple mblocks,
                         * then we have already established our lgrp handle
                         * to mem_node and mem_node_config values above.
                         */
                        if (n_locality_groups > 1) {
                                mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
                                    mnode);
                                mpo_mem_node_add_slice(ms->physbase,
                                    ms->physmax);
                        }
                        base = stripe_end;
                        stripe_end += stripe;
                        offset = 0;
                        lgrphand = (((base + ra_to_pa) & home_mask) >>
                            home_mask_shift);
                } while (lgrphand != lgrp_start);
        }

        /*
         * Indicate to vm_pagelist that the hpm_counters array
         * should be shared because the ranges overlap.
         */
        if (max_mem_nodes > 1) {
                interleaved_mnodes = 1;
        }
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
        if (n_locality_groups > 1) {
                return ((lgrp_handle_t)mpo_cpu[(int)id].home);
        } else {
                return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
        }
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
        /*
         * Return min remote latency when there are more than two lgroups
         * (root and child) and getting latency between two different lgroups
         * or root is involved.
         */
        if (lgrp_optimizations() && (from != to ||
            from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
                return ((int)higher_latency);
        } else {
                return ((int)lower_latency);
        }
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
        int i, mnode;
        pfn_t ra_to_pa_pfn;
        struct mblock_md *mb;

        if (n_locality_groups <= 1)
                return (0);

        /*
         * The mnode is defined to be 1:1 with the lgroup handle, which
         * is taken from the home bits.  Find the mblock in which
         * the pfn falls to get the ra_to_pa adjustment, and extract
         * the home bits.
         */
        mb = &mpo_mblock[0];
        for (i = 0; i < n_mblocks; i++) {
                if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
                        ra_to_pa_pfn = btop(mb->ra_to_pa);
                        mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
                            home_mask_pfn_shift);
                        ASSERT(mnode < max_mem_nodes);
                        return (mnode);
                }
                mb++;
        }

        panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
        return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
        int i;
        pfn_t ra_to_pa_pfn;
        struct mblock_md *mb;

        ASSERT(n_mblocks > 0);
        if (n_mblocks == 1)
                return (pfn + base_ra_to_pa_pfn);

        /*
         * Find the mblock in which the pfn falls
         * in order to get the ra_to_pa adjustment.
         */
        for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
                if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
                        ra_to_pa_pfn = btop(mb->ra_to_pa);
                        return (pfn + ra_to_pa_pfn);
                }
        }

        panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
        return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie "it" to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	szc: desired page size.
 *	init:
 *	    if 1, start a new traversal, initialize "it", find first
 *	    mblock containing pfn, and return its starting pfn
 *	    within the mnode.
 *	    if 0, continue the previous traversal using passed-in data
 *	    from "it", advance to the next mblock, and return its
 *	    starting pfn within the mnode.
 *	it: returns readonly data to the caller; see below.
 *
 *	The input pfn must be aligned for the page size szc.
 *
 *	Returns: starting pfn for the iteration for the mnode/mblock,
 *	    which is aligned according to the page size,
 *	    or returns (pfn_t)(-1) if the input pfn lies past the last
 *	    valid pfn of the mnode.
 *	Returns misc values in the "it" struct that allows the caller
 *	    to advance the pfn within an mblock using address arithmetic;
 *	    see definition of mem_node_iterator_t in vm_dep.h.
 *	    When the caller calculates a pfn that is greater than the
 *	    returned value it->mi_mblock_end, the caller should again
 *	    call plat_mem_node_iterator_init, passing init=0.
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
    mem_node_iterator_t *it, int init)
{
        int i;
        pgcnt_t szcpgcnt = PNUM_SIZE(szc);
        struct mblock_md *mblock;
        pfn_t base, end;
        mem_stripe_t *ms;
        uint64_t szcpagesize;

        ASSERT(it != NULL);
        ASSERT(mnode >= 0 && mnode < max_mem_nodes);
        ASSERT(n_mblocks > 0);
        ASSERT(P2PHASE(pfn, szcpgcnt) == 0);

        if (init) {
                it->mi_last_mblock = 0;
                it->mi_init = 1;
        }

        /* Check if mpo is not enabled and we only have one mblock */
        if (n_locality_groups == 1 && n_mblocks == 1) {
                if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt))
                        return ((pfn_t)-1);
                it->mi_mnode = mnode;
                it->mi_ra_to_pa = base_ra_to_pa_pfn;
                it->mi_mnode_pfn_mask = 0;
                it->mi_mnode_pfn_shift = 0;
                it->mi_mnode_mask = 0;
                it->mi_mblock_base = mem_node_config[mnode].physbase;
                it->mi_mblock_end = mem_node_config[mnode].physmax;
                if (pfn < it->mi_mblock_base)
                        pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
                if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
                        pfn = (pfn_t)-1;
                return (pfn);
        }

        /* init=1 means begin iterator, init=0 means continue */
        if (init == 1) {
                i = 0;
        } else {
                ASSERT(it->mi_last_mblock < n_mblocks);
                i = it->mi_last_mblock;
                ASSERT(pfn >
                    mem_stripes[i * max_locality_groups + mnode].physmax);
                if (++i == n_mblocks)
                        return ((pfn_t)-1);
        }

        /*
         * Find the mblock that contains pfn for mnode's stripe, or the first
         * such mblock after pfn; otherwise pfn is out of bounds and we will
         * return -1.  mblocks and stripes are sorted in ascending address
         * order.
         */
        szcpagesize = szcpgcnt << PAGESHIFT;
        for (; i < n_mblocks; i++) {
                if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
                        continue;
                ms = &mem_stripes[i * max_locality_groups + mnode];
                if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
                    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
                    ms->physmax)
                        break;
        }
        if (i == n_mblocks) {
                it->mi_last_mblock = i - 1;
                return ((pfn_t)-1);
        }

        it->mi_last_mblock = i;

        mblock = &mpo_mblock[i];
        base = ms->physbase;
        end = ms->physmax;

        it->mi_mnode = mnode;
        it->mi_ra_to_pa = btop(mblock->ra_to_pa);
        it->mi_mblock_base = base;
        it->mi_mblock_end = end;
        it->mi_mnode_pfn_mask = home_mask_pfn;  /* is 0 for non-MPO case */
        it->mi_mnode_pfn_shift = home_mask_pfn_shift;
        it->mi_mnode_mask = max_locality_groups - 1;
        if (pfn < base) {
                pfn = P2ROUNDUP(base, szcpgcnt);
                ASSERT(pfn + szcpgcnt - 1 <= end);
        }
        ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
        return (pfn);
}

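/*
 * Illustrative sketch (example only): how a caller is expected to drive the
 * iterator above.  The pfn is advanced by the caller; once it passes
 * it.mi_mblock_end, the iterator is called again with init=0 to move to the
 * next mblock, until it returns (pfn_t)-1.  The MPO_EXAMPLE guard and
 * function name are hypothetical.
 */
#ifdef MPO_EXAMPLE
static void
example_walk_mnode(int mnode, uchar_t szc)
{
        mem_node_iterator_t it;
        pgcnt_t szcpgcnt = PNUM_SIZE(szc);
        pfn_t pfn;

        pfn = plat_mem_node_iterator_init(0, mnode, szc, &it, 1);
        while (pfn != (pfn_t)-1) {
                /* ... examine the page(s) at [pfn, pfn + szcpgcnt - 1] ... */
                pfn += szcpgcnt;
                if (pfn > it.mi_mblock_end)
                        pfn = plat_mem_node_iterator_init(pfn, mnode, szc,
                            &it, 0);
        }
}
#endif  /* MPO_EXAMPLE */
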
/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
        pfn_t offset, len, hole, base, end, test_end, frag;
        pfn_t nearest;
        mem_stripe_t *ms;
        int i, npages;

        *npages_out = 0;

        if (!mem_node_config[mnode].exists || test_len == 0)
                return;

        base = mem_node_config[mnode].physbase;
        end = mem_node_config[mnode].physmax;

        test_end = test_base + test_len - 1;
        if (end < test_base || base > test_end)
                return;

        if (n_locality_groups == 1) {
                *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
                return;
        }

        hole = mnode_stride - mnode_pages;
        npages = 0;

        /*
         * Iterate over all the stripes for this mnode (one per mblock),
         * find the intersection with each, and accumulate the intersections.
         *
         * Determining the intersection with a stripe is tricky.  If base or
         * end fall outside the mem_node bounds, round them to physbase/physmax
         * of mem_node.  If base or end fall in a gap, round them to start of
         * nearest stripe.  If they fall within a stripe, keep base or end,
         * but calculate the fragment size that should be excluded from the
         * stripe.  Calculate how many strides fall in the adjusted range,
         * multiply by stripe width, and add the start and end fragments.
         */

        for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
                ms = &mem_stripes[i];
                if (ms->exists &&
                    test_base <= (end = ms->physmax) &&
                    test_end >= (base = ms->physbase)) {

                        offset = ms->offset;

                        if (test_base > base) {
                                /* Round test_base to next multiple of stride */
                                len = P2ROUNDUP(test_base - (base - offset),
                                    mnode_stride);
                                nearest = base - offset + len;
                                /*
                                 * Compute distance from test_base to the
                                 * stride boundary to see if test_base falls
                                 * in the stripe or in the hole.
                                 */
                                if (nearest - test_base > hole) {
                                        /*
                                         * test_base lies in stripe,
                                         * and offset should be excluded.
                                         */
                                        offset = test_base -
                                            (nearest - mnode_stride);
                                        base = test_base;
                                } else {
                                        /* round up to next stripe start */
                                        offset = 0;
                                        base = nearest;
                                        if (base > end)
                                                continue;
                                }

                        }

                        if (test_end < end)
                                end = test_end;
                        end++;          /* adjust to an exclusive bound */

                        /* Round end to next multiple of stride */
                        len = P2ROUNDUP(end - (base - offset), mnode_stride);
                        nearest = (base - offset) + len;
                        if (nearest - end <= hole) {
                                /* end falls in hole, use entire last stripe */
                                frag = 0;
                        } else {
                                /* end falls in stripe, compute fragment */
                                frag = nearest - hole - end;
                        }

                        len = (len >> stripe_shift) - offset - frag;
                        npages += len;
                }
        }

        *npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
        int i, max_szc;
        uint64_t last_page_base, szc_mask;
        uint64_t max_page_len, max_coalesce_len;
        struct mblock_md *mb = mpo_mblock;

        /*
         * Find the smaller of the largest page possible and supported.
         * mmu_exported_pagesize_mask is not yet initialized, so read
         * it from the MD.  Apply minimal fixups in case of broken MDs
         * to get a sane mask.
         */

        if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
                szc_mask = 0;
        szc_mask |= (1 << TTE4M);       /* largest in sun4v default support */
        max_szc = highbit(szc_mask) - 1;
        if (max_szc > TTE256M)
                max_szc = TTE256M;
        max_page_len = TTEBYTES(max_szc);

        /*
         * Page coalescing code coalesces all sizes up to 256M on sun4v, even
         * if mmu-page-size-list does not contain it, so 256M pages must fall
         * within one mnode to use MPO.
         */
        max_coalesce_len = TTEBYTES(TTE256M);
        ASSERT(max_coalesce_len >= max_page_len);

        if (ptob(mnode_pages) < max_coalesce_len) {
                MPO_STATUS("Page too large; MPO disabled: page = %lx, "
                    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
                return (0);
        }

        for (i = 0; i < n_mblocks; i++) {
                uint64_t base = mb->base;
                uint64_t end = mb->base + mb->size - 1;
                uint64_t ra_to_pa = mb->ra_to_pa;

                /*
                 * If mblock is smaller than the max page size, then
                 * RA = PA mod MAXPAGE is not guaranteed, but it must
                 * not span mnodes.
                 */
                if (mb->size < max_page_len) {
                        if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
                                MPO_STATUS("Small mblock spans mnodes; "
                                    "MPO disabled: base = %lx, end = %lx, "
                                    "ra2pa = %lx\n", base, end, ra_to_pa);
                                return (0);
                        }
                } else {
                        /* Verify RA = PA mod MAXPAGE, using coalesce size */
                        uint64_t pa_base = base + ra_to_pa;
                        if ((base & (max_coalesce_len - 1)) !=
                            (pa_base & (max_coalesce_len - 1))) {
                                MPO_STATUS("bad page alignment; MPO disabled: "
                                    "ra = %lx, pa = %lx, pagelen = %lx\n",
                                    base, pa_base, max_coalesce_len);
                                return (0);
                        }
                }

                /*
                 * Find start of last large page in mblock in RA space.
                 * If page extends into the next mblock, verify the
                 * mnode does not change.
                 */
                last_page_base = P2ALIGN(end, max_coalesce_len);
                if (i + 1 < n_mblocks &&
                    last_page_base + max_coalesce_len > mb[1].base &&
                    MNODE(last_page_base + ra_to_pa) !=
                    MNODE(mb[1].base + mb[1].ra_to_pa)) {
                        MPO_STATUS("Large page spans mblocks; MPO disabled: "
                            "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
                            "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
                            mb[1].ra_to_pa, max_coalesce_len);
                        return (0);
                }

                mb++;
        }
        return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
        int i, j;
        uint64_t mask = 0;

        j = 0;
        for (i = 0; i < n_lgrpnodes; i++) {
                if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
                        /* remove this lgroup */
                        mask = mpo_lgroup[i].addr_mask;
                } else {
                        mpo_lgroup[j++] = mpo_lgroup[i];
                }
        }
        n_lgrpnodes = j;

        if (mask != 0)
                MPO_STATUS("sub-page interleave %lx found; "
                    "removing lgroup.\n", mask);

        return (mask != 0);
}