/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).
To translate, the mblock 53 * includes the property address-congruence-offset, hereafter referred to as 54 * ra_to_pa. A real address ra is a member of an lgroup if 55 * 56 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match 57 * 58 * The MD is traversed, and information on all mblocks is kept in the array 59 * mpo_mblock[]. Information on all CPUs, including which lgroup they map 60 * to, is kept in the array mpo_cpu[]. 61 * 62 * This implementation makes (and verifies) the simplifying assumption that 63 * the mask bits are the same for all defined lgroups, and that all 1 bits in 64 * the mask are contiguous. Thus the number of lgroups is bounded by the 65 * number of possible mask values, and the lgrp_handle_t is defined as the 66 * mask value, shifted right to eliminate the 0 bit positions in mask. The 67 * masks and values are also referred to as "home bits" in the code. 68 * 69 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup 70 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock 71 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the 72 * home bits. This yields the mem_node. 73 * 74 * Interfaces 75 * ---------- 76 * 77 * This file exports the following entry points: 78 * 79 * plat_lgrp_init() 80 * plat_build_mem_nodes() 81 * plat_lgrp_cpu_to_hand() 82 * plat_lgrp_latency() 83 * plat_pfn_to_mem_node() 84 * These implement the usual platform lgroup interfaces. 85 * 86 * plat_rapfn_to_papfn() 87 * Recover the PA page coloring bits from an RA. 88 * 89 * plat_mem_node_iterator_init() 90 * Initialize an iterator to efficiently step through pages in a mem_node. 91 * 92 * plat_mem_node_intersect_range() 93 * Find the intersection with a mem_node. 
 */

int sun4v_mpo_enable = 1;		/* set to 0 to disable MPO entirely */
int sun4v_mpo_debug = 0;		/* enables MPO_DEBUG printf tracing */
char sun4v_mpo_status[256] = "";	/* last status message, viewable from mdb */

/* Save CPU info from the MD and associate CPUs with lgroups */
static struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static int n_lgrpnodes = 0;
static int n_locality_groups = 0;
static int max_locality_groups = 0;

/* Save mblocks from the MD */
#define	SMALL_MBLOCKS_COUNT	8
static struct mblock_md *mpo_mblock;
static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static int n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t *mem_stripes;
static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static int mstripesz = 0;
static int n_mem_stripes = 0;
static pfn_t mnode_stride;	/* distance between stripes, start to start */
static int stripe_shift;	/* stride/stripes expressed as a shift */
static pfn_t mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static uint64_t home_mask = 0;
static pfn_t home_mask_pfn = 0;
static int home_mask_shift = 0;
static uint_t home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static int lower_latency = 0;
static int higher_latency = 0;

static pfn_t base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static int valid_pages(md_t *md, mde_cookie_t cpu0);
static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static int fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) { \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status); \
}

/*
 * Routine to read a uint64_t from a given md
 *
 * Thin wrapper over md_get_prop_val(); returns its error code (0 on
 * success, non-zero on failure), leaving *val untouched on failure.
 *
 * NOTE(review): every caller passes an md_t * handle here, so this
 * by-value "md_t md" parameter looks like a transcription artifact of
 * the original "md_t md[]" declaration — confirm against the original
 * source.
 */
static int64_t
get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

/*
 * qsort() comparator: orders struct mblock_md entries by ascending
 * base real address.
 */
static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

/*
 * Sort the mblock array in place by ascending base address; required by
 * plat_mem_node_iterator_init(), which assumes ascending order.
 */
static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 * Recompute the lgroup load-balancing tunables from the current per-lgroup
 * CPU counts.  Called at init time and whenever a CPU is DR'd in or out.
 */
static void
mpo_update_tunables(void)
{
	int i, ncpu_min;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
}

/*
 * Scan the MD's cpu nodes for the one whose PROP_LG_CPU_ID property
 * matches cpuid.  Returns the node's cookie, or MDE_INVAL_ELEM_COOKIE
 * if the MD is unusable or no match is found.
 */
static mde_cookie_t
cpuid_to_cpunode(md_t *md, int cpuid)
{
	mde_cookie_t rootnode, foundnode, *cpunodes;
	uint64_t cpuid_prop;
	int n_cpunodes, i;

	if (md == NULL)
		return (MDE_INVAL_ELEM_COOKIE);

	rootnode = md_root_node(md);
	if (rootnode == MDE_INVAL_ELEM_COOKIE)
		return (MDE_INVAL_ELEM_COOKIE);

	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
	    "fwd", &cpunodes);
	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
		goto cpuid_fail;

	for (i = 0; i < n_cpunodes; i++) {
		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
		    &cpuid_prop))
			break;
		if (cpuid_prop == (uint64_t)cpuid) {
			foundnode = cpunodes[i];
			/* free the scan list before handing back the match */
			md_free_scan_dag(md, &cpunodes);
			return (foundnode);
		}
	}
cpuid_fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	return (MDE_INVAL_ELEM_COOKIE);
}

/*
 * Map a cpu MD node to an index into mpo_lgroup[]: of the lgroup nodes
 * reachable from cpunode, pick the one with the smallest latency, then
 * find the mpo_lgroup[] entry with that (latency, addr_match) pair.
 * Returns the index, or -1 on any failure.
 *
 * NOTE(review): if md_alloc_scan_dag() returns 0 reachable lgroups, the
 * first loop never runs and lowest_address_match is read uninitialized
 * by the second loop.  In practice every cpu node references at least
 * one lgroup, but confirm — an explicit n_lgroups <= 0 bail-out would
 * be safer.
 */
static int
mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
{
	mde_cookie_t *nodes;
	uint64_t latency, lowest_latency;
	uint64_t address_match, lowest_address_match;
	int n_lgroups, j, result = 0;

	/* Find lgroup nodes reachable from this cpu */
	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
	    "fwd", &nodes);

	lowest_latency = ~(0UL);

	/* Find the lgroup node with the smallest latency */
	for (j = 0; j < n_lgroups; j++) {
		result = get_int(md, nodes[j], PROP_LG_LATENCY,
		    &latency);
		result |= get_int(md, nodes[j], PROP_LG_MATCH,
		    &address_match);
		if (result != 0) {
			j = -1;
			goto to_lgrp_done;
		}
		if (latency < lowest_latency) {
			lowest_latency = latency;
			lowest_address_match = address_match;
		}
	}
	/* Translate (latency, match) back to an mpo_lgroup[] index */
	for (j = 0; j < n_lgrpnodes; j++) {
		if ((mpo_lgroup[j].latency == lowest_latency) &&
		    (mpo_lgroup[j].addr_match == lowest_address_match))
			break;
	}
	if (j == n_lgrpnodes)
		j = -1;

to_lgrp_done:
	if (n_lgroups > 0)
		md_free_scan_dag(md, &nodes);
	return (j);
}

/* Called when DR'ing in a CPU */
void
mpo_cpu_add(int cpuid)
{
	md_t *md;
	mde_cookie_t cpunode;

	int i;

	/* Nothing to do if MPO was never initialized */
	if (n_lgrpnodes <= 0)
		return;

	md = md_get_handle();

	if (md == NULL)
		goto add_fail;

	cpunode = cpuid_to_cpunode(md, cpuid);
	if (cpunode == MDE_INVAL_ELEM_COOKIE)
		goto add_fail;

	i = mpo_cpu_to_lgroup(md, cpunode);
	if (i == -1)
		goto add_fail;

	/* Record the new CPU's home lgroup and refresh the tunables */
	mpo_cpu[cpuid].lgrp_index = i;
	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
	mpo_lgroup[i].ncpu++;
	mpo_update_tunables();
	(void) md_fini_handle(md);
	return;
add_fail:
	panic("mpo_cpu_add: Cannot read MD");
}

/* Called when DR'ing out a CPU */
void
mpo_cpu_remove(int cpuid)
{
	int i;

	/* Nothing to do if MPO was never initialized */
	if (n_lgrpnodes <= 0)
		return;

	i = mpo_cpu[cpuid].lgrp_index;
	mpo_lgroup[i].ncpu--;
	mpo_cpu[cpuid].home = 0;
	mpo_cpu[cpuid].lgrp_index = -1;
	mpo_update_tunables();
}

/*
 *
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;
	int mblocksz = 0;
	size_t allocsz;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}
	/*
	 * If we have a small number of mblocks we will use the space
	 * that we preallocated.  Otherwise, we will dynamically
	 * allocate the space
	 */
	mblocksz = n_mblocks * sizeof (struct mblock_md);
	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);

	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
		mpo_mblock = &small_mpo_mblocks[0];
		mem_stripes = &small_mem_stripes[0];
	} else {
		/*
		 * Large configuration: carve both arrays out of the
		 * reserved MPOBUF region via the PROM allocator.
		 */
		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
		/* Ensure that we dont request more space than reserved */
		if (allocsz > MPOBUF_SIZE) {
			MPO_STATUS("lgrp_traverse: Insufficient space "
			    "for mblock structures \n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_mblock = (struct mblock_md *)
		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
			MPO_STATUS("lgrp_traverse: Cannot allocate space "
			    "for mblocks \n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
		mpo_heap32_bufsz = MPOBUF_SIZE;

		/* mem_stripes immediately follows the mblock array */
		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
	}
	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10. In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	/* convert pico-seconds to nano-sec/10 (divide by 10^4) */
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].lgrp_index = -1;
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */
		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
		if (o == -1) {
			ret_val = -1;
			goto fail;
		}
		mpo_cpu[k].lgrp_index = o;
		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
		mpo_lgroup[o].ncpu++;
	}
	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int rc;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		/* panic() does not return, so rc is never read undefined */
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;

	mpo_update_tunables();
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.
 * The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	  mblock 0	   mblock 1
 */

void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		/* boot memory list alternates (base, len) pairs */
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		/*
		 * With more than one mblock we fall through so the per-mblock
		 * base_pfn/end_pfn fields below still get initialized.
		 */
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, mstripesz);
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].mnode_mask = (mnodeset_t)0;
		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);
			mpo_mblock[i].mnode_mask |= (mnodeset_t)1 << mnode;

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			/* Advance to this mnode's next stripe in the mblock */
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
	}
}

/*
 * Return the latency between two lgroups: the remote (higher) latency
 * unless the request is local leaf-to-same-leaf with optimizations off.
 */
int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
1062 */ 1063 pfn_t 1064 plat_rapfn_to_papfn(pfn_t pfn) 1065 { 1066 int i; 1067 pfn_t ra_to_pa_pfn; 1068 struct mblock_md *mb; 1069 1070 ASSERT(n_mblocks > 0); 1071 if (n_mblocks == 1) 1072 return (pfn + base_ra_to_pa_pfn); 1073 1074 /* 1075 * Find the mblock in which the pfn falls 1076 * in order to get the ra_to_pa adjustment. 1077 */ 1078 for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) { 1079 if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) { 1080 ra_to_pa_pfn = btop(mb->ra_to_pa); 1081 return (pfn + ra_to_pa_pfn); 1082 } 1083 } 1084 1085 panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn); 1086 return (pfn); 1087 } 1088 1089 /* 1090 * plat_mem_node_iterator_init() 1091 * Initialize cookie to iterate over pfn's in an mnode. There is 1092 * no additional iterator function. The caller uses the info from 1093 * the iterator structure directly. 1094 * 1095 * pfn: starting pfn. 1096 * mnode: desired mnode. 1097 * init: set to 1 for full init, 0 for continuation 1098 * 1099 * Returns the appropriate starting pfn for the iteration 1100 * the same as the input pfn if it falls in an mblock. 1101 * Returns the (pfn_t)-1 value if the input pfn lies past 1102 * the last valid mnode pfn. 
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	/* A full init restarts the mblock scan from the first mblock. */
	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		/* Clamp the starting pfn into the mnode's bounds. */
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 * On a continuation (init == 0), resume the scan just past the
	 * mblock used by the previous call.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	/* Skip mblocks that contribute no pages to this mnode. */
	for (; i < n_mblocks; i++) {
		if ((mpo_mblock[i].mnode_mask & ((mnodeset_t)1 << mnode)) &&
		    (pfn <= mpo_mblock[i].end_pfn))
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		/*
		 * mem_stripes[] holds max_locality_groups entries per
		 * mblock, so index by mblock then mnode.
		 */
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 *
 * test_base/test_len: the pfn range to intersect (test_len in pages).
 * mnode: the memnode to intersect with.
 * npages_out: set to the number of pages in the intersection.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	/* test_end is the inclusive upper bound of the test range */
	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	/* Single locality group: the mnode is one contiguous extent. */
	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	/* hole is the gap (in pages) between consecutive stripes */
	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */

	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		/* Note: base and end are reassigned to the stripe bounds */
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			/*
			 * ms->offset appears to be the page count from the
			 * stride origin to the stripe start, making
			 * (base - offset) stride-aligned.
			 * NOTE(review): confirm against mem_stripe_t.
			 */
			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			/*
			 * Convert the stride-rounded length to stripe pages
			 * and subtract the leading and trailing fragments.
			 */
			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

/* Extract the home (mnode) bits from physical address pa. */
#define MNODE(pa) \
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

/*
 * md: machine description handle; cpu0: a cpu MD node, used to read the
 * "mmu-page-size-list" property.
 */
static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	/* The mnode slice must hold at least one maximal coalesced page. */
	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
1405 */ 1406 1407 static int 1408 fix_interleave(void) 1409 { 1410 int i, j; 1411 uint64_t mask = 0; 1412 1413 j = 0; 1414 for (i = 0; i < n_lgrpnodes; i++) { 1415 if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) { 1416 /* remove this lgroup */ 1417 mask = mpo_lgroup[i].addr_mask; 1418 } else { 1419 mpo_lgroup[j++] = mpo_lgroup[i]; 1420 } 1421 } 1422 n_lgrpnodes = j; 1423 1424 if (mask != 0) 1425 MPO_STATUS("sub-page interleave %lx found; " 1426 "removing lgroup.\n", mask); 1427 1428 return (mask != 0); 1429 } 1430