/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a
 *	mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */
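/*
 * Worked example with hypothetical values (not taken from any particular
 * platform): suppose every lgroup reports mask = 0x600000000 and one lgroup
 * reports match = 0x200000000.  An RA of 0x1fff00000 in an mblock with
 * ra_to_pa = 0x80000000 translates to PA 0x27ff00000, and
 * (0x27ff00000 & 0x600000000) == 0x200000000, so that RA is a member of the
 * lgroup.  The home bits are the masked PA shifted right by
 * lowbit(mask) - 1 = 33, giving lgrp_handle_t 1, which is also the mem_node.
 */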
int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int n_lgrpnodes = 0;
static	int n_locality_groups = 0;
static	int max_locality_groups = 0;

/* Save mblocks from the MD */
static	struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
static	int n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static	mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
static	int n_mem_stripes = 0;
static	pfn_t mnode_stride;	/* distance between stripes, start to start */
static	int stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t home_mask_pfn = 0;
static	int home_mask_shift = 0;
static	uint_t home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int lower_latency = 0;
static	int higher_latency = 0;

static	pfn_t base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static	int valid_pages(md_t *md, mde_cookie_t cpu0);
static	int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) {						\
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status);					\
}

/*
 * Routine to read a uint64_t from a given md
 */
static	int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 * Traverse the MD to determine:
 *
 *	Number of CPU nodes, lgrp_nodes, and mblocks
 *	Then for each lgrp_node, obtain the appropriate data.
 *	For each CPU, determine its home locality and store it.
 *	For each mblock, retrieve its data and store it.
 */
static int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t n_lgroups = 0;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
	 */
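	/*
	 * Example of the unit conversion (hypothetical value): an MD latency
	 * of 450000 picoseconds divided by 10000 below yields 45 in the
	 * nano-sec/10 units the MPO framework expects, i.e. 450 ns.
	 */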
	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].latency = (uint_t)(-1);
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */

		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
		    "fwd", &nodes);
		if (n_lgroups <= 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
			ret_val = -1;
			goto fail;
		}

		/*
		 * Find the lgroup this cpu belongs to with the lowest latency.
		 * Check all the lgrp nodes connected to this CPU to determine
		 * which has the smallest latency.
		 */

		for (j = 0; j < n_lgroups; j++) {
			for (o = 0; o < n_lgrpnodes; o++) {
				if (nodes[j] == mpo_lgroup[o].node) {
					if (mpo_lgroup[o].latency <
					    mpo_cpu[k].latency) {
						mpo_cpu[k].home =
						    mpo_lgroup[o].addr_match
						    >> home_mask_shift;
						mpo_cpu[k].latency =
						    mpo_lgroup[o].latency;
						mpo_lgroup[o].ncpu++;
					}
				}
			}
		}
		md_free_scan_dag(md, &nodes);
	}

	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */
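	/*
	 * For example, a homeset of 0x5 (homes 0 and 2 present) yields a
	 * count of 2.
	 */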
	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int i, rc, ncpu_min;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */
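	/*
	 * Sketch of the computation below (hypothetical numbers): if the
	 * smallest lgroup has 8 CPUs, lgrp_expand_proc_thresh becomes
	 * 8 * lgrp_loadavg_max_effect / 2, roughly the load of four
	 * continuously running threads (50% of that lgroup), and
	 * lgrp_expand_proc_diff is half of that again.
	 */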
	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *		Thus if offset is non-zero, this mem_node starts in the middle
 *		of a stripe width, and the second full stripe starts at
 *		physbase - offset + stride.  (even though physmax may fall in
 *		the middle of a stripe width, we do not save the ending
 *		fragment size in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	  mblock 0	   mblock 1
 */
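/*
 * Worked example of the indexing above (hypothetical numbers): with a home
 * mask covering 2 bits, max_locality_groups is 4, so the stride is
 * 4 * mnode_pages and stripe_shift is 2.  The stripe describing mnode 2
 * within mpo_mblock[1] lives at mem_stripes[2 + 1 * 4], i.e. mem_stripes[6].
 */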
void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, sizeof (mem_stripes));
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
	}
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration,
 *	the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
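/*
 * Typical calling pattern (a sketch, not taken from any particular caller):
 * the first call passes init = 1; each continuation call passes init = 0
 * with a pfn past the end of the mblock visited previously, as the ASSERT
 * in the function requires.  A return value of (pfn_t)-1 ends the walk.
 */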
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if (pfn <= mpo_mblock[i].end_pfn)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */
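	/*
	 * Worked example with small hypothetical numbers: a stripe width of
	 * 4 pages and 4 locality groups gives mnode_stride = 16, hole = 12,
	 * stripe_shift = 2.  For a stripe with physbase = 160, offset = 0,
	 * physmax = 195 and a test range [162, 177]: test_base rounds to
	 * nearest = 176, which is 14 > hole away, so base becomes 162 with
	 * offset 2; end becomes 178 (exclusive), len = P2ROUNDUP(18, 16) = 32,
	 * and frag = 192 - 12 - 178 = 2.  The contribution is
	 * (32 >> 2) - 2 - 2 = 4 pages: pfn's 162-163 and 176-177.
	 */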
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
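	/*
	 * For instance (hypothetical home mask): a home mask of 0x600000000
	 * gives an mnode slice of 8 GB per stripe, which easily contains a
	 * 256 MB page, so the size check below passes; a slice smaller than
	 * 256 MB would disable MPO here.
	 */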
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}