/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *    (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *    These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *    Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *    Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *    Find the intersection with a mem_node.
 */
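/*
 * Worked example of the mask/match arithmetic above (hypothetical values,
 * for illustration only; not taken from any particular platform): suppose
 * the MD defines four memory-latency-groups whose common mask is
 * 0x600000000 (home bits 33-34) and whose match values are 0x0,
 * 0x200000000, 0x400000000, and 0x600000000.  For an mblock with
 * ra_to_pa = 0x10000000000, the real address 0x1234567000 satisfies
 *
 *    (0x1234567000 + 0x10000000000) & 0x600000000 == 0x200000000
 *
 * so it belongs to the lgroup whose match is 0x200000000, its home bits are
 * 0x200000000 >> 33 == 1, and the page lives in mem_node 1.
 */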
int sun4v_mpo_enable = 1;
int sun4v_mpo_debug = 0;
char sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define MAX_MD_LGROUPS 32
static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static int n_lgrpnodes = 0;
static int n_locality_groups = 0;
static int max_locality_groups = 0;

/* Save mblocks from the MD */
static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
static int n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
static int n_mem_stripes = 0;
static pfn_t mnode_stride;      /* distance between stripes, start to start */
static int stripe_shift;        /* stride/stripes expressed as a shift */
static pfn_t mnode_pages;       /* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static uint64_t home_mask = 0;
static pfn_t home_mask_pfn = 0;
static int home_mask_shift = 0;
static uint_t home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static int lower_latency = 0;
static int higher_latency = 0;

static pfn_t base_ra_to_pa_pfn = 0;     /* ra_to_pa for single mblock memory */

static int valid_pages(md_t *md, mde_cookie_t cpu0);
static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static int fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define MPO_DEBUG(...)
#endif  /* DEBUG */

/* Record status message, viewable from mdb */
#define MPO_STATUS(args...) { \
    (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
    MPO_DEBUG(sun4v_mpo_status); \
}

/*
 * Routine to read a uint64_t from a given md
 */
static int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
    int err = md_get_prop_val(md, node, propname, val);
    return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
    struct mblock_md *m1 = (struct mblock_md *)a;
    struct mblock_md *m2 = (struct mblock_md *)b;

    if (m1->base < m2->base)
        return (-1);
    else if (m1->base == m2->base)
        return (0);
    else
        return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
    extern void qsort(void *, size_t, size_t,
        int (*)(const void *, const void *));

    qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 * Traverse the MD to determine:
 *
 *    Number of CPU nodes, lgrp_nodes, and mblocks
 *    Then for each lgrp_node, obtain the appropriate data.
 *    For each CPU, determine its home locality and store it.
 *    For each mblock, retrieve its data and store it.
 */
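/*
 * Error handling in the traversal: every failure path records a status
 * message with MPO_STATUS(), sets ret_val to -1, and jumps to the common
 * "fail" label, where the MD cookies and scan arrays are released.  The
 * caller, plat_lgrp_init(), treats -1 as "no MPO" and falls back to a
 * single locality group, so a malformed MD degrades memory placement but
 * does not prevent boot.  The one exception is an MD with no usable mblock
 * information at all (n_mblocks == 0 at the fail label), which panics.
 */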
static int
lgrp_traverse(md_t *md)
{
    mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
    uint64_t i, j, k, o, n_nodes;
    uint64_t n_lgroups = 0;
    uint64_t mem_lg_homeset = 0;
    int ret_val = 0;
    int result = 0;
    int n_cpunodes = 0;
    int sub_page_fix;

    n_nodes = md_node_count(md);

    if (n_nodes <= 0) {
        MPO_STATUS("lgrp_traverse: No nodes in node count\n");
        ret_val = -1;
        goto fail;
    }

    root = md_root_node(md);

    if (root == MDE_INVAL_ELEM_COOKIE) {
        MPO_STATUS("lgrp_traverse: Root node is missing\n");
        ret_val = -1;
        goto fail;
    }

    /*
     * Build the Memory Nodes.  Do this before any possibility of
     * bailing from this routine so we obtain ra_to_pa (needed for page
     * coloring) even when there are no lgroups defined.
     */

    n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
        "fwd", &mblocknodes);

    if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
        MPO_STATUS("lgrp_traverse: No mblock "
            "nodes detected in Machine Descriptor\n");
        n_mblocks = 0;
        ret_val = -1;
        goto fail;
    }

    for (i = 0; i < n_mblocks; i++) {
        mpo_mblock[i].node = mblocknodes[i];

        /* Without a base or size value we will fail */
        result = get_int(md, mblocknodes[i], PROP_LG_BASE,
            &mpo_mblock[i].base);
        if (result < 0) {
            MPO_STATUS("lgrp_traverse: "
                "PROP_LG_BASE is missing\n");
            n_mblocks = 0;
            ret_val = -1;
            goto fail;
        }

        result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
            &mpo_mblock[i].size);
        if (result < 0) {
            MPO_STATUS("lgrp_traverse: "
                "PROP_LG_SIZE is missing\n");
            n_mblocks = 0;
            ret_val = -1;
            goto fail;
        }

        result = get_int(md, mblocknodes[i],
            PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

        /* If we don't have an ra_pa_offset, just set it to 0 */
        if (result < 0)
            mpo_mblock[i].ra_to_pa = 0;

        MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
            "ra_to_pa = %lx\n", i,
            mpo_mblock[i].base,
            mpo_mblock[i].size,
            mpo_mblock[i].ra_to_pa);
    }

    /* Must sort mblocks by address for mem_node_iterator_init() */
    mblock_sort(mpo_mblock, n_mblocks);

    base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

    /* Page coloring hook is required so we can iterate through mnodes */
    if (&page_next_pfn_for_color_cpu == NULL) {
        MPO_STATUS("lgrp_traverse: No page coloring support\n");
        ret_val = -1;
        goto fail;
    }

    /* Global enable for mpo */
    if (sun4v_mpo_enable == 0) {
        MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
        ret_val = -1;
        goto fail;
    }

    n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
        "fwd", &lgrpnodes);

    if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
        MPO_STATUS("lgrp_traverse: No Lgroups\n");
        ret_val = -1;
        goto fail;
    }

    n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

    if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
        MPO_STATUS("lgrp_traverse: No CPU nodes detected "
            "in MD\n");
        ret_val = -1;
        goto fail;
    }

    MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
    MPO_DEBUG("lgrp_traverse: md: %p\n", md);
    MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
    MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
    MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
    MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
    for (i = 0; i < n_lgrpnodes; i++) {
        mpo_lgroup[i].node = lgrpnodes[i];
        mpo_lgroup[i].id = i;
        mpo_lgroup[i].ncpu = 0;
        result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
            &mpo_lgroup[i].addr_mask);
        result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
            &mpo_lgroup[i].addr_match);

        /*
         * If either the mask or match properties are missing, set to 0
         */
        if (result < 0) {
            mpo_lgroup[i].addr_mask = 0;
            mpo_lgroup[i].addr_match = 0;
        }

        /* Set latency to 0 if property not present */

        result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
            &mpo_lgroup[i].latency);
        if (result < 0)
            mpo_lgroup[i].latency = 0;
    }

    /*
     * Sub-page level interleave is not yet supported.  Check for it,
     * and remove sub-page interleaved lgroups from mpo_lgroup and
     * n_lgrpnodes.  If no lgroups are left, return.
     */

    sub_page_fix = fix_interleave();
    if (n_lgrpnodes == 0) {
        ret_val = -1;
        goto fail;
    }

    /* Ensure that all of the addr_mask values are the same */

    for (i = 0; i < n_lgrpnodes; i++) {
        if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
            MPO_STATUS("lgrp_traverse: "
                "addr_mask values are not the same\n");
            ret_val = -1;
            goto fail;
        }
    }

    /*
     * Ensure that all lgrp nodes see all the mblocks.  However, if
     * sub-page interleave is being fixed, they do not, so skip
     * the check.
     */

    if (sub_page_fix == 0) {
        for (i = 0; i < n_lgrpnodes; i++) {
            j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
                PROP_LG_MBLOCK, "fwd", &nodes);
            md_free_scan_dag(md, &nodes);
            if (j != n_mblocks) {
                MPO_STATUS("lgrp_traverse: "
                    "sub-page interleave is being fixed\n");
                ret_val = -1;
                goto fail;
            }
        }
    }

    /*
     * Use the address mask from the first lgroup node
     * to establish our home_mask.
     */
    home_mask = mpo_lgroup[0].addr_mask;
    home_mask_pfn = btop(home_mask);
    home_mask_shift = lowbit(home_mask) - 1;
    home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
    mnode_pages = btop(1ULL << home_mask_shift);

    /*
     * How many values are possible in home mask?  Assume the mask
     * bits are contiguous.
     */
    max_locality_groups =
        1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

    /* Now verify the home mask bits are contiguous */

    if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
        MPO_STATUS("lgrp_traverse: "
            "home mask bits are not contiguous\n");
        ret_val = -1;
        goto fail;
    }

    /* Record all of the home bits */

    for (i = 0; i < n_lgrpnodes; i++) {
        HOMESET_ADD(mem_lg_homeset,
            mpo_lgroup[i].addr_match >> home_mask_shift);
    }

    /* Count the number of different "home" mem_lg's we've discovered */

    n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

    /* If we have only 1 locality group then we can exit */
    if (n_locality_groups == 1) {
        MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
        ret_val = -1;
        goto fail;
    }
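    /*
     * Worked example of the home mask arithmetic above (hypothetical
     * values, not from any particular machine): with home_mask ==
     * 0x600000000 and the sun4v 8K base page (PAGESHIFT == 13),
     *
     *    home_mask_shift     = lowbit(0x600000000) - 1        = 33
     *    home_mask_pfn       = btop(0x600000000)              = 0x300000
     *    home_mask_pfn_shift = 33 - 13                        = 20
     *    mnode_pages         = btop(1ULL << 33)               = 0x100000 (8 GB)
     *    max_locality_groups = 1 << highbit(0x300000 >> 20)   = 1 << 2 = 4
     *
     * and the contiguity check passes because 4 - 1 == 0x300000 >> 20 == 3.
     */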
    /*
     * Set the latencies.  A CPU's lgroup is defined by the lowest
     * latency found.  All other memory is considered remote, and the
     * remote latency is represented by the highest latency found.
     * Thus hierarchical lgroups, if any, are approximated by a
     * two level scheme.
     *
     * The Solaris MPO framework by convention wants to see latencies
     * in units of nano-sec/10.  In the MD, the units are defined to be
     * pico-seconds.
     */

    lower_latency = mpo_lgroup[0].latency;
    higher_latency = mpo_lgroup[0].latency;

    for (i = 1; i < n_lgrpnodes; i++) {
        if (mpo_lgroup[i].latency < lower_latency) {
            lower_latency = mpo_lgroup[i].latency;
        }
        if (mpo_lgroup[i].latency > higher_latency) {
            higher_latency = mpo_lgroup[i].latency;
        }
    }
    lower_latency /= 10000;
    higher_latency /= 10000;

    /* Clear our CPU data */

    for (i = 0; i < NCPU; i++) {
        mpo_cpu[i].home = 0;
        mpo_cpu[i].latency = (uint_t)(-1);
    }

    /* Build the CPU nodes */
    for (i = 0; i < n_cpunodes; i++) {

        /* Read in the lgroup nodes */

        result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
        if (result < 0) {
            MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
            ret_val = -1;
            goto fail;
        }

        n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
            "fwd", &nodes);
        if (n_lgroups <= 0) {
            MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
            ret_val = -1;
            goto fail;
        }

        /*
         * Find the lgroup this cpu belongs to with the lowest latency.
         * Check all the lgrp nodes connected to this CPU to determine
         * which has the smallest latency.
         */

        for (j = 0; j < n_lgroups; j++) {
            for (o = 0; o < n_lgrpnodes; o++) {
                if (nodes[j] == mpo_lgroup[o].node) {
                    if (mpo_lgroup[o].latency <
                        mpo_cpu[k].latency) {
                        mpo_cpu[k].home =
                            mpo_lgroup[o].addr_match
                            >> home_mask_shift;
                        mpo_cpu[k].latency =
                            mpo_lgroup[o].latency;
                        mpo_lgroup[o].ncpu++;
                    }
                }
            }
        }
        md_free_scan_dag(md, &nodes);
    }

    /* Validate that no large pages cross mnode boundaries. */
    if (valid_pages(md, cpunodes[0]) == 0) {
        ret_val = -1;
        goto fail;
    }

fail:
    /* MD cookies are no longer valid; ensure they are not used again. */
    for (i = 0; i < n_mblocks; i++)
        mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
    for (i = 0; i < n_lgrpnodes; i++)
        mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

    if (n_cpunodes > 0)
        md_free_scan_dag(md, &cpunodes);
    if (n_lgrpnodes > 0)
        md_free_scan_dag(md, &lgrpnodes);
    if (n_mblocks > 0)
        md_free_scan_dag(md, &mblocknodes);
    else
        panic("lgrp_traverse: No memory blocks found");

    if (ret_val == 0)
        MPO_STATUS("MPO feature is enabled.\n");

    return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
    int homeid;
    int count = 0;

    /*
     * Scan the "home" bits of the mem_lgs, count
     * the number that are unique.
     */
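    /*
     * For illustration (hypothetical value, assuming the homeset is a
     * simple bitmask of home values as the HOMESET_ADD/MEM_LG_ISMEMBER
     * usage suggests): a homeset of 0x0b has bits 0, 1, and 3 set, so
     * three distinct home values were recorded and the count below is 3.
     */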
    for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
        if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
            count++;
        }
    }

    MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
        mem_lg_homeset);
    MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

    /* Default must be at least one */
    if (count == 0)
        count = 1;

    return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
    md_t *md;
    int i, rc, ncpu_min;

    /* Get the Machine Descriptor handle */

    md = md_get_handle();

    /* Without the MD we cannot continue */

    if (md == NULL) {
        panic("cannot access machine descriptor\n");
    } else {
        rc = lgrp_traverse(md);
        (void) md_fini_handle(md);
    }

    /*
     * If we can't process the MD for lgroups then at least let the
     * system try to boot.  Assume we have one lgroup so that
     * when plat_build_mem_nodes is called, it will attempt to init
     * an mnode based on the supplied memory segment.
     */

    if (rc == -1) {
        home_mask_pfn = 0;
        max_locality_groups = 1;
        n_locality_groups = 1;
        return;
    }

    mem_node_pfn_shift = 0;
    mem_node_physalign = 0;

    /* Use lgroup-aware TSB allocations */
    tsb_lgrp_affinity = 1;

    /*
     * lgrp_expand_proc_thresh is the minimum load on the lgroups
     * this process is currently running on before considering
     * expanding threads to another lgroup.
     *
     * lgrp_expand_proc_diff determines how much less the remote lgroup
     * must be loaded before expanding to it.
     *
     * On sun4v CMT processors, threads share a core pipeline, and
     * at less than 100% utilization, best throughput is obtained by
     * spreading threads across more cores, even if some are in a
     * different lgroup.  Spread threads to a new lgroup if the
     * current group is more than 50% loaded.  Because of virtualization,
     * lgroups may have different numbers of CPUs, but the tunables
     * apply to all lgroups, so find the smallest lgroup and compute
     * 50% loading.
     */
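    /*
     * Worked example of the tunable settings below (hypothetical
     * ncpu_min, for illustration only): if the smallest lgroup has 8
     * CPUs, then
     *
     *    lgrp_expand_proc_thresh = 8 * lgrp_loadavg_max_effect / 2
     *
     * i.e. the load equivalent of 4 CPU-bound threads, which is 50% of
     * that lgroup's capacity, and lgrp_expand_proc_diff is half of that
     * again (2 threads' worth of load).
     */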
    ncpu_min = NCPU;
    for (i = 0; i < n_lgrpnodes; i++) {
        int ncpu = mpo_lgroup[i].ncpu;
        if (ncpu != 0 && ncpu < ncpu_min)
            ncpu_min = ncpu;
    }
    lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

    /* new home may only be half as loaded as the existing home to use it */
    lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

    lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

    /* Require that a home lgroup have some memory to be chosen */
    lgrp_mem_free_thresh = 1;

    /* Standard home-on-next-touch policy */
    lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

    /* Disable option to choose root lgroup if all leaf lgroups are busy */
    lgrp_load_thresh = UINT32_MAX;
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
    static int slice_count = 0;

    slice_count++;
    MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
        slice_count, basepfn, endpfn);
#endif
    mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
    MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
        "mnode index: %d\n", plathand, mnode);
    plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *
 *    mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *    physbase: First valid page in mem_node in the corresponding mblock
 *    physmax: Last valid page in mem_node in mblock
 *    offset: The full stripe width starts at physbase - offset.
 *        Thus if offset is non-zero, this mem_node starts in the middle
 *        of a stripe width, and the second full stripe starts at
 *        physbase - offset + stride.  (even though physmax may fall in the
 *        middle of a stripe width, we do not save the ending fragment size
 *        in this data structure.)
 *    exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *    123012301230 ... 012301230123 ...
 *        mblock 0         mblock 1
 */
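/*
 * Worked example of the striping described above (hypothetical values,
 * for illustration only): with max_locality_groups = 4 and an 8 GB stripe
 * width (mnode_pages = 0x100000 pages), the stride is 4 * 8 GB = 32 GB
 * (mnode_stride = 0x400000 pages).  Within one mblock, mem_node 2 owns the
 * 8 GB pieces whose home bits equal 2, each followed by a 24 GB gap that
 * belongs to the other three mem_nodes.  The piece of mem_node 2 that lies
 * in mpo_mblock[1] is described by mem_stripes[2 + 1 * 4], i.e.
 * mem_stripes[6].
 */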
void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
    lgrp_handle_t lgrphand, lgrp_start;
    int i, mnode, elem;
    uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
    uint64_t stripe, frag, remove;
    mem_stripe_t *ms;

    /* Check for non-MPO sun4v platforms */

    if (n_locality_groups <= 1) {
        mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0);
        for (elem = 0; elem < nelems; elem += 2) {
            base = list[elem];
            len = list[elem+1];

            mpo_mem_node_add_slice(btop(base),
                btop(base + len - 1));
        }
        mem_node_pfn_shift = 0;
        mem_node_physalign = 0;
        n_mem_stripes = 0;
        return;
    }

    /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
    max_mem_nodes = max_locality_groups;
    bzero(mem_stripes, sizeof (mem_stripes));
    stripe = ptob(mnode_pages);
    stride = max_locality_groups * stripe;

    /* Save commonly used values in globals */
    mnode_stride = btop(stride);
    n_mem_stripes = max_locality_groups * n_mblocks;
    stripe_shift = highbit(max_locality_groups) - 1;

    for (i = 0; i < n_mblocks; i++) {

        base = mpo_mblock[i].base;
        end = mpo_mblock[i].base + mpo_mblock[i].size;
        ra_to_pa = mpo_mblock[i].ra_to_pa;
        mpo_mblock[i].base_pfn = btop(base);
        mpo_mblock[i].end_pfn = btop(end - 1);

        /* Find the offset from the prev stripe boundary in PA space. */
        offset = (base + ra_to_pa) & (stripe - 1);

        /* Set the next stripe boundary. */
        stripe_end = base - offset + stripe;

        lgrp_start = (((base + ra_to_pa) & home_mask) >>
            home_mask_shift);
        lgrphand = lgrp_start;

        /*
         * Loop over all lgroups covered by the mblock, creating a
         * stripe for each.  Stop when lgrp_start is visited again.
         */
        do {
            /* mblock may not span all lgroups */
            if (base >= end)
                break;

            mnode = lgrphand;
            ASSERT(mnode < max_mem_nodes);

            /*
             * Calculate the size of the fragment that does not
             * belong to the mnode in the last partial stride.
             */
            frag = (end - (base - offset)) & (stride - 1);
            if (frag == 0) {
                /* remove the gap */
                remove = stride - stripe;
            } else if (frag < stripe) {
                /* fragment fits in stripe; keep it all */
                remove = 0;
            } else {
                /* fragment is large; trim after whole stripe */
                remove = frag - stripe;
            }

            ms = &mem_stripes[i * max_locality_groups + mnode];
            ms->physbase = btop(base);
            ms->physmax = btop(end - 1 - remove);
            ms->offset = btop(offset);
            ms->exists = 1;

            mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode);
            mpo_mem_node_add_slice(ms->physbase, ms->physmax);

            base = stripe_end;
            stripe_end += stripe;
            offset = 0;
            lgrphand = (((base + ra_to_pa) & home_mask) >>
                home_mask_shift);
        } while (lgrphand != lgrp_start);
    }

    /*
     * Indicate to vm_pagelist that the hpm_counters array
     * should be shared because the ranges overlap.
     */
    if (max_mem_nodes > 1) {
        interleaved_mnodes = 1;
    }
}
/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
    if (n_locality_groups > 1) {
        return ((lgrp_handle_t)mpo_cpu[(int)id].home);
    } else {
        return ((lgrp_handle_t)0); /* Default */
    }
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
    /*
     * Return min remote latency when there are more than two lgroups
     * (root and child) and getting latency between two different lgroups
     * or root is involved.
     */
    if (lgrp_optimizations() && (from != to ||
        from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
        return ((int)higher_latency);
    } else {
        return ((int)lower_latency);
    }
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
    int i, mnode;
    pfn_t ra_to_pa_pfn;
    struct mblock_md *mb;

    if (n_locality_groups <= 1)
        return (0);

    /*
     * The mnode is defined to be 1:1 with the lgroup handle, which
     * is taken from the home bits.  Find the mblock in which
     * the pfn falls to get the ra_to_pa adjustment, and extract
     * the home bits.
     */
    mb = &mpo_mblock[0];
    for (i = 0; i < n_mblocks; i++) {
        if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
            ra_to_pa_pfn = btop(mb->ra_to_pa);
            mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
                home_mask_pfn_shift);
            ASSERT(mnode < max_mem_nodes);
            return (mnode);
        }
        mb++;
    }

    panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
    return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
    int i;
    pfn_t ra_to_pa_pfn;
    struct mblock_md *mb;

    ASSERT(n_mblocks > 0);
    if (n_mblocks == 1)
        return (pfn + base_ra_to_pa_pfn);

    /*
     * Find the mblock in which the pfn falls
     * in order to get the ra_to_pa adjustment.
     */
    for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
        if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
            ra_to_pa_pfn = btop(mb->ra_to_pa);
            return (pfn + ra_to_pa_pfn);
        }
    }

    panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
    return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *    Initialize cookie to iterate over pfn's in an mnode.  There is
 *    no additional iterator function.  The caller uses the info from
 *    the iterator structure directly.
 *
 *    pfn: starting pfn.
 *    mnode: desired mnode.
 *    init: set to 1 for full init, 0 for continuation
 *
 *    Returns the appropriate starting pfn for the iteration,
 *    the same as the input pfn if it falls in an mblock.
 *    Returns the (pfn_t)-1 value if the input pfn lies past
 *    the last valid mnode pfn.
 */
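/*
 * Example of the return-value behavior (hypothetical layout, for
 * illustration only): suppose mnode 1's stripe in mpo_mblock[0] covers
 * pfns 0x100000-0x1fffff and its stripe in mpo_mblock[1] covers pfns
 * 0x900000-0x9fffff, with ra_to_pa == 0 for both.  A full init (init == 1)
 * with pfn = 0x150000 returns 0x150000 and fills the cookie with the first
 * stripe's bounds.  A later continuation call (init == 0) with a pfn just
 * past mpo_mblock[0] skips ahead and returns 0x900000, the base of mnode
 * 1's stripe in the next mblock.  Once pfn lies beyond the last mblock,
 * (pfn_t)-1 is returned to end the iteration.
 */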
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
    int i;
    struct mblock_md *mblock;
    pfn_t base, end;

    ASSERT(it != NULL);
    ASSERT(mnode >= 0 && mnode < max_mem_nodes);
    ASSERT(n_mblocks > 0);

    if (init) {
        it->mi_last_mblock = 0;
        it->mi_init = 1;
    }

    /* Check if mpo is not enabled and we only have one mblock */
    if (n_locality_groups == 1 && n_mblocks == 1) {
        it->mi_mnode = mnode;
        it->mi_ra_to_pa = base_ra_to_pa_pfn;
        it->mi_mnode_pfn_mask = 0;
        it->mi_mnode_pfn_shift = 0;
        it->mi_mnode_mask = 0;
        it->mi_mblock_base = mem_node_config[mnode].physbase;
        it->mi_mblock_end = mem_node_config[mnode].physmax;
        if (pfn < it->mi_mblock_base)
            pfn = it->mi_mblock_base;
        else if (pfn > it->mi_mblock_end)
            pfn = (pfn_t)-1;
        return (pfn);
    }

    /*
     * Find mblock that contains pfn, or first mblock after pfn,
     * else pfn is out of bounds, so use the last mblock.
     * mblocks are sorted in ascending address order.
     */
    ASSERT(it->mi_last_mblock < n_mblocks);
    ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
    i = init ? 0 : it->mi_last_mblock + 1;
    if (i == n_mblocks)
        return ((pfn_t)-1);

    for (; i < n_mblocks; i++) {
        if (pfn <= mpo_mblock[i].end_pfn)
            break;
    }
    if (i == n_mblocks) {
        it->mi_last_mblock = i - 1;
        return ((pfn_t)-1);
    }
    it->mi_last_mblock = i;

    /*
     * Memory stripes are defined if there is more than one locality
     * group, so use the stripe bounds.  Otherwise use mblock bounds.
     */
    mblock = &mpo_mblock[i];
    if (n_mem_stripes > 0) {
        mem_stripe_t *ms =
            &mem_stripes[i * max_locality_groups + mnode];
        base = ms->physbase;
        end = ms->physmax;
    } else {
        ASSERT(mnode == 0);
        base = mblock->base_pfn;
        end = mblock->end_pfn;
    }

    it->mi_mnode = mnode;
    it->mi_ra_to_pa = btop(mblock->ra_to_pa);
    it->mi_mblock_base = base;
    it->mi_mblock_end = end;
    it->mi_mnode_pfn_mask = home_mask_pfn;  /* is 0 for non-MPO case */
    it->mi_mnode_pfn_shift = home_mask_pfn_shift;
    it->mi_mnode_mask = max_locality_groups - 1;
    if (pfn < base)
        pfn = base;
    else if (pfn > end)
        pfn = (pfn_t)-1;
    return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
    pfn_t offset, len, hole, base, end, test_end, frag;
    pfn_t nearest;
    mem_stripe_t *ms;
    int i, npages;

    *npages_out = 0;

    if (!mem_node_config[mnode].exists || test_len == 0)
        return;

    base = mem_node_config[mnode].physbase;
    end = mem_node_config[mnode].physmax;

    test_end = test_base + test_len - 1;
    if (end < test_base || base > test_end)
        return;

    if (n_locality_groups == 1) {
        *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
        return;
    }

    hole = mnode_stride - mnode_pages;
    npages = 0;

    /*
     * Iterate over all the stripes for this mnode (one per mblock),
     * find the intersection with each, and accumulate the intersections.
     *
     * Determining the intersection with a stripe is tricky.  If base or
     * end fall outside the mem_node bounds, round them to physbase/physmax
     * of mem_node.  If base or end fall in a gap, round them to start of
     * nearest stripe.  If they fall within a stripe, keep base or end,
     * but calculate the fragment size that should be excluded from the
     * stripe.  Calculate how many strides fall in the adjusted range,
     * multiply by stripe width, and add the start and end fragments.
     */
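    /*
     * Worked example of one loop iteration (hypothetical numbers,
     * continuing the 4-mnode, 0x100000-pfn-stripe illustration used
     * earlier): stride = 0x400000 and hole = 0x300000.  For a stripe with
     * physbase = 0x100000, offset = 0, physmax = 0xdfffff and a test
     * range of [0x180000, 0x97ffff], test_base lands inside the first
     * stripe piece (start fragment 0x80000) and test_end lands inside the
     * third (end fragment 0x80000), so the iteration adds
     * (0xc00000 >> stripe_shift) - 0x80000 - 0x80000 = 0x200000 pages:
     * half of the first piece, all of the second, and half of the third.
     */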
    for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
        ms = &mem_stripes[i];
        if (ms->exists &&
            test_base <= (end = ms->physmax) &&
            test_end >= (base = ms->physbase)) {

            offset = ms->offset;

            if (test_base > base) {
                /* Round test_base to next multiple of stride */
                len = P2ROUNDUP(test_base - (base - offset),
                    mnode_stride);
                nearest = base - offset + len;
                /*
                 * Compute distance from test_base to the
                 * stride boundary to see if test_base falls
                 * in the stripe or in the hole.
                 */
                if (nearest - test_base > hole) {
                    /*
                     * test_base lies in stripe,
                     * and offset should be excluded.
                     */
                    offset = test_base -
                        (nearest - mnode_stride);
                    base = test_base;
                } else {
                    /* round up to next stripe start */
                    offset = 0;
                    base = nearest;
                    if (base > end)
                        continue;
                }

            }

            if (test_end < end)
                end = test_end;
            end++;      /* adjust to an exclusive bound */

            /* Round end to next multiple of stride */
            len = P2ROUNDUP(end - (base - offset), mnode_stride);
            nearest = (base - offset) + len;
            if (nearest - end <= hole) {
                /* end falls in hole, use entire last stripe */
                frag = 0;
            } else {
                /* end falls in stripe, compute fragment */
                frag = nearest - hole - end;
            }

            len = (len >> stripe_shift) - offset - frag;
            npages += len;
        }
    }

    *npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define MNODE(pa) \
    ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
    int i, max_szc;
    uint64_t last_page_base, szc_mask;
    uint64_t max_page_len, max_coalesce_len;
    struct mblock_md *mb = mpo_mblock;

    /*
     * Find the smaller of the largest page possible and supported.
     * mmu_exported_pagesize_mask is not yet initialized, so read
     * it from the MD.  Apply minimal fixups in case of broken MDs
     * to get a sane mask.
     */

    if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
        szc_mask = 0;
    szc_mask |= (1 << TTE4M);   /* largest in sun4v default support */
    max_szc = highbit(szc_mask) - 1;
    if (max_szc > TTE256M)
        max_szc = TTE256M;
    max_page_len = TTEBYTES(max_szc);

    /*
     * Page coalescing code coalesces all sizes up to 256M on sun4v, even
     * if mmu-page-size-list does not contain it, so 256M pages must fall
     * within one mnode to use MPO.
     */
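    /*
     * For example, with the hypothetical 8 GB mnode stripe used in the
     * comments above, a 256 MB page fits comfortably within one stripe;
     * the check below only disables MPO when the stripe width itself is
     * smaller than the 256 MB coalesce size.
     */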
    max_coalesce_len = TTEBYTES(TTE256M);
    ASSERT(max_coalesce_len >= max_page_len);

    if (ptob(mnode_pages) < max_coalesce_len) {
        MPO_STATUS("Page too large; MPO disabled: page = %lx, "
            "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
        return (0);
    }

    for (i = 0; i < n_mblocks; i++) {
        uint64_t base = mb->base;
        uint64_t end = mb->base + mb->size - 1;
        uint64_t ra_to_pa = mb->ra_to_pa;

        /*
         * If mblock is smaller than the max page size, then
         * RA = PA mod MAXPAGE is not guaranteed, but it must
         * not span mnodes.
         */
        if (mb->size < max_page_len) {
            if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
                MPO_STATUS("Small mblock spans mnodes; "
                    "MPO disabled: base = %lx, end = %lx, "
                    "ra2pa = %lx\n", base, end, ra_to_pa);
                return (0);
            }
        } else {
            /* Verify RA = PA mod MAXPAGE, using coalesce size */
            uint64_t pa_base = base + ra_to_pa;
            if ((base & (max_coalesce_len - 1)) !=
                (pa_base & (max_coalesce_len - 1))) {
                MPO_STATUS("bad page alignment; MPO disabled: "
                    "ra = %lx, pa = %lx, pagelen = %lx\n",
                    base, pa_base, max_coalesce_len);
                return (0);
            }
        }

        /*
         * Find start of last large page in mblock in RA space.
         * If page extends into the next mblock, verify the
         * mnode does not change.
         */
        last_page_base = P2ALIGN(end, max_coalesce_len);
        if (i + 1 < n_mblocks &&
            last_page_base + max_coalesce_len > mb[1].base &&
            MNODE(last_page_base + ra_to_pa) !=
            MNODE(mb[1].base + mb[1].ra_to_pa)) {
            MPO_STATUS("Large page spans mblocks; MPO disabled: "
                "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
                "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
                mb[1].ra_to_pa, max_coalesce_len);
            return (0);
        }

        mb++;
    }
    return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *    mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
    int i, j;
    uint64_t mask = 0;

    j = 0;
    for (i = 0; i < n_lgrpnodes; i++) {
        if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
            /* remove this lgroup */
            mask = mpo_lgroup[i].addr_mask;
        } else {
            mpo_lgroup[j++] = mpo_lgroup[i];
        }
    }
    n_lgrpnodes = j;

    if (mask != 0)
        MPO_STATUS("sub-page interleave %lx found; "
            "removing lgroup.\n", mask);

    return (mask != 0);
}