/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 *
 * plat_slice_add()
 * plat_slice_del()
 *	Platform hooks to add/delete a pfn range.
 *
 * Internal Organization
 * ---------------------
 *
 * A number of routines are used by both the boot and DR code paths to
 * (re)build the appropriate MPO structures:
 *
 * mblock_alloc()
 *	Allocate memory for mblocks and stripes as
 *	appropriate for boot or memory DR.
 *
 * mblock_free()
 *	Free memory allocated by mblock_alloc.
 *
 * mblock_update()
 *	Build mblocks based on mblock nodes read from the MD.
 *
 * mblock_update_add()
 *	Rebuild mblocks after a memory DR add operation.
 *
 * mblock_update_del()
 *	Rebuild mblocks after a memory DR delete operation.
 *
 * mblock_install()
 *	Install mblocks as the new configuration.
 *
 * mstripe_update()
 *	Build stripes based on mblocks.
 *
 * mnode_update()
 *	Call memnode layer to add/del a pfn range, based on stripes.
 *
 * The platform interfaces allocate all memory required for the
 * particular update first, block access to the MPO structures
 * while they are updated, and free old structures after the update.
 */

int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int	n_lgrpnodes = 0;
static	int	n_locality_groups = 0;
static	int	max_locality_groups = 0;
static	int	szc_mask0 = 0;

/* Save mblocks from the MD */
#define	SMALL_MBLOCKS_COUNT	8
static	struct	mblock_md *mpo_mblock;
static	struct	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static	int	n_mblocks = 0;
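
/*
 * Worked example of the home-bit arithmetic described in the block comment
 * above (hypothetical values, for illustration only): if every
 * memory-latency-group in the MD reports lgroup.mask == 0x600000000
 * (bits 33 and 34), then home_mask_shift == lowbit(0x600000000) - 1 == 33,
 * the mem_node stripe width is 1ULL << 33 == 8GB, and max_locality_groups
 * is 4.  An RA in an mblock with ra_to_pa == 0 whose bits 33-34 equal
 * 0x400000000 matches the lgroup with match == 0x400000000, so its
 * lgrp_handle_t (and mem_node) is 0x400000000 >> 33 == 2.
 */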

/* Save mem_node stripes calculated from mblocks and lgroups */
static	mem_stripe_t *mem_stripes;
static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
static	int	mpo_genid;		/* config gen; updated by mem DR */
static	mpo_config_t mpo_config;	/* current mblocks and stripes */

typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;

static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);

static int  mblock_alloc(mpo_config_t *, update_t, int nmblocks);
static void mblock_install(mpo_config_t *);
static void mblock_free(mpo_config_t *);
static void mblock_update(mpo_config_t *, md_t *, mde_cookie_t *mblocknodes);
static void mblock_update_add(mpo_config_t *);
static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
static void mstripe_update(mpo_config_t *);
static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	VALIDATE_SLICE(base, end) { \
	ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M))); \
	ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M))); \
}
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	VALIDATE_SLICE(base, end)
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) { \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status); \
}

/*
 * The MPO locks are to protect the MPO metadata while that
 * information is updated as a result of a memory DR operation.
 * The read lock must be acquired to read the metadata and the
 * write locks must be acquired to update it.
 */
#define	mpo_rd_lock	kpreempt_disable
#define	mpo_rd_unlock	kpreempt_enable

static void
mpo_wr_lock()
{
	mutex_enter(&cpu_lock);
	pause_cpus(NULL);
	mutex_exit(&cpu_lock);
}

static void
mpo_wr_unlock()
{
	mutex_enter(&cpu_lock);
	start_cpus();
	mutex_exit(&cpu_lock);
}

/*
 * Routine to read a uint64_t from a given md
 */
static	int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

static void
mpo_update_tunables(void)
{
	int i, ncpu_min;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
}

static mde_cookie_t
cpuid_to_cpunode(md_t *md, int cpuid)
{
	mde_cookie_t rootnode, foundnode, *cpunodes;
	uint64_t cpuid_prop;
	int n_cpunodes, i;

	if (md == NULL)
		return (MDE_INVAL_ELEM_COOKIE);

	rootnode = md_root_node(md);
	if (rootnode == MDE_INVAL_ELEM_COOKIE)
		return (MDE_INVAL_ELEM_COOKIE);

	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
	    "fwd", &cpunodes);
	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
		goto cpuid_fail;

	for (i = 0; i < n_cpunodes; i++) {
		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
		    &cpuid_prop))
			break;
		if (cpuid_prop == (uint64_t)cpuid) {
			foundnode = cpunodes[i];
			md_free_scan_dag(md, &cpunodes);
			return (foundnode);
		}
	}
cpuid_fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	return (MDE_INVAL_ELEM_COOKIE);
}

static int
mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
{
	mde_cookie_t *nodes;
	uint64_t latency, lowest_latency;
	uint64_t address_match, lowest_address_match;
	int n_lgroups, j, result = 0;

	/* Find lgroup nodes reachable from this cpu */
	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
	    "fwd", &nodes);

	lowest_latency = ~(0UL);

	/* Find the lgroup node with the smallest latency */
	for (j = 0; j < n_lgroups; j++) {
		result = get_int(md, nodes[j], PROP_LG_LATENCY,
		    &latency);
		result |= get_int(md, nodes[j], PROP_LG_MATCH,
		    &address_match);
		if (result != 0) {
			j = -1;
			goto to_lgrp_done;
		}
		if (latency < lowest_latency) {
			lowest_latency = latency;
			lowest_address_match = address_match;
		}
	}
	for (j = 0; j < n_lgrpnodes; j++) {
		if ((mpo_lgroup[j].latency == lowest_latency) &&
		    (mpo_lgroup[j].addr_match == lowest_address_match))
			break;
	}
	if (j == n_lgrpnodes)
		j = -1;

to_lgrp_done:
	if (n_lgroups > 0)
		md_free_scan_dag(md, &nodes);
	return (j);
}

/* Called when DR'ing in a CPU */
void
mpo_cpu_add(int cpuid)
{
	md_t *md;
	mde_cookie_t cpunode;

	int i;

	if (n_lgrpnodes <= 0)
		return;

	md = md_get_handle();

	if (md == NULL)
		goto add_fail;

	cpunode = cpuid_to_cpunode(md, cpuid);
	if (cpunode == MDE_INVAL_ELEM_COOKIE)
		goto add_fail;

	i = mpo_cpu_to_lgroup(md, cpunode);
	if (i == -1)
		goto add_fail;

	mpo_cpu[cpuid].lgrp_index = i;
	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
	mpo_lgroup[i].ncpu++;
	mpo_update_tunables();
	(void) md_fini_handle(md);
	return;
add_fail:
	panic("mpo_cpu_add: Cannot read MD");
}

/* Called when DR'ing out a CPU */
void
mpo_cpu_remove(int cpuid)
{
	int i;

	if (n_lgrpnodes <= 0)
		return;

	i = mpo_cpu[cpuid].lgrp_index;
	mpo_lgroup[i].ncpu--;
	mpo_cpu[cpuid].home = 0;
	mpo_cpu[cpuid].lgrp_index = -1;
	mpo_update_tunables();
}
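
/*
 * Illustrative example only: after mpo_cpu_add(cpuid) returns, the CPU's
 * home lgroup can be queried through the standard platform interface,
 * e.g.
 *
 *	lgrp_handle_t hand = plat_lgrp_cpu_to_hand(cpuid);
 *
 * With a hypothetical smallest lgroup of 8 CPUs, mpo_update_tunables()
 * sets lgrp_expand_proc_thresh to 8 * lgrp_loadavg_max_effect / 2, i.e.
 * the load equivalent of 4 fully busy CPUs, which is the 50% loading
 * point described above.
 */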

static mde_cookie_t
md_get_root(md_t *md)
{
	mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
	int n_nodes;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("md_get_root: No nodes in node count\n");
		return (root);
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("md_get_root: Root node is missing\n");
		return (root);
	}

	MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
	MPO_DEBUG("md_get_root: md: %p\n", md);
	MPO_DEBUG("md_get_root: root: %lx\n", root);
done:
	return (root);
}

static int
lgrp_update(md_t *md, mde_cookie_t root)
{
	int i, j, result;
	int ret_val = 0;
	int sub_page_fix;
	mde_cookie_t *nodes, *lgrpnodes;

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_update: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_update: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_update: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}
fail:
	if (n_lgrpnodes > 0) {
		md_free_scan_dag(md, &lgrpnodes);
		for (i = 0; i < n_lgrpnodes; i++)
			mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
	}

	return (ret_val);
}

/*
 *
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static	int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *mblocknodes;
	int o;
	uint64_t i, k, stripe, stride;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	mpo_config_t new_config;

	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
		ret_val = -1;
		goto fail;
	}

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
	    &mblocknodes);
	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
		    "Descriptor\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */
	if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
		ret_val = -1;
		goto fail;
	}

	mblock_update(&new_config, md, mblocknodes);
	mblock_install(&new_config);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);

	if ((ret_val = lgrp_update(md, root)) == -1)
		goto fail;

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	stripe_shift = highbit(max_locality_groups) - 1;
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;
	mnode_stride = btop(stride);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].lgrp_index = -1;
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */
		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
		if (o == -1) {
			ret_val = -1;
			goto fail;
		}
		mpo_cpu[k].lgrp_index = o;
		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
		mpo_lgroup[o].ncpu++;
	}
	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0) {
		MPO_STATUS("MPO feature is enabled.\n");
	} else
		sun4v_mpo_enable = 0;	/* set this for DR */

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static	int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int rc;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;

	mpo_update_tunables();
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static	void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

static	void
mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_del_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}
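
/*
 * Worked example of the mem_stripes[] indexing rule described in the block
 * comment below (hypothetical sizes, for illustration only): with 2 mblocks
 * and a home mask wide enough for 4 locality groups, max_locality_groups is
 * 4 and mem_stripes[] holds 2 * 4 entries.  The stripe describing mnode 2
 * within mpo_mblock[1] is mem_stripes[2 + 1 * 4], i.e. mem_stripes[6].
 */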

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ...	012301230123 ...
 *	  mblock 0		  mblock 1
 */

/*ARGSUSED*/
void
plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
{
	int elem;
	uint64_t base, len;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	mstripe_update(&mpo_config);

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; list++, elem++) {
			base = list->addr;
			len = list->size;

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
	} else
		mnode_update(&mpo_config, 0, 0, U_ADD_ALL);

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	lgrp_handle_t lgrphand;

	mpo_rd_lock();
	if (n_locality_groups > 1) {
		lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
	} else {
		lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
	}
	mpo_rd_unlock();

	return (lgrphand);
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mpo_rd_lock();
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			mpo_rd_unlock();
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	mpo_rd_lock();
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mpo_rd_unlock();
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie "it" to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	szc: desired page size.
 *	init:
 *	    if 1, start a new traversal, initialize "it", find first
 *		mblock containing pfn, and return its starting pfn
 *		within the mnode.
 *	    if 0, continue the previous traversal using passed-in data
 *		from "it", advance to the next mblock, and return its
 *		starting pfn within the mnode.
 *	it: returns readonly data to the caller; see below.
 *
 *	The input pfn must be aligned for the page size szc.
 *
 *	Returns: starting pfn for the iteration for the mnode/mblock,
 *	    which is aligned according to the page size,
 *	    or returns (pfn_t)(-1) if the input pfn lies past the last
 *	    valid pfn of the mnode.
 *	Returns misc values in the "it" struct that allow the caller
 *	    to advance the pfn within an mblock using address arithmetic;
 *	    see definition of mem_node_iterator_t in vm_dep.h.
 *	    When the caller calculates a pfn that is greater than the
 *	    returned value it->mi_mblock_end, the caller should again
 *	    call plat_mem_node_iterator_init, passing init=0.
 *
 *	The last mblock in continuation case may be invalid because
 *	of memory DR.  To detect this situation mi_genid is checked
 *	against mpo_genid which is incremented after a memory DR
 *	operation.  See also plat_slice_add()/plat_slice_del().
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
    mem_node_iterator_t *it, int init)
{
	int i;
	pgcnt_t szcpgcnt = PNUM_SIZE(szc);
	struct mblock_md *mblock;
	pfn_t base, end;
	mem_stripe_t *ms;
	uint64_t szcpagesize;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);
	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);

	mpo_rd_lock();

	if (init || (it->mi_genid != mpo_genid)) {
		it->mi_genid = mpo_genid;
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
			pfn = (pfn_t)-1;
			goto done;
		}
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		goto done;
	}

	/* init=1 means begin iterator, init=0 means continue */
	if (init == 1) {
		i = 0;
	} else {
		ASSERT(it->mi_last_mblock < n_mblocks);
		i = it->mi_last_mblock;
		ASSERT(pfn >
		    mem_stripes[i * max_locality_groups + mnode].physmax);
		if (++i == n_mblocks) {
			pfn = (pfn_t)-1;
			goto done;
		}
	}

	/*
	 * Find the mblock that contains pfn for mnode's stripe, or the first
	 * such mblock after pfn; otherwise pfn is out of bounds and we'll
	 * return -1.  mblocks and stripes are sorted in ascending address
	 * order.
	 */
	szcpagesize = szcpgcnt << PAGESHIFT;
	for (; i < n_mblocks; i++) {
		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
			continue;
		ms = &mem_stripes[i * max_locality_groups + mnode];
		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
		    ms->physmax)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		pfn = (pfn_t)-1;
		goto done;
	}

	it->mi_last_mblock = i;

	mblock = &mpo_mblock[i];
	base = ms->physbase;
	end = ms->physmax;

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base) {
		pfn = P2ROUNDUP(base, szcpgcnt);
		ASSERT(pfn + szcpgcnt - 1 <= end);
	}
	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
done:
	mpo_rd_unlock();
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */

	mpo_rd_lock();
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
	mpo_rd_unlock();
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (cpu0 == NULL)
		szc_mask = szc_mask0;
	else {
		if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
			szc_mask = 0;
		/* largest in sun4v default support */
		szc_mask |= (1 << TTE4M);
		szc_mask0 = szc_mask;
	}
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}

/*
 * mblock_alloc
 *
 * Allocate memory for mblock and stripe arrays from either static or
 * dynamic space depending on utype, and return the result in mc.
 * Returns 0 on success and -1 on error.
 */

static int
mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
{
	mblock_md_t *mb = NULL;
	mem_stripe_t *ms = NULL;
	int nstripes = MAX_MEM_NODES * nmblocks;
	size_t mblocksz = nmblocks * sizeof (struct mblock_md);
	size_t mstripesz = nstripes * sizeof (mem_stripe_t);
	size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));

	/*
	 * Allocate space for mblocks and mstripes.
	 *
	 * For DR allocations, just use kmem_alloc(), and set
	 * mc_alloc_sz to indicate it was used.
	 *
	 * For boot allocation:
	 * If we have a small number of mblocks we will use the space
	 * that we preallocated.  Otherwise, we will dynamically
	 * allocate the space from the prom and map it to the
	 * reserved VA at MPOBUF_BASE.
	 */

	if (utype == U_ADD || utype == U_DEL) {
		mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
		ms = (mem_stripe_t *)(mb + nmblocks);
		mc->mc_alloc_sz = allocsz;
	} else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
		mb = &small_mpo_mblocks[0];
		ms = &small_mem_stripes[0];
		mc->mc_alloc_sz = 0;
	} else {
		/* Ensure that we don't request more space than reserved */
		if (allocsz > MPOBUF_SIZE) {
			MPO_STATUS("mblock_alloc: Insufficient space "
			    "for mblock structures \n");
			return (-1);
		}
		mb = (struct mblock_md *)
		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
		if (mb != (struct mblock_md *)MPOBUF_BASE) {
			MPO_STATUS("mblock_alloc: Cannot allocate space "
			    "for mblocks \n");
			return (-1);
		}
		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
		mpo_heap32_bufsz = MPOBUF_SIZE;
		ms = (mem_stripe_t *)(mb + nmblocks);
		mc->mc_alloc_sz = 0;
	}
	mc->mc_mblocks = mb;
	mc->mc_stripes = ms;
	mc->mc_nmblocks = nmblocks;
	mc->mc_nstripes = nstripes;
	MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
	return (0);
}

/*
 * mblock_free
 *
 * Free memory in mc that was allocated by mblock_alloc.
 */

static void
mblock_free(mpo_config_t *mc)
{
	if (mc->mc_alloc_sz > 0) {
		ASSERT(mc->mc_mblocks != mpo_mblock);
		kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
	}
	bzero(mc, sizeof (*mc));
}

/*
 * mblock_install
 *
 * Install mblock config passed in mc as the global configuration.
 * May only be called at boot or while holding mpo_wr_lock.
 */

static void
mblock_install(mpo_config_t *mc)
{
	mpo_mblock = mc->mc_mblocks;
	n_mblocks = mc->mc_nmblocks;
	mem_stripes = mc->mc_stripes;
	n_mem_stripes = mc->mc_nstripes;
	base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
	mpo_config = *mc;
}

/*
 * mblock_update
 *
 * Traverse mblocknodes, read the mblock properties from the MD, and
 * save the mblocks in mc.
 */

static void
mblock_update(mpo_config_t *mc, md_t *md, mde_cookie_t *mblocknodes)
{
	uint64_t i, j;
	int result = 0;
	mblock_md_t *mblock = mc->mc_mblocks;

	for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[j], PROP_LG_BASE,
		    &mblock[i].base);
		if (result < 0) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_BASE is missing\n");
			mc->mc_nmblocks = 0;
			return;
		}

		result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
		    &mblock[i].size);
		if (result < 0) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_SIZE is missing\n");
			mc->mc_nmblocks = 0;
			return;
		}

		result = get_int(md, mblocknodes[j],
		    PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mblock[i].base,
		    mblock[i].size,
		    mblock[i].ra_to_pa);

		/* check for unsupportable values of base and size */
		if (mblock[i].base > mblock[i].base + mblock[i].size) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
			    "base = %lx, size = %lx\n",
			    mblock[i].base, mblock[i].size);
			mc->mc_nmblocks = 0;
			return;
		}

		/* eliminate size==0 blocks */
		if (mblock[i].size != 0) {
			uint64_t base = mblock[i].base;
			uint64_t end = base + mblock[i].size;
			ASSERT(end > base);
			mblock[i].base_pfn = btop(base);
			mblock[i].end_pfn = btop(end - 1);
			i++;
		}
	}

	if (i == 0) {
		MPO_STATUS("mblock_update: "
		    "No non-empty mblock nodes were found "
		    "in the Machine Descriptor\n");
		mc->mc_nmblocks = 0;
		return;
	}
	ASSERT(i <= mc->mc_nmblocks);
	mc->mc_nmblocks = i;

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mblock, mc->mc_nmblocks);
}

/*
 * mblock_update_add
 *
 * Update mblock config after a memory DR add.  The added range is not
 * needed, as we read *all* mblock nodes from the MD.  Save the mblocks
 * in mc.
 */

static void
mblock_update_add(mpo_config_t *mc)
{
	md_t *md;
	mde_cookie_t root, *mblocknodes;
	int nmblocks = 0;

	if ((md = md_get_handle()) == NULL) {
		MPO_STATUS("Cannot access Machine Descriptor\n");
		goto error;
	}

	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
		goto error;

	nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
	    &mblocknodes);
	if (nmblocks <= 0) {
		MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
		goto error;
	}

	if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
		goto error;

	mblock_update(mc, md, mblocknodes);
	md_free_scan_dag(md, &mblocknodes);
	(void) md_fini_handle(md);
	return;
error:
	panic("mblock_update_add: cannot process mblocks from MD.\n");
}

/*
 * mblock_update_del
 *
 * Update mblocks after a memory DR deletion of the range (ubase, uend).
 * Allocate a new mblock config, copy old config to the new, modify the new
 * mblocks to reflect the deletion.  The new mblocks are returned in
 * mc_new and are not yet installed as the active config.
 */

static void
mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
    pfn_t uend)
{
	int i, j;
	pfn_t base, end;
	mblock_md_t *mblock;
	int nmblocks = mc_old->mc_nmblocks;

	MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);

	/*
	 * Allocate mblocks in mc_new and copy the old to the new.
	 * Allocate one extra in case the deletion splits an mblock.
	 */
	if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
		return;
	mblock = mc_new->mc_mblocks;
	bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));

	/*
	 * Find the mblock containing the deleted range and adjust it in
	 * the new config.
	 */
	for (i = 0; i < nmblocks; i++) {

		base = btop(mblock[i].base);
		end = base + btop(mblock[i].size) - 1;

		/*
		 * Adjust the mblock based on the subset that was deleted.
		 *
		 * If the entire mblk was deleted, compact the table.
		 *
		 * If the middle of the mblk was deleted, extend
		 * the table.  Space for the new slot was already
		 * allocated.
		 *
		 * The memory to be deleted is an mblock or a subset of one,
		 * and does not span multiple mblocks.
		 */
		if (base == ubase && end == uend) {
			for (j = i; j < nmblocks - 1; j++)
				mblock[j] = mblock[j + 1];
			nmblocks--;
			bzero(&mblock[nmblocks], sizeof (*mblock));
			break;
		} else if (base < ubase && end > uend) {
			for (j = nmblocks - 1; j >= i; j--)
				mblock[j + 1] = mblock[j];
			mblock[i].size = ptob(ubase - base);
			mblock[i].end_pfn = ubase - 1;
			mblock[i + 1].base = ptob(uend + 1);
			mblock[i + 1].size = ptob(end - uend);
			mblock[i + 1].base_pfn = uend + 1;
			nmblocks++;
			break;
		} else if (base == ubase) {
			MPO_DEBUG("mblock_update_del: shrink>"
			    " i=%d base=0x%lx end=0x%lx", i, base, end);
			mblock[i].base = ptob(uend + 1);
			mblock[i].size -= ptob(uend - ubase + 1);
			base = uend + 1;
			mblock[i].base_pfn = base;
			mblock[i].end_pfn = end;
			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
			break;
		} else if (end == uend) {
			MPO_DEBUG("mblock_update_del: shrink<"
			    " i=%d base=0x%lx end=0x%lx", i, base, end);
			mblock[i].size -= ptob(uend - ubase + 1);
			end = ubase - 1;
			mblock[i].base_pfn = base;
			mblock[i].end_pfn = end;
			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
			break;
		}
	}
	mc_new->mc_nmblocks = nmblocks;
	ASSERT(end > base);
}

/*
 * mstripe_update
 *
 * Read mblocks from mc and update mstripes in mc
 */

static void
mstripe_update(mpo_config_t *mc)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode;
	uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;
	mblock_md_t *mblock = mc->mc_mblocks;
	int nmblocks = mc->mc_nmblocks;
	int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);

	/* Check for non-MPO sun4v platforms or memory DR removal */
	if (n_locality_groups <= 1) {
		ASSERT(n_locality_groups == 1);
		ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);

		if (nmblocks == 1) {
			mc->mc_nstripes = 0;
		} else {
			mc->mc_nstripes = nmblocks;
			bzero(mc->mc_stripes, mstripesz);
			for (i = 0; i < nmblocks; i++) {
				mc->mc_stripes[i].exists = 1;
				mc->mc_stripes[i].physbase = mblock[i].base_pfn;
				mc->mc_stripes[i].physmax = mblock[i].end_pfn;
			}
		}
		return;
	}

	bzero(mc->mc_stripes, mstripesz);
	mc->mc_nstripes = max_locality_groups * nmblocks;
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	for (i = 0; i < nmblocks; i++) {
		base = mblock[i].base;
		end = base + mblock[i].size;
		ra_to_pa = mblock[i].ra_to_pa;

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mc->mc_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}
}

#define	INTERSECT(a, b, c, d)				\
	if (((a) >= (c) && (a) <= (d)) ||		\
	    ((c) >= (a) && (c) <= (b))) {		\
		(c) = MAX((a), (c));			\
		(d) = MIN((b), (d));			\
	} else {					\
		ASSERT((a) >= (d) || (b) <= (c));	\
		continue;				\
	}						\

/*
 * mnode_update
 *
 * Read stripes from mc and update mnode extents.  The mnode extents are
 * part of the live configuration, so this can only be done at boot time
 * or while holding the mpo_wr_lock.
 */

static void
mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
{
	int i, j, mnode, found;
	pfn_t base, end;
	mem_stripe_t *ms;

	MPO_DEBUG("mnode_update: basepfn: %lx endpfn: %lx\n", ubase, uend);

	if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
		if (utype == U_ADD)
			mpo_mem_node_add_slice(ubase, uend);
		else if (utype == U_DEL)
			mpo_mem_node_del_slice(ubase, uend);
		else
			panic("mnode update: %d: invalid\n", utype);
		return;
	}

	found = 0;
	for (i = 0; i < mc->mc_nmblocks; i++) {
		for (mnode = 0; mnode < max_locality_groups; mnode++) {

			j = i * max_locality_groups + mnode;
			ms = &mc->mc_stripes[j];
			if (!ms->exists)
				continue;

			base = ms->physbase;
			end = ms->physmax;

			/*
			 * Look for the mstripes intersecting this slice.
			 *
			 * The mstripe and slice pairs may not be equal
			 * if a subset of an mblock is added/deleted.
			 */
			switch (utype) {
			case U_ADD:
				INTERSECT(ubase, uend, base, end);
				/*FALLTHROUGH*/
			case U_ADD_ALL:
				if (n_locality_groups > 1)
					mpo_plat_assign_lgrphand_to_mem_node(
					    mnode, mnode);
				mpo_mem_node_add_slice(base, end);
				break;
			case U_DEL:
				INTERSECT(ubase, uend, base, end);
				mpo_mem_node_del_slice(base, end);
				break;
			default:
				panic("mnode_update: %d: invalid\n", utype);
				break;
			}

			found++;
		}
	}

	if (!found)
		panic("mnode_update: mstripe not found");

#ifdef	DEBUG
	if (utype == U_ADD_ALL || utype == U_DEL)
		return;
	found = 0;
	for (i = 0; i < max_mem_nodes; i++) {
		if (!mem_node_config[i].exists)
			continue;
		if (ubase >= mem_node_config[i].physbase &&
		    ubase <= mem_node_config[i].physmax)
			found |= 1;
		if (uend >= mem_node_config[i].physbase &&
		    uend <= mem_node_config[i].physmax)
			found |= 2;
	}
	ASSERT(found == 3);
	{
		pfn_t	minpfn, maxpfn;

		mem_node_max_range(&minpfn, &maxpfn);
		ASSERT(minpfn <= ubase);
		ASSERT(maxpfn >= uend);
	}
#endif
}

/*
 * plat_slice_add()/plat_slice_del() are the platform hooks
 * for adding/deleting a pfn range to/from the system.
 *
 * plat_slice_add() is used for both the boot and DR cases.
 *
 * - Zeus has already added the mblocks to the MD, so read the updated
 *   MD and allocate all data structures required to manage the new memory
 *   configuration.
 *
 * - Recompute the stripes which are derived from the mblocks.
 *
 * - Update (expand) the mnode extents and install the modified mblocks as
 *   the new mpo config.  This must be done while holding the mpo_wr_lock
 *   to guarantee that no other threads access the mpo meta-data.
 *
 * - Unlock MPO data structures; the new config is live.  Free the old config.
 *
 * plat_slice_del() is used for DR only.
 *
 * - Zeus has not yet modified the MD to reflect the deletion, so copy
 *   the old mpo mblocks and delete the range from the copy.
 *
 * - Recompute the stripes which are derived from the mblocks.
 *
 * - Update (shrink) the mnode extents and install the modified mblocks as
 *   the new mpo config.  This must be done while holding the mpo_wr_lock
 *   to guarantee that no other threads access the mpo meta-data.
 *
 * - Unlock MPO data structures; the new config is live.  Free the old config.
 */

void
plat_slice_add(pfn_t base, pfn_t end)
{
	mpo_config_t old_config = mpo_config;
	mpo_config_t new_config;

	VALIDATE_SLICE(base, end);
	mblock_update_add(&new_config);
	mstripe_update(&new_config);
	mpo_wr_lock();
	mblock_install(&new_config);
	/* Use new config to add all ranges for mnode_update */
	mnode_update(&new_config, base, end, U_ADD);
	mpo_genid++;
	mpo_wr_unlock();
	mblock_free(&old_config);
}

void
plat_slice_del(pfn_t base, pfn_t end)
{
	mpo_config_t old_config = mpo_config;
	mpo_config_t new_config;

	VALIDATE_SLICE(base, end);
	mblock_update_del(&new_config, &old_config, base, end);
	mstripe_update(&new_config);
	mpo_wr_lock();
	/* Use old config to find deleted range for mnode_update */
	mnode_update(&old_config, base, end, U_DEL);
	mblock_install(&new_config);
	mpo_genid++;
	mpo_wr_unlock();
	mblock_free(&old_config);
}
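
/*
 * Illustrative sketch only: a caller that wants to walk the szc-aligned
 * pfns of mem_node "mnode" would use plat_mem_node_iterator_init() roughly
 * as follows, assuming a hypothetical visit_pfn() callback:
 *
 *	mem_node_iterator_t it;
 *	pfn_t pfn = plat_mem_node_iterator_init(0, mnode, szc, &it, 1);
 *
 *	while (pfn != (pfn_t)-1) {
 *		visit_pfn(pfn);
 *		pfn += PNUM_SIZE(szc);
 *		if (pfn > it.mi_mblock_end)
 *			pfn = plat_mem_node_iterator_init(pfn, mnode,
 *			    szc, &it, 0);
 *	}
 *
 * Real callers (the page freelist code) also use the mi_* mask/shift
 * fields to skip pfns whose home bits select a different mnode within the
 * stripe range; that detail is omitted from this sketch.
 */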