/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 *
 * plat_slice_add()
 * plat_slice_del()
 *	Platform hooks to add/delete a pfn range.
 *
 * Internal Organization
 * ---------------------
 *
 * A number of routines are used by both the boot and DR code to (re)build
 * the appropriate MPO structures.
 *
 * mblock_alloc()
 *	Allocate memory for mblocks and stripes as
 *	appropriate for boot or memory DR.
 *
 * mblock_free()
 *	Free memory allocated by mblock_alloc.
 *
 * mblock_update()
 *	Build mblocks based on mblock nodes read from the MD.
 *
 * mblock_update_add()
 *	Rebuild mblocks after a memory DR add operation.
 *
 * mblock_update_del()
 *	Rebuild mblocks after a memory DR delete operation.
 *
 * mblock_install()
 *	Install mblocks as the new configuration.
 *
 * mstripe_update()
 *	Build stripes based on mblocks.
 *
 * mnode_update()
 *	Call memnode layer to add/del a pfn range, based on stripes.
 *
 * The platform interfaces allocate all memory required for the
 * particular update first, block access to the MPO structures
 * while they are updated, and free old structures after the update.
 */

int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static int n_lgrpnodes = 0;
static int n_locality_groups = 0;
static int max_locality_groups = 0;
static int szc_mask0 = 0;

/* Save mblocks from the MD */
#define	SMALL_MBLOCKS_COUNT	8
static struct mblock_md *mpo_mblock;
static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static int n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t *mem_stripes;
static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static int n_mem_stripes = 0;
static pfn_t mnode_stride;	/* distance between stripes, start to start */
static int stripe_shift;	/* stride/stripes expressed as a shift */
static pfn_t mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static uint64_t home_mask = 0;
static pfn_t home_mask_pfn = 0;
static int home_mask_shift = 0;
static uint_t home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static int lower_latency = 0;
static int higher_latency = 0;

static pfn_t base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
static int mpo_genid;			/* config gen; updated by mem DR */
static mpo_config_t mpo_config;		/* current mblocks and stripes */

typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;

static int valid_pages(md_t *md, mde_cookie_t cpu0);
static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static int fix_interleave(void);

static int mblock_alloc(mpo_config_t *, update_t, int nmblocks);
static void mblock_install(mpo_config_t *);
static void mblock_free(mpo_config_t *);
static void mblock_update(mpo_config_t *, md_t *, mde_cookie_t *mblocknodes);
static void mblock_update_add(mpo_config_t *);
static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
static void mstripe_update(mpo_config_t *);
static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	VALIDATE_SLICE(base, end) {					\
	ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M)));		\
	ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M)));	\
}
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	VALIDATE_SLICE(base, end)
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) {						\
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status);					\
}

/*
 * The MPO locks are to protect the MPO metadata while that
 * information is updated as a result of a memory DR operation.
 * The read lock must be acquired to read the metadata and the
 * write locks must be acquired to update it.
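 *
 * As an illustrative sketch only (the rules above are the authoritative
 * ones): a reader brackets its access with the read lock,
 *
 *	mpo_rd_lock();
 *	(read mpo_mblock[] / mem_stripes[] here)
 *	mpo_rd_unlock();
 *
 * while the memory DR path brackets its metadata update with
 * mpo_wr_lock()/mpo_wr_unlock(), which pause and restart the other CPUs.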
 */
#define	mpo_rd_lock	kpreempt_disable
#define	mpo_rd_unlock	kpreempt_enable

static void
mpo_wr_lock()
{
	mutex_enter(&cpu_lock);
	pause_cpus(NULL);
	mutex_exit(&cpu_lock);
}

static void
mpo_wr_unlock()
{
	mutex_enter(&cpu_lock);
	start_cpus();
	mutex_exit(&cpu_lock);
}

/*
 * Routine to read a uint64_t from a given md
 */
static int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

static void
mpo_update_tunables(void)
{
	int i, ncpu_min;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
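	 *
	 * For illustration only (hypothetical numbers): if the smallest
	 * lgroup has 8 CPUs, then
	 *
	 *	lgrp_expand_proc_thresh = 8 * lgrp_loadavg_max_effect / 2
	 *
	 * i.e. the load equivalent of 4 busy CPUs, and a remote lgroup must
	 * be at least lgrp_expand_proc_diff (half of that again, 2 CPUs'
	 * worth of load) less loaded before threads spread to it.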
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
}

static mde_cookie_t
cpuid_to_cpunode(md_t *md, int cpuid)
{
	mde_cookie_t rootnode, foundnode, *cpunodes;
	uint64_t cpuid_prop;
	int n_cpunodes, i;

	if (md == NULL)
		return (MDE_INVAL_ELEM_COOKIE);

	rootnode = md_root_node(md);
	if (rootnode == MDE_INVAL_ELEM_COOKIE)
		return (MDE_INVAL_ELEM_COOKIE);

	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
	    "fwd", &cpunodes);
	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
		goto cpuid_fail;

	for (i = 0; i < n_cpunodes; i++) {
		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
		    &cpuid_prop))
			break;
		if (cpuid_prop == (uint64_t)cpuid) {
			foundnode = cpunodes[i];
			md_free_scan_dag(md, &cpunodes);
			return (foundnode);
		}
	}
cpuid_fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	return (MDE_INVAL_ELEM_COOKIE);
}

static int
mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
{
	mde_cookie_t *nodes;
	uint64_t latency, lowest_latency;
	uint64_t address_match, lowest_address_match;
	int n_lgroups, j, result = 0;

	/* Find lgroup nodes reachable from this cpu */
	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
	    "fwd", &nodes);

	lowest_latency = ~(0UL);

	/* Find the lgroup node with the smallest latency */
	for (j = 0; j < n_lgroups; j++) {
		result = get_int(md, nodes[j], PROP_LG_LATENCY,
		    &latency);
		result |= get_int(md, nodes[j], PROP_LG_MATCH,
		    &address_match);
		if (result != 0) {
			j = -1;
			goto to_lgrp_done;
		}
		if (latency < lowest_latency) {
			lowest_latency = latency;
			lowest_address_match = address_match;
		}
	}
	for (j = 0; j < n_lgrpnodes; j++) {
		if ((mpo_lgroup[j].latency == lowest_latency) &&
		    (mpo_lgroup[j].addr_match == lowest_address_match))
			break;
	}
	if (j == n_lgrpnodes)
		j = -1;

to_lgrp_done:
	if (n_lgroups > 0)
		md_free_scan_dag(md, &nodes);
	return (j);
}

/* Called when DR'ing in a CPU */
void
mpo_cpu_add(md_t *md, int cpuid)
{
	mde_cookie_t cpunode;

	int i;

	if (n_lgrpnodes <= 0)
		return;

	if (md == NULL)
		goto add_fail;

	cpunode = cpuid_to_cpunode(md, cpuid);
	if (cpunode == MDE_INVAL_ELEM_COOKIE)
		goto add_fail;

	i = mpo_cpu_to_lgroup(md, cpunode);
	if (i == -1)
		goto add_fail;

	mpo_cpu[cpuid].lgrp_index = i;
	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
	mpo_lgroup[i].ncpu++;
	mpo_update_tunables();
	return;
add_fail:
	panic("mpo_cpu_add: Cannot read MD");
}

/* Called when DR'ing out a CPU */
void
mpo_cpu_remove(int cpuid)
{
	int i;

	if (n_lgrpnodes <= 0)
		return;

	i = mpo_cpu[cpuid].lgrp_index;
	mpo_lgroup[i].ncpu--;
	mpo_cpu[cpuid].home = 0;
	mpo_cpu[cpuid].lgrp_index = -1;
	mpo_update_tunables();
}

static mde_cookie_t
md_get_root(md_t *md)
{
	mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
	int n_nodes;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("md_get_root: No nodes in node count\n");
		return (root);
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("md_get_root: Root node is missing\n");
		return (root);
	}

	MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
	MPO_DEBUG("md_get_root: md: %p\n", md);
	MPO_DEBUG("md_get_root: root: %lx\n", root);
done:
	return (root);
}

static int
lgrp_update(md_t *md, mde_cookie_t root)
{
	int i, j, result;
	int ret_val = 0;
	int sub_page_fix;
	mde_cookie_t *nodes, *lgrpnodes;

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_update: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_update: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_update: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}
fail:
	if (n_lgrpnodes > 0) {
		md_free_scan_dag(md, &lgrpnodes);
		for (i = 0; i < n_lgrpnodes; i++)
			mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
	}

	return (ret_val);
}

/*
 *
 * Traverse the MD to determine:
 *
 *	Number of CPU nodes, lgrp_nodes, and mblocks
 *	Then for each lgrp_node, obtain the appropriate data.
 *	For each CPU, determine its home locality and store it.
 *	For each mblock, retrieve its data and store it.
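 *
 *	As a purely hypothetical example of the home-bit arithmetic done
 *	below: if every lgroup reports addr_mask == 0x600000000, then
 *	home_mask_shift == lowbit(0x600000000) - 1 == 33,
 *	max_locality_groups == 4 (two contiguous mask bits), and an lgroup
 *	with addr_match == 0x200000000 has home 0x200000000 >> 33 == 1.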
 */
static int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *mblocknodes;
	int o;
	uint64_t i, k, stripe, stride;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	mpo_config_t new_config;

	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
		ret_val = -1;
		goto fail;
	}

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
	    &mblocknodes);
	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
		    "Descriptor\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */
	if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
		ret_val = -1;
		goto fail;
	}

	mblock_update(&new_config, md, mblocknodes);
	mblock_install(&new_config);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);

	if ((ret_val = lgrp_update(md, root)) == -1)
		goto fail;

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	stripe_shift = highbit(max_locality_groups) - 1;
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;
	mnode_stride = btop(stride);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].lgrp_index = -1;
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */
		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
		if (o == -1) {
			ret_val = -1;
			goto fail;
		}
		mpo_cpu[k].lgrp_index = o;
		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
		mpo_lgroup[o].ncpu++;
	}
	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0) {
		MPO_STATUS("MPO feature is enabled.\n");
	} else
		sun4v_mpo_enable = 0;	/* set this for DR */

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int rc;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;

	mpo_update_tunables();
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

static void
mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_del_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_del_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset: The full stripe width starts at physbase - offset.
 *		Thus if offset is non-zero, this mem_node starts in the middle
 *		of a stripe width, and the second full stripe starts at
 *		physbase - offset + stride.
 *		(Even though physmax may fall in the middle of a stripe
 *		width, we do not save the ending fragment size in this
 *		data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ...	012301230123 ...
 *	  mblock 0		  mblock 1
 */

/*ARGSUSED*/
void
plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
{
	int elem;
	uint64_t base, len;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	mstripe_update(&mpo_config);

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; list++, elem++) {
			base = list->addr;
			len = list->size;

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
	} else
		mnode_update(&mpo_config, 0, 0, U_ADD_ALL);

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	lgrp_handle_t lgrphand;

	mpo_rd_lock();
	if (n_locality_groups > 1) {
		lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
	} else {
		lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
	}
	mpo_rd_unlock();

	return (lgrphand);
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mpo_rd_lock();
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			mpo_rd_unlock();
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
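 *
 * For illustration only (hypothetical value): if pfn falls in an mblock
 * whose ra_to_pa (address-congruence-offset) is 0x400000000, the result is
 * simply pfn + btop(0x400000000), which restores the PA color and home
 * bits even though higher PA bits may still differ.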
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	mpo_rd_lock();
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mpo_rd_unlock();
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie "it" to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	szc: desired page size.
 *	init:
 *	    if 1, start a new traversal, initialize "it", find first
 *		mblock containing pfn, and return its starting pfn
 *		within the mnode.
 *	    if 0, continue the previous traversal using passed-in data
 *		from "it", advance to the next mblock, and return its
 *		starting pfn within the mnode.
 *	it: returns readonly data to the caller; see below.
 *
 *	The input pfn must be aligned for the page size szc.
 *
 *	Returns: starting pfn for the iteration for the mnode/mblock,
 *	    which is aligned according to the page size,
 *	    or returns (pfn_t)(-1) if the input pfn lies past the last
 *	    valid pfn of the mnode.
 *	Returns misc values in the "it" struct that allow the caller
 *	    to advance the pfn within an mblock using address arithmetic;
 *	    see definition of mem_node_iterator_t in vm_dep.h.
 *	When the caller calculates a pfn that is greater than the
 *	    returned value it->mi_mblock_end, the caller should again
 *	    call plat_mem_node_iterator_init, passing init=0.
 *
 *	The last mblock in the continuation case may be invalid because
 *	of memory DR.  To detect this situation mi_genid is checked
 *	against mpo_genid, which is incremented after a memory DR
 *	operation.  See also plat_slice_add()/plat_slice_del().
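 *
 *	Typical use, as a sketch only (the pfn advance within an mblock is
 *	caller-specific and home-bit aware):
 *	    1. pfn = plat_mem_node_iterator_init(pfn, mnode, szc, &it, 1);
 *	    2. if pfn == (pfn_t)-1, the mnode is exhausted; stop.
 *	    3. consume pfn's up to it.mi_mblock_end, stepping with the
 *	       mi_mnode_pfn_mask/mi_mnode_pfn_shift info so that only this
 *	       mnode's stripes are visited.
 *	    4. once the computed pfn exceeds it.mi_mblock_end, call
 *	       plat_mem_node_iterator_init(pfn, mnode, szc, &it, 0) and
 *	       repeat from step 2.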
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
    mem_node_iterator_t *it, int init)
{
	int i;
	pgcnt_t szcpgcnt = PNUM_SIZE(szc);
	struct mblock_md *mblock;
	pfn_t base, end;
	mem_stripe_t *ms;
	uint64_t szcpagesize;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);
	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);

	mpo_rd_lock();

	if (init || (it->mi_genid != mpo_genid)) {
		it->mi_genid = mpo_genid;
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
			pfn = (pfn_t)-1;
			goto done;
		}
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		goto done;
	}

	/* init=1 means begin iterator, init=0 means continue */
	if (init == 1) {
		i = 0;
	} else {
		ASSERT(it->mi_last_mblock < n_mblocks);
		i = it->mi_last_mblock;
		ASSERT(pfn >
		    mem_stripes[i * max_locality_groups + mnode].physmax);
		if (++i == n_mblocks) {
			pfn = (pfn_t)-1;
			goto done;
		}
	}

	/*
	 * Find the mblock that contains pfn for mnode's stripe, or the first
	 * such mblock after pfn; otherwise pfn is out of bounds and we'll
	 * return -1.  mblocks and stripes are sorted in ascending address
	 * order.
	 */
	szcpagesize = szcpgcnt << PAGESHIFT;
	for (; i < n_mblocks; i++) {
		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
			continue;
		ms = &mem_stripes[i * max_locality_groups + mnode];
		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
		    ms->physmax)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		pfn = (pfn_t)-1;
		goto done;
	}

	it->mi_last_mblock = i;

	mblock = &mpo_mblock[i];
	base = ms->physbase;
	end = ms->physmax;

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base) {
		pfn = P2ROUNDUP(base, szcpgcnt);
		ASSERT(pfn + szcpgcnt - 1 <= end);
	}
	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
done:
	mpo_rd_unlock();
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
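 *
 * For illustration only (hypothetical sizes): with 4 mnodes, a stripe of
 * 0x100000 pages, and therefore a stride of 0x400000 pages, a test range
 * covering exactly one full stride contributes 0x100000 pages to
 * *npages_out for the requested mnode; pfn's falling in the other mnodes'
 * stripes (this mnode's "gap") are not counted.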
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */

	mpo_rd_lock();
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
	mpo_rd_unlock();
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (cpu0 == NULL)
		szc_mask = szc_mask0;
	else {
		if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
			szc_mask = 0;
		/* largest in sun4v default support */
		szc_mask |= (1 << TTE4M);
		szc_mask0 = szc_mask;
	}
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
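 *
 * For example (hypothetical mask value): an lgroup whose addr_mask is
 * 0x1c0 has mask bits below the page size (addr_mask & PAGEOFFSET != 0),
 * i.e. sub-page interleave, and is removed here.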
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}

/*
 * mblock_alloc
 *
 * Allocate memory for mblock and stripe arrays from either static or
 * dynamic space depending on utype, and return the result in mc.
 * Returns 0 on success and -1 on error.
 */

static int
mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
{
	mblock_md_t *mb = NULL;
	mem_stripe_t *ms = NULL;
	int nstripes = MAX_MEM_NODES * nmblocks;
	size_t mblocksz = nmblocks * sizeof (struct mblock_md);
	size_t mstripesz = nstripes * sizeof (mem_stripe_t);
	size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));

	/*
	 * Allocate space for mblocks and mstripes.
	 *
	 * For DR allocations, just use kmem_alloc(), and set
	 * mc_alloc_sz to indicate it was used.
	 *
	 * For boot allocation:
	 * If we have a small number of mblocks we will use the space
	 * that we preallocated.  Otherwise, we will dynamically
	 * allocate the space from the prom and map it to the
	 * reserved VA at MPOBUF_BASE.
	 */

	if (utype == U_ADD || utype == U_DEL) {
		mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
		ms = (mem_stripe_t *)(mb + nmblocks);
		mc->mc_alloc_sz = allocsz;
	} else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
		mb = &small_mpo_mblocks[0];
		ms = &small_mem_stripes[0];
		mc->mc_alloc_sz = 0;
	} else {
		/* Ensure that we don't request more space than reserved */
		if (allocsz > MPOBUF_SIZE) {
			MPO_STATUS("mblock_alloc: Insufficient space "
			    "for mblock structures \n");
			return (-1);
		}
		mb = (struct mblock_md *)
		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
		if (mb != (struct mblock_md *)MPOBUF_BASE) {
			MPO_STATUS("mblock_alloc: Cannot allocate space "
			    "for mblocks \n");
			return (-1);
		}
		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
		mpo_heap32_bufsz = MPOBUF_SIZE;
		ms = (mem_stripe_t *)(mb + nmblocks);
		mc->mc_alloc_sz = 0;
	}
	mc->mc_mblocks = mb;
	mc->mc_stripes = ms;
	mc->mc_nmblocks = nmblocks;
	mc->mc_nstripes = nstripes;
	MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
	return (0);
}

/*
 * mblock_free
 *
 * Free memory in mc that was allocated by mblock_alloc.
 */

static void
mblock_free(mpo_config_t *mc)
{
	if (mc->mc_alloc_sz > 0) {
		ASSERT(mc->mc_mblocks != mpo_mblock);
		kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
	}
	bzero(mc, sizeof (*mc));
}

/*
 * mblock_install
 *
 * Install mblock config passed in mc as the global configuration.
 * May only be called at boot or while holding mpo_wr_lock.
 */

static void
mblock_install(mpo_config_t *mc)
{
	mpo_mblock = mc->mc_mblocks;
	n_mblocks = mc->mc_nmblocks;
	mem_stripes = mc->mc_stripes;
	n_mem_stripes = mc->mc_nstripes;
	base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
	mpo_config = *mc;
}

/*
 * mblock_update
 *
 * Traverse mblocknodes, read the mblock properties from the MD, and
 * save the mblocks in mc.
 */

static void
mblock_update(mpo_config_t *mc, md_t *md, mde_cookie_t *mblocknodes)
{
	uint64_t i, j;
	int result = 0;
	mblock_md_t *mblock = mc->mc_mblocks;

	for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[j], PROP_LG_BASE,
		    &mblock[i].base);
		if (result < 0) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_BASE is missing\n");
			mc->mc_nmblocks = 0;
			return;
		}

		result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
		    &mblock[i].size);
		if (result < 0) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_SIZE is missing\n");
			mc->mc_nmblocks = 0;
			return;
		}

		result = get_int(md, mblocknodes[j],
		    PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mblock[i].base,
		    mblock[i].size,
		    mblock[i].ra_to_pa);

		/* check for unsupportable values of base and size */
		if (mblock[i].base > mblock[i].base + mblock[i].size) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
			    "base = %lx, size = %lx\n",
			    mblock[i].base, mblock[i].size);
			mc->mc_nmblocks = 0;
			return;
		}

		/* eliminate size==0 blocks */
		if (mblock[i].size != 0) {
			uint64_t base = mblock[i].base;
			uint64_t end = base + mblock[i].size;
			ASSERT(end > base);
			mblock[i].base_pfn = btop(base);
			mblock[i].end_pfn = btop(end - 1);
			i++;
		}
	}

	if (i == 0) {
		MPO_STATUS("mblock_update: "
		    "No non-empty mblock nodes were found "
		    "in the Machine Descriptor\n");
		mc->mc_nmblocks = 0;
		return;
	}
	ASSERT(i <= mc->mc_nmblocks);
	mc->mc_nmblocks = i;

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mblock, mc->mc_nmblocks);
}

/*
 * mblock_update_add
 *
 * Update mblock config after a memory DR add.  The added range is not
 * needed, as we read *all* mblock nodes from the MD.  Save the mblocks
 * in mc.
 */

static void
mblock_update_add(mpo_config_t *mc)
{
	md_t *md;
	mde_cookie_t root, *mblocknodes;
	int nmblocks = 0;

	if ((md = md_get_handle()) == NULL) {
		MPO_STATUS("Cannot access Machine Descriptor\n");
		goto error;
	}

	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
		goto error;

	nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
	    &mblocknodes);
	if (nmblocks <= 0) {
		MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
		goto error;
	}

	if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
		goto error;

	mblock_update(mc, md, mblocknodes);
	md_free_scan_dag(md, &mblocknodes);
	(void) md_fini_handle(md);
	return;
error:
	panic("mblock_update_add: cannot process mblocks from MD.\n");
}

/*
 * mblock_update_del
 *
 * Update mblocks after a memory DR deletion of the range (ubase, uend).
 * Allocate a new mblock config, copy old config to the new, modify the new
 * mblocks to reflect the deletion.  The new mblocks are returned in
 * mc_new and are not yet installed as the active config.
 */

static void
mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
    pfn_t uend)
{
	int i, j;
	pfn_t base, end;
	mblock_md_t *mblock;
	int nmblocks = mc_old->mc_nmblocks;

	MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);

	/*
	 * Allocate mblocks in mc_new and copy the old to the new.
	 * Allocate one extra in case the deletion splits an mblock.
	 */
	if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
		return;
	mblock = mc_new->mc_mblocks;
	bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));

	/*
	 * Find the mblock containing the deleted range and adjust it in
	 * the new config.
	 */
	for (i = 0; i < nmblocks; i++) {

		base = btop(mblock[i].base);
		end = base + btop(mblock[i].size) - 1;

		/*
		 * Adjust the mblock based on the subset that was deleted.
		 *
		 * If the entire mblk was deleted, compact the table.
		 *
		 * If the middle of the mblk was deleted, extend
		 * the table.  Space for the new slot was already
		 * allocated.
		 *
		 * The memory to be deleted is an mblock or a subset of
		 * one and does not span multiple mblocks.
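		 *
		 * For illustration only (hypothetical pfn values): deleting
		 * [0x80000, 0xfffff] from an mblock covering [0x0, 0x17ffff]
		 * is the "middle" case below; the mblock is split into
		 * [0x0, 0x7ffff] and [0x100000, 0x17ffff], using the extra
		 * slot allocated above.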
		 */
		if (base == ubase && end == uend) {
			for (j = i; j < nmblocks - 1; j++)
				mblock[j] = mblock[j + 1];
			nmblocks--;
			bzero(&mblock[nmblocks], sizeof (*mblock));
			break;
		} else if (base < ubase && end > uend) {
			for (j = nmblocks - 1; j >= i; j--)
				mblock[j + 1] = mblock[j];
			mblock[i].size = ptob(ubase - base);
			mblock[i].end_pfn = ubase - 1;
			mblock[i + 1].base = ptob(uend + 1);
			mblock[i + 1].size = ptob(end - uend);
			mblock[i + 1].base_pfn = uend + 1;
			nmblocks++;
			break;
		} else if (base == ubase) {
			MPO_DEBUG("mblock_update_del: shrink>"
			    " i=%d base=0x%lx end=0x%lx", i, base, end);
			mblock[i].base = ptob(uend + 1);
			mblock[i].size -= ptob(uend - ubase + 1);
			base = uend + 1;
			mblock[i].base_pfn = base;
			mblock[i].end_pfn = end;
			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
			break;
		} else if (end == uend) {
			MPO_DEBUG("mblock_update_del: shrink<"
			    " i=%d base=0x%lx end=0x%lx", i, base, end);
			mblock[i].size -= ptob(uend - ubase + 1);
			end = ubase - 1;
			mblock[i].base_pfn = base;
			mblock[i].end_pfn = end;
			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
			break;
		}
	}
	mc_new->mc_nmblocks = nmblocks;
	ASSERT(end > base);
}

/*
 * mstripe_update
 *
 * Read mblocks from mc and update mstripes in mc
 */

static void
mstripe_update(mpo_config_t *mc)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode;
	uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;
	mblock_md_t *mblock = mc->mc_mblocks;
	int nmblocks = mc->mc_nmblocks;
	int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);

	/* Check for non-MPO sun4v platforms or memory DR removal */
	if (n_locality_groups <= 1) {
		ASSERT(n_locality_groups == 1);
		ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);

		if (nmblocks == 1) {
			mc->mc_nstripes = 0;
		} else {
			mc->mc_nstripes = nmblocks;
			bzero(mc->mc_stripes, mstripesz);
			for (i = 0; i < nmblocks; i++) {
				mc->mc_stripes[i].exists = 1;
				mc->mc_stripes[i].physbase = mblock[i].base_pfn;
				mc->mc_stripes[i].physmax = mblock[i].end_pfn;
			}
		}
		return;
	}

	bzero(mc->mc_stripes, mstripesz);
	mc->mc_nstripes = max_locality_groups * nmblocks;
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	for (i = 0; i < nmblocks; i++) {
		base = mblock[i].base;
		end = base + mblock[i].size;
		ra_to_pa = mblock[i].ra_to_pa;

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
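			 *
			 * For illustration only (hypothetical sizes, with
			 * stride == 4 * stripe): if the area past the last
			 * whole stride is 2.5 stripes, then frag is
			 * 2.5 * stripe, so remove == frag - stripe ==
			 * 1.5 * stripe and this mnode keeps only its own
			 * (whole) stripe of that partial stride.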
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mc->mc_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}
}

#define	INTERSECT(a, b, c, d)				\
	if (((a) >= (c) && (a) <= (d)) ||		\
	    ((c) >= (a) && (c) <= (b))) {		\
		(c) = MAX((a), (c));			\
		(d) = MIN((b), (d));			\
	} else {					\
		ASSERT((a) >= (d) || (b) <= (c));	\
		continue;				\
	}						\

/*
 * mnode_update
 *
 * Read stripes from mc and update mnode extents.  The mnode extents are
 * part of the live configuration, so this can only be done at boot time
 * or while holding the mpo_wr_lock.
 */

static void
mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
{
	int i, j, mnode, found;
	pfn_t base, end;
	mem_stripe_t *ms;

	MPO_DEBUG("mnode_update: basepfn: %lx  endpfn: %lx\n", ubase, uend);

	if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
		if (utype == U_ADD)
			mpo_mem_node_add_slice(ubase, uend);
		else if (utype == U_DEL)
			mpo_mem_node_del_slice(ubase, uend);
		else
			panic("mnode update: %d: invalid\n", utype);
		return;
	}

	found = 0;
	for (i = 0; i < mc->mc_nmblocks; i++) {
		for (mnode = 0; mnode < max_locality_groups; mnode++) {

			j = i * max_locality_groups + mnode;
			ms = &mc->mc_stripes[j];
			if (!ms->exists)
				continue;

			base = ms->physbase;
			end = ms->physmax;

			/*
			 * Look for the mstripes intersecting this slice.
			 *
			 * The mstripe and slice pairs may not be equal
			 * if a subset of an mblock is added/deleted.
			 */
			switch (utype) {
			case U_ADD:
				INTERSECT(ubase, uend, base, end);
				/*FALLTHROUGH*/
			case U_ADD_ALL:
				if (n_locality_groups > 1)
					mpo_plat_assign_lgrphand_to_mem_node(
					    mnode, mnode);
				mpo_mem_node_add_slice(base, end);
				break;
			case U_DEL:
				INTERSECT(ubase, uend, base, end);
				mpo_mem_node_del_slice(base, end);
				break;
			default:
				panic("mnode_update: %d: invalid\n", utype);
				break;
			}

			found++;
		}
	}

	if (!found)
		panic("mnode_update: mstripe not found");

#ifdef DEBUG
	if (utype == U_ADD_ALL || utype == U_DEL)
		return;
	found = 0;
	for (i = 0; i < max_mem_nodes; i++) {
		if (!mem_node_config[i].exists)
			continue;
		if (ubase >= mem_node_config[i].physbase &&
		    ubase <= mem_node_config[i].physmax)
			found |= 1;
		if (uend >= mem_node_config[i].physbase &&
		    uend <= mem_node_config[i].physmax)
			found |= 2;
	}
	ASSERT(found == 3);
	{
		pfn_t	minpfn, maxpfn;

		mem_node_max_range(&minpfn, &maxpfn);
		ASSERT(minpfn <= ubase);
		ASSERT(maxpfn >= uend);
	}
#endif
}

/*
 * Plat_slice_add()/plat_slice_del() are the platform hooks
 * for adding/deleting a pfn range to/from the system.
 *
 * Plat_slice_add() is used for both the boot and DR cases.
 *
 * - Zeus has already added the mblocks to the MD, so read the updated
 *   MD and allocate all data structures required to manage the new memory
 *   configuration.
 *
 * - Recompute the stripes which are derived from the mblocks.
 *
 * - Update (expand) the mnode extents and install the modified mblocks as
 *   the new mpo config.  This must be done while holding the mpo_wr_lock
 *   to guarantee that no other threads access the mpo meta-data.
 *
 * - Unlock MPO data structures; the new config is live.  Free the old config.
 *
 * Plat_slice_del() is used for DR only.
 *
 * - Zeus has not yet modified the MD to reflect the deletion, so copy
 *   the old mpo mblocks and delete the range from the copy.
 *
 * - Recompute the stripes which are derived from the mblocks.
 *
 * - Update (shrink) the mnode extents and install the modified mblocks as
 *   the new mpo config.  This must be done while holding the mpo_wr_lock
 *   to guarantee that no other threads access the mpo meta-data.
 *
 * - Unlock MPO data structures; the new config is live.  Free the old config.
 */

void
plat_slice_add(pfn_t base, pfn_t end)
{
	mpo_config_t old_config = mpo_config;
	mpo_config_t new_config;

	VALIDATE_SLICE(base, end);
	mblock_update_add(&new_config);
	mstripe_update(&new_config);
	mpo_wr_lock();
	mblock_install(&new_config);
	/* Use new config to add all ranges for mnode_update */
	mnode_update(&new_config, base, end, U_ADD);
	mpo_genid++;
	mpo_wr_unlock();
	mblock_free(&old_config);
}

void
plat_slice_del(pfn_t base, pfn_t end)
{
	mpo_config_t old_config = mpo_config;
	mpo_config_t new_config;

	VALIDATE_SLICE(base, end);
	mblock_update_del(&new_config, &old_config, base, end);
	mstripe_update(&new_config);
	mpo_wr_lock();
	/* Use old config to find deleted range for mnode_update */
	mnode_update(&old_config, base, end, U_DEL);
	mblock_install(&new_config);
	mpo_genid++;
	mpo_wr_unlock();
	mblock_free(&old_config);
}