/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_impl.h>	/* for PCI configuration space macros */
#include <sys/param.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(chip_plat_get_chipid(cpu))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */


/*
 * Multiprocessor Opteron machines have Non-Uniform Memory Access (NUMA).
 *
 * Until the System Resource Affinity Table (SRAT) becomes part of the ACPI
 * standard, we need to examine registers in PCI configuration space to
 * determine how many nodes are in the system and which CPUs and memory are
 * in each node.  This could be determined by probing all memory from each
 * CPU, but that is too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is Opteron K8 specific and not guaranteed to work on
 *	 the next generation Opteron processor.  Furthermore, we assume that
 *	 there is one CPU per node and CPU 0 is in node 0, CPU 1 is in node 1,
 *	 etc. which should be true for Opteron K8....
 */

/*
 * Opteron DRAM Address Map in PCI configuration space gives base and limit
 * of physical memory in each node for Opteron K8.  The following constants
 * and macros define their contents, structure, and access.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_MASK_INTRLVEN	0x700	/* interleave */

#define	OPT_DRAMBASE_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE(reg) \
	(((u_longlong_t)reg & OPT_DRAMBASE_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_MASK_INTRLVSEL	0x70	/* interleave select */
#define	OPT_DRAMLIMIT_MASK_ADDR	0xFFFF0000	/* addr bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT(reg) \
	(((u_longlong_t)reg & OPT_DRAMLIMIT_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)
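/*
 * For illustration (hypothetical register values, not read from hardware):
 * a DRAM base register of 0x00400003 has the read and write enable bits set
 * and an address field of 0x0040, so
 *
 *	OPT_DRAMBASE(0x00400003) = (0x00400000ULL << 8) = 0x40000000 (1 GB)
 *
 * and a matching limit register of 0x007F0001 (destination node 1) gives
 *
 *	OPT_DRAMLIMIT(0x007F0001) | OPT_DRAMADDR_MASK_OFF
 *	    = 0x7F000000 | 0xFFFFFF = 0x7FFFFFFF (2 GB - 1)
 *
 * so that node would own physical addresses [1 GB, 2 GB).  lgrp_plat_init()
 * below decodes the real registers in exactly this way.
 */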
/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
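/*
 * For illustration (hypothetical value): a Node ID register reading of
 * 0x00030030 on node 0 decodes as node ID 0, a node count field of
 *
 *	OPT_NODE_CNT(0x00030030) = (0x30 >> 4) = 3
 *
 * and a CPU count field of 3.  Both count fields are stored minus one, so
 * this describes a 4-node, 4-CPU system; lgrp_plat_init() below adds the 1
 * back when computing lgrp_plat_node_cnt.
 */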
/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */
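/*
 * The code below open-codes PCI configuration space mechanism #1 accesses:
 * write a bus/device/function/offset address to PCI_CONFADD, then read the
 * register contents from PCI_CONFDATA.  A minimal sketch of that pattern,
 * for reference only (this helper is illustrative and is not defined or
 * used elsewhere in this file):
 *
 *	static uint_t
 *	opt_pcs_read(uint_t node, uint_t func, uint_t off)
 *	{
 *		outl(PCI_CONFADD, PCI_CADDR1(OPT_PCS_BUS_CONFIG,
 *		    OPT_PCS_DEV_NODE0 + node, func, off));
 *		return (inl(PCI_CONFDATA));
 *	}
 *
 * Each Opteron node's Northbridge appears as PCI device 24 + node on bus 0,
 * with functions 0-3 selecting its Hypertransport, address map, DRAM, and
 * miscellaneous configuration blocks.
 */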
/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef struct opt_dram_addr_map {
	uint32_t	base;
	uint32_t	limit;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t	opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
int	lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t	lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t	lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t	lgrp_plat_flush_cost = 0;
hrtime_t	lgrp_plat_probe_cost = 0;
hrtime_t	lgrp_plat_probe_cost_total = 0;

/*
 * Error code for latency adjustment and verification
 */
int	lgrp_plat_probe_error_code = 0;

/*
 * How far latencies were off from the minimum values observed
 */
hrtime_t	lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t	lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t	lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int	lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int	lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;

/*
 * Number of samples to take when probing each node
 */
int	lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;

/*
 * PFN of page in each node for probing
 */
pfn_t	lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (ie. not within tolerance of the value
 * that it should match)
 */
int	lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t	lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t	lgrp_plat_probe_time_max = 0;
hrtime_t	lgrp_plat_probe_time_min = -1;
hrtime_t	lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t	lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats	lgrp_stats[NLGRP];

#define	CPUID_FAMILY_OPTERON	15

uint_t	opt_family = 0;
uint_t	opt_model = 0;
uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on an AMD Opteron K8 machine
 */
int
is_opteron(void)
{
	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	if (cpuid_getfamily(CPU) == CPUID_FAMILY_OPTERON)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	/*
	 * Every pfn should fall within some node's memory range, so the
	 * following ASSERT fires on DEBUG kernels if we get here
	 */
	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}

/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start, cur_end;	/* start & end addr of subrange */
	pfn_t	start, end;		/* start & end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so need to split any memory range
		 * that spans multiple memnodes into subranges that are each
		 * contained within one memnode when feeding them to
		 * mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);
			ASSERT(cur_start >=
			    lgrp_plat_node_memory[node].start &&
			    cur_start <= lgrp_plat_node_memory[node].end);

			cur_end = end;

			/*
			 * End of current subrange should not span memnodes
			 */
			if (cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}
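/*
 * For illustration (hypothetical node ranges): if node 0 covers PFNs
 * [0x00000, 0x0FFFF] and node 1 covers [0x10000, 0x1FFFF], a boot memlist
 * entry spanning PFNs [0x0F000, 0x12FFF] is fed to mem_node_add_slice()
 * as two subranges by the loop above:
 *
 *	mem_node_add_slice(0x0F000, 0x0FFFF);	(node 0)
 *	mem_node_add_slice(0x10000, 0x12FFF);	(node 1)
 */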
/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
	uint_t	bus;
	uint_t	dev;
	uint_t	node;
	uint_t	off;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off = OPT_PCS_OFF_DRAMBASE;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID));
	opt_node_info[0] = inl(PCI_CONFDATA);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			outl(PCI_CONFADD, PCI_CADDR1(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID));
			opt_node_info[node] = inl(PCI_CONFDATA);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_ADDRMAP,
		    off));
		opt_dram_map[node].base = inl(PCI_CONFDATA);
		if (opt_dram_map[node].base & OPT_DRAMBASE_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off += 4;	/* limit register offset */
		outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_ADDRMAP,
		    off));
		opt_dram_map[node].limit = inl(PCI_CONFDATA);

		/*
		 * Increment device number to next node and register offset
		 * for DRAM base register of next node
		 */
		off += 4;
		dev++;

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));

		/*
		 * Remember physical address range of each node for use later
		 */
		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));
		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMLIMIT(opt_dram_map[node].limit) |
		    OPT_DRAMADDR_MASK_OFF);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of avoiding
		 * any anomalies in probing from messing up the lgroup
		 * topology by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.  Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered the same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
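/*
 * With a shift of 4 the tolerance is 1/16 (6.25%).  For example (made-up
 * probe times in nanoseconds): latencies of 100 and 105 differ by 5, which
 * is within 100 >> 4 = 6, so they are simply normalized to the same value;
 * 100 and 110 differ by 10, which exceeds the tolerance, so the larger time
 * is still corrected down to the minimum but the pair is also counted as
 * suspect in lgrp_plat_probe_suspect[] by the code below.
 */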
/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be the same, and make local
 * latencies be the same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LATENCY_CHANGE;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is a UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart and make the latencies be
	 * the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			hrtime_t	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(cflag, local, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since just adjusted latencies
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}
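/*
 * For illustration (made-up probe times): on a 2-node machine that probed
 *
 *	lgrp_plat_probe_times = { { 90, 104 },
 *				  { 101, 92 } }
 *
 * the symmetry pass above sets both remote latencies to min(104, 101) = 101,
 * and the local pass sets both local latencies to min(90, 92) = 90, leaving
 *
 *	lgrp_plat_probe_times = { { 90, 101 },
 *				  { 101, 90 } }
 *
 * Neither correction exceeds the 1/16 tolerance, so nothing is marked
 * suspect.
 */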
/*
 * Verify following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is a UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with less than
	 * 4 nodes (which means less than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether done probing in order to verify number of
	 * latencies are correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if haven't seen
					 * before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}
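/*
 * For illustration (made-up but plausible times): on a 4-node system wired
 * as a square, each node sees one local latency, two 1-hop latencies, and
 * one 2-hop latency, e.g. 90, 101, and 113 ns.  That yields 3 unique
 * latencies (more than 2 but less than the node count) and each value
 * occurs more than once, so the checks above return 0.
 */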
/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}
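/*
 * The values 2 and 3 above are arbitrary relative tokens, not measured
 * times: all that matters for a flat local/remote topology is that every
 * local latency is the same and strictly less than every remote latency,
 * e.g. for 2 nodes
 *
 *	lgrp_plat_probe_times = { { 2, 3 },
 *				  { 3, 2 } }
 */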
/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}
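/*
 * Returning the minimum sample (rather than the mean) filters out samples
 * inflated by interrupts, cache effects, or bus contention.  For example
 * (made-up samples): if lgrp_plat_probe_nsamples were set to 3 and three
 * reads of a remote Northbridge took 420, 980, and 432 ns, the 980 ns
 * outlier would be discarded and 420 ns taken as the latency estimate for
 * that node pair.
 */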
/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if got times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fall back to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}
/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}

	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for each sample taken while
	 * determining the lgroup topology
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate kernel virtual memory for probing (one page per
		 * sample)
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page(s) to first page(s) in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}
/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	/*
	 * Check bounds before handing out a slot so a failed allocation
	 * doesn't consume or index past the end of lgrp_space[]
	 */
	if (lgrpid >= NLGRP || nlgrps_alloc >= NLGRP)
		return (NULL);

	return (&lgrp_space[nlgrps_alloc++]);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	return (MEM_NODE_2_LGRPHAND(mnode));
}

/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number
 * of nodes.  Once topology is known it returns the actual maximum number of
 * lgrps created.  Since x86 doesn't support dynamic addition of new nodes,
 * this number may not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}
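/*
 * For example, on a 4-node system the pre-topology estimate above is
 * 4 * 3 + 1 = 13 lgroups, and the static lgrp_space[] array is sized for
 * the worst case of NLGRP = MAX_NODES * (MAX_NODES - 1) + 1 = 8 * 7 + 1
 * = 57.
 */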
/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t	npgs = (pgcnt_t)0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = mem_node_config[mnode].cursize;
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}
/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number were at least proportional to make comparisons more meaningful
 * though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}