/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_impl.h>	/* for PCI configuration space macros */
#include <sys/param.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(chip_plat_get_chipid(cpu))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */


/*
 * Multiprocessor Opteron machines have Non-Uniform Memory Access (NUMA).
 *
 * Until the System Resource Affinity Table (SRAT) becomes part of the ACPI
 * standard, we need to examine registers in PCI configuration space to
 * determine how many nodes are in the system and which CPUs and memory are
 * in each node.  This could be determined by probing all memory from each
 * CPU, but that is too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is Opteron K8 specific and not guaranteed to work on
 *	 the next generation Opteron processor.  Furthermore, we assume that
 *	 there is one CPU per node and CPU 0 is in node 0, CPU 1 is in node 1,
 *	 etc., which should be true for Opteron K8....
 */
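/*
 * All of the locality information below is discovered with the classic x86
 * "mechanism 1" configuration space access pattern used throughout this
 * file: write the target bus/device/function/offset to the PCI_CONFADD I/O
 * port, then read the PCI_CONFDATA I/O port.  A minimal sketch of the
 * pattern (illustrative only, not compiled into the kernel):
 */
#if 0
static uint32_t
pci_cfg_read_example(uint_t bus, uint_t dev, uint_t func, uint_t off)
{
	/* Select the configuration register to access... */
	outl(PCI_CONFADD, PCI_CADDR1(bus, dev, func, off));
	/* ...then read its contents through the data port */
	return (inl(PCI_CONFDATA));
}
#endif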
/*
 * Opteron DRAM Address Map in PCI configuration space gives base and limit
 * of physical memory in each node for Opteron K8.  The following constants
 * and macros define their contents, structure, and access.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_MASK_OFF		0xFFFFFF /* offset for address */

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_MASK_INTRLVEN	0x700	/* interleave */

#define	OPT_DRAMBASE_MASK_ADDR		0xFFFF0000 /* address bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE(reg) \
	(((u_longlong_t)reg & OPT_DRAMBASE_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_MASK_INTRLVSEL	0x70	/* interleave select */
#define	OPT_DRAMLIMIT_MASK_ADDR		0xFFFF0000 /* addr bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT(reg) \
	(((u_longlong_t)reg & OPT_DRAMLIMIT_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
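/*
 * Worked example with hypothetical register contents (illustrative only):
 * a DRAM base register of 0x00000003 has RE and WE set and address bits of
 * zero, so OPT_DRAMBASE() yields physical base 0x0.  A matching limit
 * register of 0x003F0000 gives OPT_DRAMLIMIT() == 0x3F0000ULL << 8 ==
 * 0x3F000000, and OR-ing in OPT_DRAMADDR_MASK_OFF fills the low 24 bits to
 * produce the inclusive limit 0x3FFFFFFF (1GB - 1).  Likewise, a Node ID
 * register of 0x00000030 decodes via OPT_NODE_CNT() to (0x30 >> 4) == 3,
 * which means a 4 node system since the field holds node count minus 1.
 */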
/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */


/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef struct opt_dram_addr_map {
	uint32_t	base;
	uint32_t	limit;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t		opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
int		lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t		lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t	lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t	lgrp_plat_flush_cost = 0;
hrtime_t	lgrp_plat_probe_cost = 0;
hrtime_t	lgrp_plat_probe_cost_total = 0;

/*
 * Error code for latency adjustment and verification
 */
int		lgrp_plat_probe_error_code = 0;

/*
 * How much latencies were off from minimum values gotten
 */
hrtime_t	lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t		lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t		lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int		lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int		lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
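/*
 * The probe parameters above and below are plain kernel variables, so on
 * Solaris they could in principle be tuned without a rebuild, e.g. from
 * /etc/system (hypothetical values, illustrative only; whether a given
 * setting takes effect depends on when it is consumed during boot):
 *
 *	set lgrp_plat_probe_nrounds = 16
 *	set lgrp_plat_probe_nsamples = 4
 */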
/*
 * Number of samples to take when probing each node
 */
int		lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;

/*
 * PFN of page in each node for probing
 */
pfn_t		lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (ie. not within tolerance of value that it
 * should match)
 */
int		lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t	lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t	lgrp_plat_probe_time_max = 0;
hrtime_t	lgrp_plat_probe_time_min = -1;
hrtime_t	lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t	lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats	lgrp_stats[NLGRP];

#define	CPUID_FAMILY_OPTERON	15

uint_t	opt_family = 0;
uint_t	opt_model = 0;
uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on an AMD Opteron K8 machine
 */
int
is_opteron(void)
{
	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	if (cpuid_getfamily(CPU) == CPUID_FAMILY_OPTERON)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}
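/*
 * For example (hypothetical node ranges, illustrative only): if node 0
 * covers PFNs [0x0, 0x7FFFF] and node 1 covers [0x80000, 0xFFFFF], then
 * plat_pfn_to_mem_node(0x90000) returns 1.  A PFN outside every node's
 * range trips the ASSERT on DEBUG kernels and returns -1 otherwise.
 */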
/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start, cur_end;	/* start & end addr of subrange */
	pfn_t	start, end;		/* start & end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so need to split any memory range
		 * that spans multiple memnodes into subranges that are each
		 * contained within one memnode when feeding them to
		 * mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);
			ASSERT(cur_start >=
			    lgrp_plat_node_memory[node].start &&
			    cur_start <= lgrp_plat_node_memory[node].end);

			cur_end = end;

			/*
			 * End of current subrange should not span memnodes
			 */
			if (cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}
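/*
 * For example (hypothetical ranges, illustrative only): with node 0 ending
 * at PFN 0x7FFFF and node 1 starting at PFN 0x80000, a boot memlist entry
 * spanning PFNs [0x70000, 0x8FFFF] is fed to mem_node_add_slice() as two
 * slices, [0x70000, 0x7FFFF] for node 0 and [0x80000, 0x8FFFF] for node 1.
 */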
/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
	uint_t	bus;
	uint_t	dev;
	uint_t	node;
	uint_t	off;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off = OPT_PCS_OFF_DRAMBASE;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID));
	opt_node_info[0] = inl(PCI_CONFDATA);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			outl(PCI_CONFADD, PCI_CADDR1(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID));
			opt_node_info[node] = inl(PCI_CONFDATA);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_ADDRMAP,
		    off));
		opt_dram_map[node].base = inl(PCI_CONFDATA);
		if (opt_dram_map[node].base & OPT_DRAMBASE_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off += 4;	/* limit register offset */
		outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_ADDRMAP,
		    off));
		opt_dram_map[node].limit = inl(PCI_CONFDATA);

		/*
		 * Increment device number to next node and register offset
		 * for DRAM base register of next node
		 */
		off += 4;
		dev++;

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));

		/*
		 * Remember physical address range of each node for use later
		 */
		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));
		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMLIMIT(opt_dram_map[node].limit) |
		    OPT_DRAMADDR_MASK_OFF);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of avoiding
		 * any anomalies in probing from messing up the lgroup
		 * topology by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.  Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
}
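/*
 * To make the loop above concrete (hypothetical register values,
 * illustrative only): if node 1's DRAM base register reads 0x00400003 and
 * its limit register reads 0x007F0001, then OPT_DRAMBASE() gives base
 * 0x40000000 (1GB) and OPT_DRAMLIMIT() | OPT_DRAMADDR_MASK_OFF gives limit
 * 0x7FFFFFFF (2GB - 1), so lgrp_plat_node_memory[1] spans btop(0x40000000)
 * through btop(0x7FFFFFFF).
 */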
/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered the same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
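/*
 * With the default shift of 4 the tolerance is 1/16th (6.25%) of the
 * smaller latency.  For example (hypothetical times, illustrative only):
 * symmetric probe times of 105 and 100 differ by 5, within the tolerance
 * of 100 >> 4 == 6, so both simply become 100; times of 110 and 100 differ
 * by 10, over the tolerance, so both still become 100 but the pair is also
 * marked suspect in lgrp_plat_probe_suspect[][].
 */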
/*
 * Adjust latencies between nodes to be symmetric, normalize latencies
 * between any nodes that are within some tolerance to be the same, and make
 * local latencies the same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LATENCY_CHANGE;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is an UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart and make the latencies be
	 * the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			int	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(cflag, local, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since just adjusted latencies
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}
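/*
 * Putting it together (hypothetical times, illustrative only): suppose a
 * 2 node system probes as
 *
 *	{ {  90, 160 },		which after adjustment is	{ {  85, 155 },
 *	  { 155,  85 } }					  { 155,  85 } }
 *
 * The remote pair (160, 155) is made symmetric at the minimum 155, and the
 * local latencies (90, 85) are both forced to the minimum local value 85.
 */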
/*
 * Verify following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is an UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with less than
	 * 4 nodes (which means less than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether done probing in order to verify number of
	 * latencies are correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if haven't seen
					 * before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}
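/*
 * As an example of the count check above (illustrative only): Opteron K8
 * nodes are typically connected in a ring, per the assumption noted in
 * lgrp_plat_init(), so a 4 node system normally yields three distinct
 * latencies (local, one hop, two hops).  That satisfies the constraints
 * for node counts > 2: more than 2 unique latencies but fewer than the
 * number of nodes.
 */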
/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}


/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}
/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if got times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fallback to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}
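/*
 * By way of example (defaults shown, illustrative only): with the default
 * lgrp_plat_probe_nrounds of 64 and lgrp_plat_probe_nsamples of 1, a
 * 2 node system makes 64 * 2 == 128 calls to lgrp_plat_probe_time() per
 * probing CPU, and only the smallest time observed for each (from, to)
 * pair is kept, since interrupts and other noise can only inflate a
 * sample, never shrink it.
 */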
/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}

	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for each sample taken to
	 * determine lgroup topology
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate one kernel virtual page
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}

/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	lgrp = &lgrp_space[nlgrps_alloc++];
	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
		return (NULL);
	return (lgrp);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	return (MEM_NODE_2_LGRPHAND(mnode));
}

/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number
 * of nodes.  Once topology is known it returns the actual maximum number of
 * lgrps created.  Since x86 doesn't support dynamic addition of new nodes,
 * this number may not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}
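/*
 * Worked example for the estimate above: with MAX_NODES of 8, NLGRP is
 * 8 * 7 + 1 == 57 statically allocated lgroups; before the topology is
 * initialized, a 4 node system would estimate 4 * 3 + 1 == 13 possible
 * lgroups from the same formula.
 */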
/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t	npgs = (pgcnt_t)0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = MNODE_PGCNT(mnode);
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}
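/*
 * A minimal usage sketch of the queries above (illustrative only, not
 * compiled into the kernel; assumes a valid lgroup platform handle "hand"):
 */
#if 0
static void
lgrp_plat_mem_size_usage_sketch(lgrp_handle_t hand)
{
	pgcnt_t	free_pgs = lgrp_plat_mem_size(hand, LGRP_MEM_SIZE_FREE);
	pgcnt_t	avail_pgs = lgrp_plat_mem_size(hand, LGRP_MEM_SIZE_AVAIL);
	pgcnt_t	inst_pgs = lgrp_plat_mem_size(hand, LGRP_MEM_SIZE_INSTALL);

	prom_printf("lgrp %ld: free %lu avail %lu installed %lu\n",
	    (long)hand, (ulong_t)free_pgs, (ulong_t)avail_pgs,
	    (ulong_t)inst_pgs);
}
#endif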
/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}