/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_impl.h>	/* for PCI configuration space macros */
#include <sys/param.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(chip_plat_get_chipid(cpu))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
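
/*
 * Example (simple arithmetic on the constants above, for illustration):
 * with MAX_NODES == 8, NLGRP == 8 * 7 + 1 == 57 lgroups are statically
 * allocated below.
 */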

/*
 * Multiprocessor Opteron machines have Non Uniform Memory Access (NUMA).
 *
 * Until the System Resource Affinity Table (SRAT) becomes part of the ACPI
 * standard, we need to examine registers in PCI configuration space to
 * determine how many nodes are in the system and which CPUs and memory are
 * in each node.  This could be determined by probing all memory from each
 * CPU, but that is too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is Opteron K8 specific and not guaranteed to work on
 *	 the next generation Opteron processor.  Furthermore, we assume that
 *	 there is one CPU per node and CPU 0 is in node 0, CPU 1 is in node 1,
 *	 etc. which should be true for Opteron K8....
 */

/*
 * Opteron DRAM Address Map in PCI configuration space gives base and limit
 * of physical memory in each node for Opteron K8.  The following constants
 * and macros define their contents, structure, and access.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_MASK_INTRLVEN	0x700	/* interleave */

#define	OPT_DRAMBASE_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE(reg) \
	(((u_longlong_t)reg & OPT_DRAMBASE_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_MASK_INTRLVSEL	0x70	/* interleave select */
#define	OPT_DRAMLIMIT_MASK_ADDR		0xFFFF0000 /* addr bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT(reg) \
	(((u_longlong_t)reg & OPT_DRAMLIMIT_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* HyperTransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
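
/*
 * Worked example (hypothetical register values, for illustration only):
 * a DRAM base register of 0x00100003 has both OPT_DRAMBASE_MASK_RE and
 * OPT_DRAMBASE_MASK_WE set and yields OPT_DRAMBASE(0x00100003) ==
 * (0x00100000ULL << 8) == 0x10000000 (256MB).  A matching limit register
 * of 0x001F0000 yields OPT_DRAMLIMIT(0x001F0000) == 0x1F000000; OR-ing in
 * OPT_DRAMADDR_MASK_OFF gives an inclusive limit of 0x1FFFFFFF, so the
 * node owns physical addresses [256MB, 512MB).  Likewise, a Node ID
 * register with 0x30 in the OPT_NODE_MASK_CNT field gives
 * OPT_NODE_CNT() == 3; lgrp_plat_init() below adds one, meaning a
 * 4-node system.
 */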

/*
 * PCI configuration space registers are accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* HyperTransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* HyperTransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */
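
/*
 * For illustration, a sketch of the access pattern used throughout this
 * file (see lgrp_plat_init() below; node_info here is a hypothetical
 * variable): to read the Node ID register of node 0, write the register
 * address to the PCI configuration address port and then read the data
 * port:
 *
 *	outl(PCI_CONFADD, PCI_CADDR1(OPT_PCS_BUS_CONFIG, OPT_PCS_DEV_NODE0,
 *	    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID));
 *	node_info = inl(PCI_CONFDATA);
 */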

/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef	struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef	enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef	struct opt_dram_addr_map {
	uint32_t	base;
	uint32_t	limit;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef	struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
	int	exists;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t				opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
int				lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t				lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t			lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t			lgrp_plat_flush_cost = 0;
hrtime_t			lgrp_plat_probe_cost = 0;
hrtime_t			lgrp_plat_probe_cost_total = 0;

/*
 * Error code for latency adjustment and verification
 */
int				lgrp_plat_probe_error_code = 0;

/*
 * How far latencies were off from the minimum values observed
 */
hrtime_t			lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t				lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t				lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int				lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int		lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;

/*
 * Number of samples to take when probing each node
 */
int		lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;

/*
 * PFN of page in each node for probing
 */
pfn_t				lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (ie. not within tolerance of value that it
 * should match)
 */
int				lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t			lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t			lgrp_plat_probe_time_max = 0;
hrtime_t			lgrp_plat_probe_time_min = -1;
hrtime_t			lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t			lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats	lgrp_stats[NLGRP];

#define	CPUID_FAMILY_OPTERON	15

uint_t	opt_family = 0;
uint_t	opt_model = 0;
uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on an AMD Opteron K8 machine
 */
int
is_opteron(void)
{
	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	if (cpuid_getfamily(CPU) == CPUID_FAMILY_OPTERON)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Skip nodes with no memory
		 */
		if (!lgrp_plat_node_memory[node].exists)
			continue;

		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}
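
/*
 * Example (hypothetical two-node layout, for illustration): with node 0
 * covering PFNs [0, 0xFFFF] and node 1 covering [0x10000, 0x1FFFF],
 * plat_pfn_to_mem_node(0x12345) returns 1.  A PFN outside both ranges
 * "shouldn't happen"; it returns -1 (and trips the ASSERT above on
 * DEBUG kernels).
 */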

/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start;	/* start addr of subrange */
	pfn_t	cur_end;	/* end addr of subrange */
	pfn_t	start;		/* start addr of whole range */
	pfn_t	end;		/* end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so we need to split any memory
		 * range that spans multiple memnodes into subranges that
		 * are each contained within one memnode when feeding them
		 * to mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);

			/*
			 * Panic if DRAM address map registers or SRAT say
			 * memory in node doesn't exist or address from
			 * boot installed memory list entry isn't in this node.
			 * This shouldn't happen and rest of code can't deal
			 * with this if it does.
			 */
			if (node < 0 || node >= lgrp_plat_node_cnt ||
			    !lgrp_plat_node_memory[node].exists ||
			    cur_start < lgrp_plat_node_memory[node].start ||
			    cur_start > lgrp_plat_node_memory[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			if (lgrp_plat_node_memory[node].exists &&
			    cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}
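
/*
 * Example (hypothetical, for illustration): given a boot memlist entry
 * spanning PFNs [0x0, 0x1FFFF] and two memnodes covering [0x0, 0xFFFF]
 * and [0x10000, 0x1FFFF], the loop above feeds mem_node_add_slice()
 * two subranges, (0x0, 0xFFFF) and (0x10000, 0x1FFFF), so that no
 * slice spans a memnode boundary.
 */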

/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
	uint_t	bus;
	uint_t	dev;
	uint_t	node;
	uint_t	off;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off = OPT_PCS_OFF_DRAMBASE;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID));
	opt_node_info[0] = inl(PCI_CONFDATA);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			outl(PCI_CONFADD, PCI_CADDR1(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID));
			opt_node_info[node] = inl(PCI_CONFDATA);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_ADDRMAP,
		    off));
		opt_dram_map[node].base = inl(PCI_CONFDATA);
		if (opt_dram_map[node].base & OPT_DRAMBASE_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off += 4;	/* limit register offset */
		outl(PCI_CONFADD, PCI_CADDR1(bus, dev, OPT_PCS_FUNC_ADDRMAP,
		    off));
		opt_dram_map[node].limit = inl(PCI_CONFDATA);

		/*
		 * Increment device number to next node and register offset
		 * for DRAM base register of next node
		 */
		off += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((opt_dram_map[node].base & OPT_DRAMBASE_MASK_RE) == 0 ||
		    (opt_dram_map[node].base & OPT_DRAMBASE_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in lgrp_plat_node_memory[]
			 */
			lgrp_plat_node_memory[node].exists = 0;
			lgrp_plat_node_memory[node].start =
			    lgrp_plat_node_memory[node].end = (pfn_t)-1;
			continue;
		}

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		lgrp_plat_node_memory[node].exists = 1;
		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));
		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMLIMIT(opt_dram_map[node].limit) |
		    OPT_DRAMADDR_MASK_OFF);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of avoiding
		 * any anomalies in probing from messing up the lgroup
		 * topology by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.  Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
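
/*
 * Example: with the default shift of 4, two latencies t1 > t2 are treated
 * as the same when t1 - t2 <= (t2 >> 4), i.e. when they differ by no more
 * than 1/16th (6.25%) of the smaller value.
 */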

/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be same, and make local
 * latencies be same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LATENCY_CHANGE;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is a UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart and make the latencies be
	 * the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			hrtime_t	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(cflag, local, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since we just adjusted latencies
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}
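
/*
 * Example (hypothetical probe times, for illustration): if probing saw
 * latency(0, 1) == 105 and latency(1, 0) == 100, the symmetry pass above
 * settles both on 100 and records the 5 units of error in
 * lgrp_plat_probe_errors[0][1]; since 5 <= (100 >> 4), neither direction
 * is marked suspect.
 */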

/*
 * Verify following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is a UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with less than
	 * 4 nodes (which means less than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether done probing in order to verify number of
	 * latencies are correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if haven't seen
					 * before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}


/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}


/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}


/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if got times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fall back to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}
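
/*
 * Note on the defaults (derived from the tunables above, for
 * illustration): with lgrp_plat_probe_nrounds == 64 and
 * lgrp_plat_probe_nsamples == 1, each <from, to> node pair is measured
 * 64 times and only the lowest time observed is kept as the latency.
 */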

/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}

	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory to determine lgroup topology, use one page
	 * per sample so that multiple samples can be taken
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate kernel virtual memory for probing
		 * (one page per sample)
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}
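
/*
 * Note (for illustration): with the default lgrp_plat_probe_nsamples of
 * 1, lgrp_plat_probe_memsize above works out to a single page per node;
 * bumping the sample count scales the probe buffer accordingly.
 */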

/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	if (lgrpid >= NLGRP || nlgrps_alloc >= NLGRP)
		return (NULL);

	lgrp = &lgrp_space[nlgrps_alloc++];
	return (lgrp);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	return (MEM_NODE_2_LGRPHAND(mnode));
}

/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number
 * of nodes.  Once topology is known it returns the actual maximum number of
 * lgrps created.  Since x86 doesn't support dynamic addition of new nodes,
 * this number may not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}
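
/*
 * Example (simple arithmetic, for illustration): before the topology is
 * known, a 4-node system reports at most 4 * 3 + 1 == 13 lgroups from
 * lgrp_plat_max_lgrps(); the static worst case, NLGRP, is 57 for
 * MAX_NODES == 8.
 */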

/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t	npgs = (pgcnt_t)0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = MNODE_PGCNT(mnode);
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}

/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number were at least proportional to make comparisons more meaningful
 * though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}