1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 30 #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ 31 #include <sys/cmn_err.h> 32 #include <sys/controlregs.h> 33 #include <sys/cpupart.h> 34 #include <sys/cpuvar.h> 35 #include <sys/lgrp.h> 36 #include <sys/machsystm.h> 37 #include <sys/memlist.h> 38 #include <sys/memnode.h> 39 #include <sys/mman.h> 40 #include <sys/pci_cfgspace.h> 41 #include <sys/pci_impl.h> 42 #include <sys/param.h> 43 #include <sys/pghw.h> 44 #include <sys/promif.h> /* for prom_printf() */ 45 #include <sys/systm.h> 46 #include <sys/thread.h> 47 #include <sys/types.h> 48 #include <sys/var.h> 49 #include <sys/x86_archext.h> /* for x86_feature and X86_AMD */ 50 #include <sys/sysmacros.h> 51 #include <vm/hat_i86.h> 52 #include <vm/seg_kmem.h> 53 #include <vm/vm_dep.h> 54 55 56 /* 57 * lgroup platform support for x86 platforms. 58 */ 59 60 #define MAX_NODES 8 61 #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1) 62 63 #define LGRP_PLAT_CPU_TO_NODE(cpu) (pg_plat_hw_instance_id(cpu, PGHW_CHIP)) 64 65 #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */ 66 #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */ 67 #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */ 68 69 /* 70 * Multiprocessor Opteron machines have Non Uniform Memory Access (NUMA). 71 * 72 * Until this code supports reading System Resource Affinity Table (SRAT), 73 * we need to examine registers in PCI configuration space to determine how 74 * many nodes are in the system and which CPUs and memory are in each node. 75 * This could be determined by probing all memory from each CPU, but that is 76 * too expensive to do while booting the kernel. 77 * 78 * NOTE: Using these PCI configuration space registers to determine this 79 * locality info is not guaranteed to work on future generations of 80 * Opteron processor. 81 */ 82 83 /* 84 * Opteron DRAM Address Map in PCI configuration space gives base and limit 85 * of physical memory in each node. The following constants and macros define 86 * their contents, structure, and access. 87 */ 88 89 /* 90 * How many bits to shift Opteron DRAM Address Map base and limit registers 91 * to get actual value 92 */ 93 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 94 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 95 96 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 97 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 98 99 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 100 101 /* 102 * Macros to derive addresses from Opteron DRAM Address Map registers 103 */ 104 #define OPT_DRAMADDR_HI(reg) \ 105 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 106 OPT_DRAMADDR_HI_LSHIFT_ADDR) 107 108 #define OPT_DRAMADDR_LO(reg) \ 109 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 110 OPT_DRAMADDR_LO_LSHIFT_ADDR) 111 112 #define OPT_DRAMADDR(high, low) \ 113 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 114 115 /* 116 * Bit masks defining what's in Opteron DRAM Address Map base register 117 */ 118 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 119 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 120 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 121 122 /* 123 * Bit masks defining what's in Opteron DRAM Address Map limit register 124 */ 125 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 126 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 127 128 129 /* 130 * Opteron Node ID register in PCI configuration space contains 131 * number of nodes in system, etc. for Opteron K8. The following 132 * constants and macros define its contents, structure, and access. 133 */ 134 135 /* 136 * Bit masks defining what's in Opteron Node ID register 137 */ 138 #define OPT_NODE_MASK_ID 0x7 /* node ID */ 139 #define OPT_NODE_MASK_CNT 0x70 /* node count */ 140 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 141 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 142 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 143 144 /* 145 * How many bits in Opteron Node ID register to shift right to get actual value 146 */ 147 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 148 149 /* 150 * Macros to get values from Opteron Node ID register 151 */ 152 #define OPT_NODE_CNT(reg) \ 153 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 154 155 /* 156 * Macro to setup PCI Extended Configuration Space (ECS) address to give to 157 * "in/out" instructions 158 * 159 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 160 * other uses should just do MMIO to access PCI ECS. 161 * Must enable special bit in Northbridge Configuration Register on 162 * Greyhound for extended CF8 space access to be able to access PCI ECS 163 * using "in/out" instructions and restore special bit after done 164 * accessing PCI ECS. 165 */ 166 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 167 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 168 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 169 ((((reg) >> 8) & 0xf) << 24)) 170 171 /* 172 * PCI configuration space registers accessed by specifying 173 * a bus, device, function, and offset. The following constants 174 * define the values needed to access Opteron K8 configuration 175 * info to determine its node topology 176 */ 177 178 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 179 180 /* 181 * Opteron PCI configuration space register function values 182 */ 183 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 184 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 185 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 186 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 187 188 /* 189 * PCI Configuration Space register offsets 190 */ 191 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 192 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 193 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 194 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 195 196 /* 197 * Opteron PCI Configuration Space device IDs for nodes 198 */ 199 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 200 201 202 /* 203 * Bookkeeping for latencies seen during probing (used for verification) 204 */ 205 typedef struct lgrp_plat_latency_acct { 206 hrtime_t la_value; /* latency value */ 207 int la_count; /* occurrences */ 208 } lgrp_plat_latency_acct_t; 209 210 211 /* 212 * Choices for probing to determine lgroup topology 213 */ 214 typedef enum lgrp_plat_probe_op { 215 LGRP_PLAT_PROBE_PGCPY, /* Use page copy */ 216 LGRP_PLAT_PROBE_VENDOR /* Read vendor ID on Northbridge */ 217 } lgrp_plat_probe_op_t; 218 219 220 /* 221 * Opteron DRAM address map gives base and limit for physical memory in a node 222 */ 223 typedef struct opt_dram_addr_map { 224 uint32_t base_hi; 225 uint32_t base_lo; 226 uint32_t limit_hi; 227 uint32_t limit_lo; 228 } opt_dram_addr_map_t; 229 230 231 /* 232 * Starting and ending page for physical memory in node 233 */ 234 typedef struct phys_addr_map { 235 pfn_t start; 236 pfn_t end; 237 int exists; 238 } phys_addr_map_t; 239 240 241 /* 242 * Opteron DRAM address map for each node 243 */ 244 struct opt_dram_addr_map opt_dram_map[MAX_NODES]; 245 246 /* 247 * Node ID register contents for each node 248 */ 249 uint_t opt_node_info[MAX_NODES]; 250 251 /* 252 * Whether memory is interleaved across nodes causing MPO to be disabled 253 */ 254 int lgrp_plat_mem_intrlv = 0; 255 256 /* 257 * Number of nodes in system 258 */ 259 uint_t lgrp_plat_node_cnt = 1; 260 261 /* 262 * Physical address range for memory in each node 263 */ 264 phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 265 266 /* 267 * Probe costs (individual and total) and flush cost 268 */ 269 hrtime_t lgrp_plat_flush_cost = 0; 270 hrtime_t lgrp_plat_probe_cost = 0; 271 hrtime_t lgrp_plat_probe_cost_total = 0; 272 273 /* 274 * Error code for latency adjustment and verification 275 */ 276 int lgrp_plat_probe_error_code = 0; 277 278 /* 279 * How much latencies were off from minimum values gotten 280 */ 281 hrtime_t lgrp_plat_probe_errors[MAX_NODES][MAX_NODES]; 282 283 /* 284 * Unique probe latencies and number of occurrences of each 285 */ 286 lgrp_plat_latency_acct_t lgrp_plat_probe_lat_acct[MAX_NODES]; 287 288 /* 289 * Size of memory buffer in each node for probing 290 */ 291 size_t lgrp_plat_probe_memsize = 0; 292 293 /* 294 * Virtual address of page in each node for probing 295 */ 296 caddr_t lgrp_plat_probe_memory[MAX_NODES]; 297 298 /* 299 * Number of unique latencies in probe times 300 */ 301 int lgrp_plat_probe_nlatencies = 0; 302 303 /* 304 * How many rounds of probing to do 305 */ 306 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 307 308 /* 309 * Number of samples to take when probing each node 310 */ 311 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 312 313 /* 314 * Number of times to read vendor ID from Northbridge for each probe. 315 */ 316 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 317 318 /* 319 * How to probe to determine lgroup topology 320 */ 321 lgrp_plat_probe_op_t lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR; 322 323 /* 324 * PFN of page in each node for probing 325 */ 326 pfn_t lgrp_plat_probe_pfn[MAX_NODES]; 327 328 /* 329 * Whether probe time was suspect (ie. not within tolerance of value that it 330 * should match) 331 */ 332 int lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES]; 333 334 /* 335 * How long it takes to access memory from each node 336 */ 337 hrtime_t lgrp_plat_probe_times[MAX_NODES][MAX_NODES]; 338 339 /* 340 * Min and max node memory probe times seen 341 */ 342 hrtime_t lgrp_plat_probe_time_max = 0; 343 hrtime_t lgrp_plat_probe_time_min = -1; 344 hrtime_t lgrp_plat_probe_max[MAX_NODES][MAX_NODES]; 345 hrtime_t lgrp_plat_probe_min[MAX_NODES][MAX_NODES]; 346 347 348 /* 349 * Allocate lgrp and lgrp stat arrays statically. 350 */ 351 static lgrp_t lgrp_space[NLGRP]; 352 static int nlgrps_alloc; 353 354 struct lgrp_stats lgrp_stats[NLGRP]; 355 356 /* 357 * Supported AMD processor families 358 */ 359 #define AMD_FAMILY_HAMMER 15 360 #define AMD_FAMILY_GREYHOUND 16 361 362 /* 363 * Whether to have is_opteron() return 1 even when processor isn't 364 * supported 365 */ 366 uint_t is_opteron_override = 0; 367 368 /* 369 * AMD processor family for current CPU 370 */ 371 uint_t opt_family = 0; 372 373 uint_t opt_probe_func = OPT_PCS_FUNC_DRAM; 374 375 376 /* 377 * Determine whether we're running on a supported AMD Opteron since reading 378 * node count and DRAM address map registers may have different format or 379 * may not be supported in future processor families 380 */ 381 int 382 is_opteron(void) 383 { 384 385 if (x86_vendor != X86_VENDOR_AMD) 386 return (0); 387 388 opt_family = cpuid_getfamily(CPU); 389 if (opt_family == AMD_FAMILY_HAMMER || 390 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 391 return (1); 392 else 393 return (0); 394 } 395 396 int 397 plat_lgrphand_to_mem_node(lgrp_handle_t hand) 398 { 399 if (max_mem_nodes == 1) 400 return (0); 401 402 return ((int)hand); 403 } 404 405 lgrp_handle_t 406 plat_mem_node_to_lgrphand(int mnode) 407 { 408 if (max_mem_nodes == 1) 409 return (LGRP_DEFAULT_HANDLE); 410 411 return ((lgrp_handle_t)mnode); 412 } 413 414 415 /* 416 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 417 * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if 418 * a crossing is found and returns 0 otherwise. 419 */ 420 int 421 plat_mnode_xcheck(pfn_t pfncnt) 422 { 423 int node, prevnode = -1, basenode; 424 pfn_t ea, sa; 425 426 for (node = 0; node < lgrp_plat_node_cnt; node++) { 427 428 if (lgrp_plat_node_memory[node].exists == 0) 429 continue; 430 431 if (prevnode == -1) { 432 prevnode = node; 433 basenode = node; 434 continue; 435 } 436 437 /* assume x86 node pfn ranges are in increasing order */ 438 ASSERT(lgrp_plat_node_memory[node].start > 439 lgrp_plat_node_memory[prevnode].end); 440 441 /* 442 * continue if the starting address of node is not contiguous 443 * with the previous node. 444 */ 445 446 if (lgrp_plat_node_memory[node].start != 447 (lgrp_plat_node_memory[prevnode].end + 1)) { 448 basenode = node; 449 prevnode = node; 450 continue; 451 } 452 453 /* check if the starting address of node is pfncnt aligned */ 454 if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) { 455 456 /* 457 * at this point, node starts at an unaligned boundary 458 * and is contiguous with the previous node(s) to 459 * basenode. Check if there is an aligned contiguous 460 * range of length pfncnt that crosses this boundary. 461 */ 462 463 sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, 464 pfncnt); 465 ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), 466 pfncnt); 467 468 ASSERT((ea - sa) == pfncnt); 469 if (sa >= lgrp_plat_node_memory[basenode].start && 470 ea <= (lgrp_plat_node_memory[node].end + 1)) 471 return (1); 472 } 473 prevnode = node; 474 } 475 return (0); 476 } 477 478 int 479 plat_pfn_to_mem_node(pfn_t pfn) 480 { 481 int node; 482 483 if (max_mem_nodes == 1) 484 return (0); 485 486 for (node = 0; node < lgrp_plat_node_cnt; node++) { 487 /* 488 * Skip nodes with no memory 489 */ 490 if (!lgrp_plat_node_memory[node].exists) 491 continue; 492 493 if (pfn >= lgrp_plat_node_memory[node].start && 494 pfn <= lgrp_plat_node_memory[node].end) 495 return (node); 496 } 497 498 ASSERT(node < lgrp_plat_node_cnt); 499 return (-1); 500 } 501 502 /* 503 * Configure memory nodes for machines with more than one node (ie NUMA) 504 */ 505 void 506 plat_build_mem_nodes(struct memlist *list) 507 { 508 pfn_t cur_start; /* start addr of subrange */ 509 pfn_t cur_end; /* end addr of subrange */ 510 pfn_t start; /* start addr of whole range */ 511 pfn_t end; /* end addr of whole range */ 512 513 /* 514 * Boot install lists are arranged <addr, len>, ... 515 */ 516 while (list) { 517 int node; 518 519 start = list->address >> PAGESHIFT; 520 end = (list->address + list->size - 1) >> PAGESHIFT; 521 522 if (start > physmax) { 523 list = list->next; 524 continue; 525 } 526 if (end > physmax) 527 end = physmax; 528 529 /* 530 * When there is only one memnode, just add memory to memnode 531 */ 532 if (max_mem_nodes == 1) { 533 mem_node_add_slice(start, end); 534 list = list->next; 535 continue; 536 } 537 538 /* 539 * mem_node_add_slice() expects to get a memory range that 540 * is within one memnode, so need to split any memory range 541 * that spans multiple memnodes into subranges that are each 542 * contained within one memnode when feeding them to 543 * mem_node_add_slice() 544 */ 545 cur_start = start; 546 do { 547 node = plat_pfn_to_mem_node(cur_start); 548 549 /* 550 * Panic if DRAM address map registers or SRAT say 551 * memory in node doesn't exist or address from 552 * boot installed memory list entry isn't in this node. 553 * This shouldn't happen and rest of code can't deal 554 * with this if it does. 555 */ 556 if (node < 0 || node >= lgrp_plat_node_cnt || 557 !lgrp_plat_node_memory[node].exists || 558 cur_start < lgrp_plat_node_memory[node].start || 559 cur_start > lgrp_plat_node_memory[node].end) { 560 cmn_err(CE_PANIC, "Don't know which memnode " 561 "to add installed memory address 0x%lx\n", 562 cur_start); 563 } 564 565 /* 566 * End of current subrange should not span memnodes 567 */ 568 cur_end = end; 569 if (lgrp_plat_node_memory[node].exists && 570 cur_end > lgrp_plat_node_memory[node].end) 571 cur_end = lgrp_plat_node_memory[node].end; 572 573 mem_node_add_slice(cur_start, cur_end); 574 575 /* 576 * Next subrange starts after end of current one 577 */ 578 cur_start = cur_end + 1; 579 } while (cur_end < end); 580 581 list = list->next; 582 } 583 mem_node_physalign = 0; 584 mem_node_pfn_shift = 0; 585 } 586 587 588 /* 589 * Platform-specific initialization of lgroups 590 */ 591 void 592 lgrp_plat_init(void) 593 { 594 #if defined(__xpv) 595 /* 596 * XXPV For now, the hypervisor treats all memory equally. 597 */ 598 lgrp_plat_node_cnt = max_mem_nodes = 1; 599 #else /* __xpv */ 600 uint_t bus; 601 uint_t dev; 602 uint_t node; 603 uint_t off_hi; 604 uint_t off_lo; 605 uint64_t nb_cfg_reg; 606 607 extern lgrp_load_t lgrp_expand_proc_thresh; 608 extern lgrp_load_t lgrp_expand_proc_diff; 609 610 /* 611 * Initialize as a UMA machine if this isn't an Opteron 612 */ 613 if (!is_opteron() || lgrp_topo_ht_limit() == 1) { 614 lgrp_plat_node_cnt = max_mem_nodes = 1; 615 return; 616 } 617 618 /* 619 * Read configuration registers from PCI configuration space to 620 * determine node information, which memory is in each node, etc. 621 * 622 * Write to PCI configuration space address register to specify 623 * which configuration register to read and read/write PCI 624 * configuration space data register to get/set contents 625 */ 626 bus = OPT_PCS_BUS_CONFIG; 627 dev = OPT_PCS_DEV_NODE0; 628 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 629 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 630 631 /* 632 * Read node ID register for node 0 to get node count 633 */ 634 opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 635 OPT_PCS_OFF_NODEID); 636 lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1; 637 638 /* 639 * For Greyhound, PCI Extended Configuration Space must be enabled to 640 * read high DRAM address map base and limit registers 641 */ 642 if (opt_family == AMD_FAMILY_GREYHOUND) { 643 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 644 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 645 wrmsr(MSR_AMD_NB_CFG, 646 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 647 } 648 649 for (node = 0; node < lgrp_plat_node_cnt; node++) { 650 uint32_t base_hi; 651 uint32_t base_lo; 652 uint32_t limit_hi; 653 uint32_t limit_lo; 654 655 /* 656 * Read node ID register (except for node 0 which we just read) 657 */ 658 if (node > 0) { 659 opt_node_info[node] = pci_getl_func(bus, dev, 660 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 661 } 662 663 /* 664 * Read DRAM base and limit registers which specify 665 * physical memory range of each node 666 */ 667 if (opt_family != AMD_FAMILY_GREYHOUND) 668 base_hi = 0; 669 else { 670 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 671 OPT_PCS_FUNC_ADDRMAP, off_hi)); 672 base_hi = opt_dram_map[node].base_hi = 673 inl(PCI_CONFDATA); 674 } 675 base_lo = opt_dram_map[node].base_lo = pci_getl_func(bus, dev, 676 OPT_PCS_FUNC_ADDRMAP, off_lo); 677 678 if (opt_dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) 679 lgrp_plat_mem_intrlv++; 680 681 off_hi += 4; /* high limit register offset */ 682 if (opt_family != AMD_FAMILY_GREYHOUND) 683 limit_hi = 0; 684 else { 685 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 686 OPT_PCS_FUNC_ADDRMAP, off_hi)); 687 limit_hi = opt_dram_map[node].limit_hi = 688 inl(PCI_CONFDATA); 689 } 690 691 off_lo += 4; /* low limit register offset */ 692 limit_lo = opt_dram_map[node].limit_lo = pci_getl_func(bus, 693 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 694 695 /* 696 * Increment device number to next node and register offsets 697 * for DRAM base register of next node 698 */ 699 off_hi += 4; 700 off_lo += 4; 701 dev++; 702 703 /* 704 * Both read and write enable bits must be enabled in DRAM 705 * address map base register for physical memory to exist in 706 * node 707 */ 708 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 709 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 710 /* 711 * Mark node memory as non-existent and set start and 712 * end addresses to be same in lgrp_plat_node_memory[] 713 */ 714 lgrp_plat_node_memory[node].exists = 0; 715 lgrp_plat_node_memory[node].start = 716 lgrp_plat_node_memory[node].end = (pfn_t)-1; 717 continue; 718 } 719 720 /* 721 * Get PFN for first page in each node, 722 * so we can probe memory to determine latency topology 723 */ 724 lgrp_plat_probe_pfn[node] = 725 btop(OPT_DRAMADDR(base_hi, base_lo)); 726 727 /* 728 * Mark node memory as existing and remember physical address 729 * range of each node for use later 730 */ 731 lgrp_plat_node_memory[node].exists = 1; 732 733 lgrp_plat_node_memory[node].start = 734 btop(OPT_DRAMADDR(base_hi, base_lo)); 735 736 lgrp_plat_node_memory[node].end = 737 btop(OPT_DRAMADDR(limit_hi, limit_lo) | 738 OPT_DRAMADDR_LO_MASK_OFF); 739 } 740 741 /* 742 * Restore PCI Extended Configuration Space enable bit 743 */ 744 if (opt_family == AMD_FAMILY_GREYHOUND) { 745 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 746 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 747 } 748 749 /* 750 * Only use one memory node if memory is interleaved between any nodes 751 */ 752 if (lgrp_plat_mem_intrlv) { 753 lgrp_plat_node_cnt = max_mem_nodes = 1; 754 (void) lgrp_topo_ht_limit_set(1); 755 } else { 756 max_mem_nodes = lgrp_plat_node_cnt; 757 758 /* 759 * Probing errors can mess up the lgroup topology and force us 760 * fall back to a 2 level lgroup topology. Here we bound how 761 * tall the lgroup topology can grow in hopes of avoiding any 762 * anamolies in probing from messing up the lgroup topology 763 * by limiting the accuracy of the latency topology. 764 * 765 * Assume that nodes will at least be configured in a ring, 766 * so limit height of lgroup topology to be less than number 767 * of nodes on a system with 4 or more nodes 768 */ 769 if (lgrp_plat_node_cnt >= 4 && 770 lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default()) 771 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 772 } 773 774 /* 775 * Lgroups on Opteron architectures have but a single physical 776 * processor. Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff 777 * so that lgrp_choose() will spread things out aggressively. 778 */ 779 lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 780 lgrp_expand_proc_diff = 0; 781 #endif /* __xpv */ 782 } 783 784 785 /* 786 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 787 * be considered same 788 */ 789 #define LGRP_LAT_TOLERANCE_SHIFT 4 790 791 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 792 793 794 /* 795 * Adjust latencies between nodes to be symmetric, normalize latencies between 796 * any nodes that are within some tolerance to be same, and make local 797 * latencies be same 798 */ 799 static void 800 lgrp_plat_latency_adjust(void) 801 { 802 int i; 803 int j; 804 int k; 805 int l; 806 u_longlong_t max; 807 u_longlong_t min; 808 u_longlong_t t; 809 u_longlong_t t1; 810 u_longlong_t t2; 811 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 812 int lat_corrected[MAX_NODES][MAX_NODES]; 813 814 /* 815 * Nothing to do when this is an UMA machine 816 */ 817 if (max_mem_nodes == 1) 818 return; 819 820 /* 821 * Make sure that latencies are symmetric between any two nodes 822 * (ie. latency(node0, node1) == latency(node1, node0)) 823 */ 824 for (i = 0; i < lgrp_plat_node_cnt; i++) 825 for (j = 0; j < lgrp_plat_node_cnt; j++) { 826 t1 = lgrp_plat_probe_times[i][j]; 827 t2 = lgrp_plat_probe_times[j][i]; 828 829 if (t1 == 0 || t2 == 0 || t1 == t2) 830 continue; 831 832 /* 833 * Latencies should be same 834 * - Use minimum of two latencies which should be same 835 * - Track suspect probe times not within tolerance of 836 * min value 837 * - Remember how much values are corrected by 838 */ 839 if (t1 > t2) { 840 t = t2; 841 lgrp_plat_probe_errors[i][j] += t1 - t2; 842 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 843 lgrp_plat_probe_suspect[i][j]++; 844 lgrp_plat_probe_suspect[j][i]++; 845 } 846 } else if (t2 > t1) { 847 t = t1; 848 lgrp_plat_probe_errors[j][i] += t2 - t1; 849 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 850 lgrp_plat_probe_suspect[i][j]++; 851 lgrp_plat_probe_suspect[j][i]++; 852 } 853 } 854 855 lgrp_plat_probe_times[i][j] = 856 lgrp_plat_probe_times[j][i] = t; 857 lgrp_config(cflag, t1, t); 858 lgrp_config(cflag, t2, t); 859 } 860 861 /* 862 * Keep track of which latencies get corrected 863 */ 864 for (i = 0; i < MAX_NODES; i++) 865 for (j = 0; j < MAX_NODES; j++) 866 lat_corrected[i][j] = 0; 867 868 /* 869 * For every two nodes, see whether there is another pair of nodes which 870 * are about the same distance apart and make the latencies be the same 871 * if they are close enough together 872 */ 873 for (i = 0; i < lgrp_plat_node_cnt; i++) 874 for (j = 0; j < lgrp_plat_node_cnt; j++) { 875 /* 876 * Pick one pair of nodes (i, j) 877 * and get latency between them 878 */ 879 t1 = lgrp_plat_probe_times[i][j]; 880 881 /* 882 * Skip this pair of nodes if there isn't a latency 883 * for it yet 884 */ 885 if (t1 == 0) 886 continue; 887 888 for (k = 0; k < lgrp_plat_node_cnt; k++) 889 for (l = 0; l < lgrp_plat_node_cnt; l++) { 890 /* 891 * Pick another pair of nodes (k, l) 892 * not same as (i, j) and get latency 893 * between them 894 */ 895 if (k == i && l == j) 896 continue; 897 898 t2 = lgrp_plat_probe_times[k][l]; 899 900 /* 901 * Skip this pair of nodes if there 902 * isn't a latency for it yet 903 */ 904 905 if (t2 == 0) 906 continue; 907 908 /* 909 * Skip nodes (k, l) if they already 910 * have same latency as (i, j) or 911 * their latency isn't close enough to 912 * be considered/made the same 913 */ 914 if (t1 == t2 || (t1 > t2 && t1 - t2 > 915 t1 >> lgrp_plat_probe_lt_shift) || 916 (t2 > t1 && t2 - t1 > 917 t2 >> lgrp_plat_probe_lt_shift)) 918 continue; 919 920 /* 921 * Make latency(i, j) same as 922 * latency(k, l), try to use latency 923 * that has been adjusted already to get 924 * more consistency (if possible), and 925 * remember which latencies were 926 * adjusted for next time 927 */ 928 if (lat_corrected[i][j]) { 929 t = t1; 930 lgrp_config(cflag, t2, t); 931 t2 = t; 932 } else if (lat_corrected[k][l]) { 933 t = t2; 934 lgrp_config(cflag, t1, t); 935 t1 = t; 936 } else { 937 if (t1 > t2) 938 t = t2; 939 else 940 t = t1; 941 lgrp_config(cflag, t1, t); 942 lgrp_config(cflag, t2, t); 943 t1 = t2 = t; 944 } 945 946 lgrp_plat_probe_times[i][j] = 947 lgrp_plat_probe_times[k][l] = t; 948 949 lat_corrected[i][j] = 950 lat_corrected[k][l] = 1; 951 } 952 } 953 954 /* 955 * Local latencies should be same 956 * - Find min and max local latencies 957 * - Make all local latencies be minimum 958 */ 959 min = -1; 960 max = 0; 961 for (i = 0; i < lgrp_plat_node_cnt; i++) { 962 t = lgrp_plat_probe_times[i][i]; 963 if (t == 0) 964 continue; 965 if (min == -1 || t < min) 966 min = t; 967 if (t > max) 968 max = t; 969 } 970 if (min != max) { 971 for (i = 0; i < lgrp_plat_node_cnt; i++) { 972 int local; 973 974 local = lgrp_plat_probe_times[i][i]; 975 if (local == 0) 976 continue; 977 978 /* 979 * Track suspect probe times that aren't within 980 * tolerance of minimum local latency and how much 981 * probe times are corrected by 982 */ 983 if (local - min > min >> lgrp_plat_probe_lt_shift) 984 lgrp_plat_probe_suspect[i][i]++; 985 986 lgrp_plat_probe_errors[i][i] += local - min; 987 988 /* 989 * Make local latencies be minimum 990 */ 991 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 992 lgrp_plat_probe_times[i][i] = min; 993 } 994 } 995 996 /* 997 * Determine max probe time again since just adjusted latencies 998 */ 999 lgrp_plat_probe_time_max = 0; 1000 for (i = 0; i < lgrp_plat_node_cnt; i++) 1001 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1002 t = lgrp_plat_probe_times[i][j]; 1003 if (t > lgrp_plat_probe_time_max) 1004 lgrp_plat_probe_time_max = t; 1005 } 1006 } 1007 1008 1009 /* 1010 * Verify following about latencies between nodes: 1011 * 1012 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 1013 * - Local latencies same 1014 * - Local < remote 1015 * - Number of latencies seen is reasonable 1016 * - Number of occurrences of a given latency should be more than 1 1017 * 1018 * Returns: 1019 * 0 Success 1020 * -1 Not symmetric 1021 * -2 Local latencies not same 1022 * -3 Local >= remote 1023 * -4 Wrong number of latencies 1024 * -5 Not enough occurrences of given latency 1025 */ 1026 static int 1027 lgrp_plat_latency_verify(void) 1028 { 1029 int i; 1030 int j; 1031 lgrp_plat_latency_acct_t *l; 1032 int probed; 1033 u_longlong_t t1; 1034 u_longlong_t t2; 1035 1036 /* 1037 * Nothing to do when this is an UMA machine, lgroup topology is 1038 * limited to 2 levels, or there aren't any probe times yet 1039 */ 1040 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1041 (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1)) 1042 return (0); 1043 1044 /* 1045 * Make sure that latencies are symmetric between any two nodes 1046 * (ie. latency(node0, node1) == latency(node1, node0)) 1047 */ 1048 for (i = 0; i < lgrp_plat_node_cnt; i++) 1049 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1050 t1 = lgrp_plat_probe_times[i][j]; 1051 t2 = lgrp_plat_probe_times[j][i]; 1052 1053 if (t1 == 0 || t2 == 0 || t1 == t2) 1054 continue; 1055 1056 return (-1); 1057 } 1058 1059 /* 1060 * Local latencies should be same 1061 */ 1062 t1 = lgrp_plat_probe_times[0][0]; 1063 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1064 t2 = lgrp_plat_probe_times[i][i]; 1065 if (t2 == 0) 1066 continue; 1067 1068 if (t1 == 0) { 1069 t1 = t2; 1070 continue; 1071 } 1072 1073 if (t1 != t2) 1074 return (-2); 1075 } 1076 1077 /* 1078 * Local latencies should be less than remote 1079 */ 1080 if (t1) { 1081 for (i = 0; i < lgrp_plat_node_cnt; i++) 1082 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1083 t2 = lgrp_plat_probe_times[i][j]; 1084 if (i == j || t2 == 0) 1085 continue; 1086 1087 if (t1 >= t2) 1088 return (-3); 1089 } 1090 } 1091 1092 /* 1093 * Rest of checks are not very useful for machines with less than 1094 * 4 nodes (which means less than 3 latencies on Opteron) 1095 */ 1096 if (lgrp_plat_node_cnt < 4) 1097 return (0); 1098 1099 /* 1100 * Need to see whether done probing in order to verify number of 1101 * latencies are correct 1102 */ 1103 probed = 0; 1104 for (i = 0; i < lgrp_plat_node_cnt; i++) 1105 if (lgrp_plat_probe_times[i][i]) 1106 probed++; 1107 1108 if (probed != lgrp_plat_node_cnt) 1109 return (0); 1110 1111 /* 1112 * Determine number of unique latencies seen in probe times, 1113 * their values, and number of occurrences of each 1114 */ 1115 lgrp_plat_probe_nlatencies = 0; 1116 bzero(lgrp_plat_probe_lat_acct, 1117 MAX_NODES * sizeof (lgrp_plat_latency_acct_t)); 1118 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1119 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1120 int k; 1121 1122 /* 1123 * Look at each probe time 1124 */ 1125 t1 = lgrp_plat_probe_times[i][j]; 1126 if (t1 == 0) 1127 continue; 1128 1129 /* 1130 * Account for unique latencies 1131 */ 1132 for (k = 0; k < lgrp_plat_node_cnt; k++) { 1133 l = &lgrp_plat_probe_lat_acct[k]; 1134 if (t1 == l->la_value) { 1135 /* 1136 * Increment number of occurrences 1137 * if seen before 1138 */ 1139 l->la_count++; 1140 break; 1141 } else if (l->la_value == 0) { 1142 /* 1143 * Record latency if haven't seen before 1144 */ 1145 l->la_value = t1; 1146 l->la_count++; 1147 lgrp_plat_probe_nlatencies++; 1148 break; 1149 } 1150 } 1151 } 1152 } 1153 1154 /* 1155 * Number of latencies should be relative to number of 1156 * nodes in system: 1157 * - Same as nodes when nodes <= 2 1158 * - Less than nodes when nodes > 2 1159 * - Greater than 2 when nodes >= 4 1160 */ 1161 if ((lgrp_plat_node_cnt <= 2 && 1162 lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) || 1163 (lgrp_plat_node_cnt > 2 && 1164 lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) || 1165 (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 && 1166 lgrp_plat_probe_nlatencies <= 2)) 1167 return (-4); 1168 1169 /* 1170 * There should be more than one occurrence of every latency 1171 * as long as probing is complete 1172 */ 1173 for (i = 0; i < lgrp_plat_probe_nlatencies; i++) { 1174 l = &lgrp_plat_probe_lat_acct[i]; 1175 if (l->la_count <= 1) 1176 return (-5); 1177 } 1178 return (0); 1179 } 1180 1181 1182 /* 1183 * Set lgroup latencies for 2 level lgroup topology 1184 */ 1185 static void 1186 lgrp_plat_2level_setup(void) 1187 { 1188 int i; 1189 1190 if (lgrp_plat_node_cnt >= 4) 1191 cmn_err(CE_NOTE, 1192 "MPO only optimizing for local and remote\n"); 1193 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1194 int j; 1195 1196 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1197 if (i == j) 1198 lgrp_plat_probe_times[i][j] = 2; 1199 else 1200 lgrp_plat_probe_times[i][j] = 3; 1201 } 1202 } 1203 lgrp_plat_probe_time_min = 2; 1204 lgrp_plat_probe_time_max = 3; 1205 lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 1206 } 1207 1208 1209 /* 1210 * Return time needed to probe from current CPU to memory in given node 1211 */ 1212 static hrtime_t 1213 lgrp_plat_probe_time(int to) 1214 { 1215 caddr_t buf; 1216 uint_t dev; 1217 /* LINTED: set but not used in function */ 1218 volatile uint_t dev_vendor; 1219 hrtime_t elapsed; 1220 hrtime_t end; 1221 int from; 1222 int i; 1223 int ipl; 1224 hrtime_t max; 1225 hrtime_t min; 1226 hrtime_t start; 1227 int cnt; 1228 extern int use_sse_pagecopy; 1229 1230 /* 1231 * Determine ID of node containing current CPU 1232 */ 1233 from = LGRP_PLAT_CPU_TO_NODE(CPU); 1234 1235 /* 1236 * Do common work for probing main memory 1237 */ 1238 if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) { 1239 /* 1240 * Skip probing any nodes without memory and 1241 * set probe time to 0 1242 */ 1243 if (lgrp_plat_probe_memory[to] == NULL) { 1244 lgrp_plat_probe_times[from][to] = 0; 1245 return (0); 1246 } 1247 1248 /* 1249 * Invalidate caches once instead of once every sample 1250 * which should cut cost of probing by a lot 1251 */ 1252 lgrp_plat_flush_cost = gethrtime(); 1253 invalidate_cache(); 1254 lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost; 1255 lgrp_plat_probe_cost_total += lgrp_plat_flush_cost; 1256 } 1257 1258 /* 1259 * Probe from current CPU to given memory using specified operation 1260 * and take specified number of samples 1261 */ 1262 max = 0; 1263 min = -1; 1264 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 1265 lgrp_plat_probe_cost = gethrtime(); 1266 1267 /* 1268 * Can't measure probe time if gethrtime() isn't working yet 1269 */ 1270 if (lgrp_plat_probe_cost == 0 && gethrtime() == 0) 1271 return (0); 1272 1273 switch (lgrp_plat_probe_op) { 1274 1275 case LGRP_PLAT_PROBE_PGCPY: 1276 default: 1277 /* 1278 * Measure how long it takes to copy page 1279 * on top of itself 1280 */ 1281 buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE); 1282 1283 kpreempt_disable(); 1284 ipl = splhigh(); 1285 start = gethrtime(); 1286 if (use_sse_pagecopy) 1287 hwblkpagecopy(buf, buf); 1288 else 1289 bcopy(buf, buf, PAGESIZE); 1290 end = gethrtime(); 1291 elapsed = end - start; 1292 splx(ipl); 1293 kpreempt_enable(); 1294 break; 1295 1296 case LGRP_PLAT_PROBE_VENDOR: 1297 /* 1298 * Measure how long it takes to read vendor ID from 1299 * Northbridge 1300 */ 1301 dev = OPT_PCS_DEV_NODE0 + to; 1302 kpreempt_disable(); 1303 ipl = spl8(); 1304 outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func, 1305 OPT_PCS_OFF_VENDOR)); 1306 start = gethrtime(); 1307 for (cnt = 0; cnt < lgrp_plat_probe_nreads; cnt++) 1308 dev_vendor = inl(PCI_CONFDATA); 1309 end = gethrtime(); 1310 elapsed = (end - start) / lgrp_plat_probe_nreads; 1311 splx(ipl); 1312 kpreempt_enable(); 1313 break; 1314 } 1315 1316 lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost; 1317 lgrp_plat_probe_cost_total += lgrp_plat_probe_cost; 1318 1319 if (min == -1 || elapsed < min) 1320 min = elapsed; 1321 if (elapsed > max) 1322 max = elapsed; 1323 } 1324 1325 /* 1326 * Update minimum and maximum probe times between 1327 * these two nodes 1328 */ 1329 if (min < lgrp_plat_probe_min[from][to] || 1330 lgrp_plat_probe_min[from][to] == 0) 1331 lgrp_plat_probe_min[from][to] = min; 1332 1333 if (max > lgrp_plat_probe_max[from][to]) 1334 lgrp_plat_probe_max[from][to] = max; 1335 1336 return (min); 1337 } 1338 1339 1340 /* 1341 * Probe memory in each node from current CPU to determine latency topology 1342 */ 1343 void 1344 lgrp_plat_probe(void) 1345 { 1346 int from; 1347 int i; 1348 hrtime_t probe_time; 1349 int to; 1350 1351 if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 1352 return; 1353 1354 /* 1355 * Determine ID of node containing current CPU 1356 */ 1357 from = LGRP_PLAT_CPU_TO_NODE(CPU); 1358 1359 /* 1360 * Don't need to probe if got times already 1361 */ 1362 if (lgrp_plat_probe_times[from][from] != 0) 1363 return; 1364 1365 /* 1366 * Read vendor ID in Northbridge or read and write page(s) 1367 * in each node from current CPU and remember how long it takes, 1368 * so we can build latency topology of machine later. 1369 * This should approximate the memory latency between each node. 1370 */ 1371 for (i = 0; i < lgrp_plat_probe_nrounds; i++) 1372 for (to = 0; to < lgrp_plat_node_cnt; to++) { 1373 /* 1374 * Get probe time and bail out if can't get it yet 1375 */ 1376 probe_time = lgrp_plat_probe_time(to); 1377 if (probe_time == 0) 1378 return; 1379 1380 /* 1381 * Keep lowest probe time as latency between nodes 1382 */ 1383 if (lgrp_plat_probe_times[from][to] == 0 || 1384 probe_time < lgrp_plat_probe_times[from][to]) 1385 lgrp_plat_probe_times[from][to] = probe_time; 1386 1387 /* 1388 * Update overall minimum and maximum probe times 1389 * across all nodes 1390 */ 1391 if (probe_time < lgrp_plat_probe_time_min || 1392 lgrp_plat_probe_time_min == -1) 1393 lgrp_plat_probe_time_min = probe_time; 1394 if (probe_time > lgrp_plat_probe_time_max) 1395 lgrp_plat_probe_time_max = probe_time; 1396 } 1397 1398 /* 1399 * - Fix up latencies such that local latencies are same, 1400 * latency(i, j) == latency(j, i), etc. (if possible) 1401 * 1402 * - Verify that latencies look ok 1403 * 1404 * - Fallback to just optimizing for local and remote if 1405 * latencies didn't look right 1406 */ 1407 lgrp_plat_latency_adjust(); 1408 lgrp_plat_probe_error_code = lgrp_plat_latency_verify(); 1409 if (lgrp_plat_probe_error_code) 1410 lgrp_plat_2level_setup(); 1411 } 1412 1413 1414 /* 1415 * Platform-specific initialization 1416 */ 1417 void 1418 lgrp_plat_main_init(void) 1419 { 1420 int curnode; 1421 int ht_limit; 1422 int i; 1423 1424 /* 1425 * Print a notice that MPO is disabled when memory is interleaved 1426 * across nodes....Would do this when it is discovered, but can't 1427 * because it happens way too early during boot.... 1428 */ 1429 if (lgrp_plat_mem_intrlv) 1430 cmn_err(CE_NOTE, 1431 "MPO disabled because memory is interleaved\n"); 1432 1433 /* 1434 * Don't bother to do any probing if there is only one node or the 1435 * height of the lgroup topology less than or equal to 2 1436 */ 1437 ht_limit = lgrp_topo_ht_limit(); 1438 if (max_mem_nodes == 1 || ht_limit <= 2) { 1439 /* 1440 * Setup lgroup latencies for 2 level lgroup topology 1441 * (ie. local and remote only) if they haven't been set yet 1442 */ 1443 if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 && 1444 lgrp_plat_probe_time_max == 0) 1445 lgrp_plat_2level_setup(); 1446 return; 1447 } 1448 1449 if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) { 1450 /* 1451 * Should have been able to probe from CPU 0 when it was added 1452 * to lgroup hierarchy, but may not have been able to then 1453 * because it happens so early in boot that gethrtime() hasn't 1454 * been initialized. (:-( 1455 */ 1456 curnode = LGRP_PLAT_CPU_TO_NODE(CPU); 1457 if (lgrp_plat_probe_times[curnode][curnode] == 0) 1458 lgrp_plat_probe(); 1459 1460 return; 1461 } 1462 1463 /* 1464 * When probing memory, use one page for every sample to determine 1465 * lgroup topology and taking multiple samples 1466 */ 1467 if (lgrp_plat_probe_memsize == 0) 1468 lgrp_plat_probe_memsize = PAGESIZE * 1469 lgrp_plat_probe_nsamples; 1470 1471 /* 1472 * Map memory in each node needed for probing to determine latency 1473 * topology 1474 */ 1475 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1476 int mnode; 1477 1478 /* 1479 * Skip this node and leave its probe page NULL 1480 * if it doesn't have any memory 1481 */ 1482 mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); 1483 if (!mem_node_config[mnode].exists) { 1484 lgrp_plat_probe_memory[i] = NULL; 1485 continue; 1486 } 1487 1488 /* 1489 * Allocate one kernel virtual page 1490 */ 1491 lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena, 1492 lgrp_plat_probe_memsize, VM_NOSLEEP); 1493 if (lgrp_plat_probe_memory[i] == NULL) { 1494 cmn_err(CE_WARN, 1495 "lgrp_plat_main_init: couldn't allocate memory"); 1496 return; 1497 } 1498 1499 /* 1500 * Map virtual page to first page in node 1501 */ 1502 hat_devload(kas.a_hat, lgrp_plat_probe_memory[i], 1503 lgrp_plat_probe_memsize, 1504 lgrp_plat_probe_pfn[i], 1505 PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 1506 HAT_LOAD_NOCONSIST); 1507 } 1508 1509 /* 1510 * Probe from current CPU 1511 */ 1512 lgrp_plat_probe(); 1513 } 1514 1515 /* 1516 * Allocate additional space for an lgroup. 1517 */ 1518 /* ARGSUSED */ 1519 lgrp_t * 1520 lgrp_plat_alloc(lgrp_id_t lgrpid) 1521 { 1522 lgrp_t *lgrp; 1523 1524 lgrp = &lgrp_space[nlgrps_alloc++]; 1525 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 1526 return (NULL); 1527 return (lgrp); 1528 } 1529 1530 /* 1531 * Platform handling for (re)configuration changes 1532 */ 1533 /* ARGSUSED */ 1534 void 1535 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 1536 { 1537 } 1538 1539 /* 1540 * Return the platform handle for the lgroup containing the given CPU 1541 */ 1542 /* ARGSUSED */ 1543 lgrp_handle_t 1544 lgrp_plat_cpu_to_hand(processorid_t id) 1545 { 1546 if (lgrp_plat_node_cnt == 1) 1547 return (LGRP_DEFAULT_HANDLE); 1548 1549 return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id])); 1550 } 1551 1552 /* 1553 * Return the platform handle of the lgroup that contains the physical memory 1554 * corresponding to the given page frame number 1555 */ 1556 /* ARGSUSED */ 1557 lgrp_handle_t 1558 lgrp_plat_pfn_to_hand(pfn_t pfn) 1559 { 1560 int mnode; 1561 1562 if (max_mem_nodes == 1) 1563 return (LGRP_DEFAULT_HANDLE); 1564 1565 if (pfn > physmax) 1566 return (LGRP_NULL_HANDLE); 1567 1568 mnode = plat_pfn_to_mem_node(pfn); 1569 if (mnode < 0) 1570 return (LGRP_NULL_HANDLE); 1571 1572 return (MEM_NODE_2_LGRPHAND(mnode)); 1573 } 1574 1575 /* 1576 * Return the maximum number of lgrps supported by the platform. 1577 * Before lgrp topology is known it returns an estimate based on the number of 1578 * nodes. Once topology is known it returns the actual maximim number of lgrps 1579 * created. Since x86 doesn't support dynamic addition of new nodes, this number 1580 * may not grow during system lifetime. 1581 */ 1582 int 1583 lgrp_plat_max_lgrps() 1584 { 1585 return (lgrp_topo_initialized ? 1586 lgrp_alloc_max + 1 : 1587 lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 1588 } 1589 1590 /* 1591 * Return the number of free, allocatable, or installed 1592 * pages in an lgroup 1593 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 1594 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 1595 */ 1596 /* ARGSUSED */ 1597 static pgcnt_t 1598 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 1599 { 1600 struct memlist *mlist; 1601 pgcnt_t npgs = 0; 1602 extern struct memlist *phys_avail; 1603 extern struct memlist *phys_install; 1604 1605 switch (query) { 1606 case LGRP_MEM_SIZE_FREE: 1607 return ((pgcnt_t)freemem); 1608 case LGRP_MEM_SIZE_AVAIL: 1609 memlist_read_lock(); 1610 for (mlist = phys_avail; mlist; mlist = mlist->next) 1611 npgs += btop(mlist->size); 1612 memlist_read_unlock(); 1613 return (npgs); 1614 case LGRP_MEM_SIZE_INSTALL: 1615 memlist_read_lock(); 1616 for (mlist = phys_install; mlist; mlist = mlist->next) 1617 npgs += btop(mlist->size); 1618 memlist_read_unlock(); 1619 return (npgs); 1620 default: 1621 return ((pgcnt_t)0); 1622 } 1623 } 1624 1625 /* 1626 * Return the number of free pages in an lgroup. 1627 * 1628 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 1629 * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 1630 * number of allocatable base pagesize pages corresponding to the 1631 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 1632 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 1633 * memory installed, regardless of whether or not it's usable. 1634 */ 1635 pgcnt_t 1636 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 1637 { 1638 int mnode; 1639 pgcnt_t npgs = (pgcnt_t)0; 1640 extern struct memlist *phys_avail; 1641 extern struct memlist *phys_install; 1642 1643 1644 if (plathand == LGRP_DEFAULT_HANDLE) 1645 return (lgrp_plat_mem_size_default(plathand, query)); 1646 1647 if (plathand != LGRP_NULL_HANDLE) { 1648 mnode = plat_lgrphand_to_mem_node(plathand); 1649 if (mnode >= 0 && mem_node_config[mnode].exists) { 1650 switch (query) { 1651 case LGRP_MEM_SIZE_FREE: 1652 npgs = MNODE_PGCNT(mnode); 1653 break; 1654 case LGRP_MEM_SIZE_AVAIL: 1655 npgs = mem_node_memlist_pages(mnode, 1656 phys_avail); 1657 break; 1658 case LGRP_MEM_SIZE_INSTALL: 1659 npgs = mem_node_memlist_pages(mnode, 1660 phys_install); 1661 break; 1662 default: 1663 break; 1664 } 1665 } 1666 } 1667 return (npgs); 1668 } 1669 1670 /* 1671 * Return latency between "from" and "to" lgroups 1672 * 1673 * This latency number can only be used for relative comparison 1674 * between lgroups on the running system, cannot be used across platforms, 1675 * and may not reflect the actual latency. It is platform and implementation 1676 * specific, so platform gets to decide its value. It would be nice if the 1677 * number was at least proportional to make comparisons more meaningful though. 1678 */ 1679 /* ARGSUSED */ 1680 int 1681 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 1682 { 1683 lgrp_handle_t src, dest; 1684 1685 if (max_mem_nodes == 1) 1686 return (0); 1687 1688 /* 1689 * Return max latency for root lgroup 1690 */ 1691 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 1692 return (lgrp_plat_probe_time_max); 1693 1694 src = from; 1695 dest = to; 1696 1697 /* 1698 * Return 0 for nodes (lgroup platform handles) out of range 1699 */ 1700 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 1701 return (0); 1702 1703 /* 1704 * Probe from current CPU if its lgroup latencies haven't been set yet 1705 * and we are trying to get latency from current CPU to some node 1706 */ 1707 if (lgrp_plat_probe_times[src][src] == 0 && 1708 LGRP_PLAT_CPU_TO_NODE(CPU) == src) 1709 lgrp_plat_probe(); 1710 1711 return (lgrp_plat_probe_times[src][dest]); 1712 } 1713 1714 /* 1715 * Return platform handle for root lgroup 1716 */ 1717 lgrp_handle_t 1718 lgrp_plat_root_hand(void) 1719 { 1720 return (LGRP_DEFAULT_HANDLE); 1721 } 1722