/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/controlregs.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(pg_plat_hw_instance_id(cpu, PGHW_CHIP))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Multiprocessor Opteron machines have Non-Uniform Memory Access (NUMA).
 *
 * Until this code supports reading the System Resource Affinity Table (SRAT),
 * we need to examine registers in PCI configuration space to determine how
 * many nodes are in the system and which CPUs and memory are in each node.
 * This could be determined by probing all memory from each CPU, but that is
 * too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is not guaranteed to work on future generations of
 *	 Opteron processors.
 */

/*
 * Opteron DRAM Address Map in PCI configuration space gives base and limit
 * of physical memory in each node.  The following constants and macros define
 * their contents, structure, and access.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get the actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF	/* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700	/* interleave select */
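
/*
 * Worked example (illustrative register values, not from real hardware): if
 * a node's DRAM base registers read base_hi = 0x0 and base_lo = 0x00400000,
 * and its limit registers read limit_hi = 0x0 and limit_lo = 0x007F0000,
 * then:
 *
 *	OPT_DRAMADDR(0x0, 0x00400000) = (0x00400000 & 0xFFFF0000) << 8
 *				      = 0x40000000 (node starts at 1 GB)
 *	OPT_DRAMADDR(0x0, 0x007F0000) | OPT_DRAMADDR_LO_MASK_OFF
 *				      = 0x7F000000 | 0xFFFFFF
 *				      = 0x7FFFFFFF (node ends just below 2 GB)
 *
 * lgrp_plat_init() below performs exactly this computation when it fills in
 * lgrp_plat_node_memory[].
 */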

/*
 * The Opteron Node ID register in PCI configuration space contains the
 * number of nodes in the system, etc., for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* HyperTransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual
 * value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	(((reg) & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)

/*
 * Macro to set up PCI Extended Configuration Space (ECS) address to give to
 * "in/out" instructions
 *
 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
 *	 other uses should just do MMIO to access PCI ECS.
 *	 The special bit in the Northbridge Configuration Register must be
 *	 enabled on Greyhound for extended CF8 space access in order to access
 *	 PCI ECS using "in/out" instructions, and the special bit must be
 *	 restored after we're done accessing PCI ECS.
 */
#define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
	(PCI_CONE | (((bus) & 0xff) << 16) | (((device) & 0x1f) << 11) | \
	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
	    ((((reg) >> 8) & 0xf) << 24))
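
/*
 * Worked example (illustrative; assumes PCI_CONE is the 0x80000000
 * configuration-enable bit from <sys/pci_impl.h>): using the node 0 values
 * defined below (device 24, function 1, offset 0x140 for the high DRAM base
 * register),
 *
 *	OPT_PCI_ECS_ADDR(0, 24, 1, 0x140)
 *	    = PCI_CONE | (24 << 11) | (1 << 8) | (0x140 & 0xfc) |
 *		(((0x140 >> 8) & 0xf) << 24)
 *	    = 0x80000000 | 0xC000 | 0x100 | 0x40 | 0x01000000
 *	    = 0x8100C140
 *
 * The extra nibble placed in bits 27-24 is what distinguishes an extended
 * CF8 access from an ordinary one, which is why the Greyhound enable bit
 * mentioned above is needed.
 */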

/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* HyperTransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* HyperTransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */


/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef struct opt_dram_addr_map {
	uint32_t	base_hi;
	uint32_t	base_lo;
	uint32_t	limit_hi;
	uint32_t	limit_lo;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
	int	exists;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t		opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes, causing MPO to be disabled
 */
int		lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t		lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t	lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t	lgrp_plat_flush_cost = 0;
hrtime_t	lgrp_plat_probe_cost = 0;
hrtime_t	lgrp_plat_probe_cost_total = 0;

/*
 * Error code for latency adjustment and verification
 */
int		lgrp_plat_probe_error_code = 0;

/*
 * How far latencies were off from the minimum values seen
 */
hrtime_t	lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t		lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t		lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int		lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int		lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;

/*
 * Number of samples to take when probing each node
 */
int		lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * Number of times to read vendor ID from Northbridge for each probe
 */
int		lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;
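
/*
 * The probe parameters above are ordinary kernel variables, so they can be
 * tuned without rebuilding, e.g. from /etc/system (values below are only an
 * illustration, not recommendations):
 *
 *	set lgrp_plat_probe_nrounds = 16
 *	set lgrp_plat_probe_nsamples = 4
 *	set lgrp_plat_probe_op = 0	(0 == LGRP_PLAT_PROBE_PGCPY)
 *
 * /etc/system is processed early enough in boot that the main probing pass
 * in lgrp_plat_main_init() should see the tuned values.
 */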

/*
 * PFN of page in each node for probing
 */
pfn_t		lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (i.e. not within tolerance of the value
 * that it should match)
 */
int		lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t	lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t	lgrp_plat_probe_time_max = 0;
hrtime_t	lgrp_plat_probe_time_min = -1;
hrtime_t	lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t	lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats lgrp_stats[NLGRP];

/*
 * Supported AMD processor families
 */
#define	AMD_FAMILY_HAMMER	15
#define	AMD_FAMILY_GREYHOUND	16

/*
 * Whether to have is_opteron() return 1 even when the processor isn't
 * supported
 */
uint_t	is_opteron_override = 0;

/*
 * AMD processor family for current CPU
 */
uint_t	opt_family = 0;

uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on a supported AMD Opteron, since the
 * node count and DRAM address map registers may have a different format or
 * may not be supported in future processor families
 */
int
is_opteron(void)
{

	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	opt_family = cpuid_getfamily(CPU);
	if (opt_family == AMD_FAMILY_HAMMER ||
	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Skip nodes with no memory
		 */
		if (!lgrp_plat_node_memory[node].exists)
			continue;

		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}
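
/*
 * Usage sketch (illustrative values): continuing the earlier example where
 * node 0 spans physical addresses [0x40000000, 0x7FFFFFFF], looking up the
 * page at 1.5 GB gives node 0:
 *
 *	pfn_t	pfn = btop(0x60000000);
 *	int	node = plat_pfn_to_mem_node(pfn);	(node == 0)
 *
 * A PFN outside every node's [start, end] range returns -1 (and trips the
 * ASSERT on DEBUG kernels); plat_build_mem_nodes() below panics in that
 * case, since it cannot place such memory in any memnode.
 */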

/*
 * Configure memory nodes for machines with more than one node (i.e. NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start;	/* start addr of subrange */
	pfn_t	cur_end;	/* end addr of subrange */
	pfn_t	start;		/* start addr of whole range */
	pfn_t	end;		/* end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so we need to split any memory
		 * range that spans multiple memnodes into subranges that
		 * are each contained within one memnode when feeding them
		 * to mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);

			/*
			 * Panic if DRAM address map registers or SRAT say
			 * memory in node doesn't exist or address from
			 * boot installed memory list entry isn't in this node.
			 * This shouldn't happen and the rest of the code
			 * can't deal with it if it does.
			 */
			if (node < 0 || node >= lgrp_plat_node_cnt ||
			    !lgrp_plat_node_memory[node].exists ||
			    cur_start < lgrp_plat_node_memory[node].start ||
			    cur_start > lgrp_plat_node_memory[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			if (lgrp_plat_node_memory[node].exists &&
			    cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}
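
/*
 * Worked example of the splitting loop above (illustrative PFNs): suppose a
 * boot memlist entry covers PFNs [0x100, 0x4FF], node 0 owns [0x000, 0x2FF],
 * and node 1 owns [0x300, 0x7FF].  The do/while loop then makes two calls:
 *
 *	pass 1: cur_start = 0x100, node 0 ends first
 *		-> mem_node_add_slice(0x100, 0x2FF)
 *	pass 2: cur_start = 0x300, the range ends first
 *		-> mem_node_add_slice(0x300, 0x4FF)
 *
 * so each slice handed to mem_node_add_slice() lies entirely within one
 * memnode, as that routine requires.
 */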

/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
#if defined(__xpv)
	/*
	 * XXPV	For now, the hypervisor treats all memory equally.
	 */
	lgrp_plat_node_cnt = max_mem_nodes = 1;
#else	/* __xpv */
	uint_t		bus;
	uint_t		dev;
	uint_t		node;
	uint_t		off_hi;
	uint_t		off_lo;
	uint64_t	nb_cfg_reg;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to the PCI configuration space address register to specify
	 * which configuration register to read, and read/write the PCI
	 * configuration space data register to get/set the contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
	off_lo = OPT_PCS_OFF_DRAMBASE_LO;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	/*
	 * For Greyhound, PCI Extended Configuration Space must be enabled to
	 * read high DRAM address map base and limit registers
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG,
			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
	}

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		uint32_t	base_hi;
		uint32_t	base_lo;
		uint32_t	limit_hi;
		uint32_t	limit_lo;

		/*
		 * Read node ID register (except for node 0, which we just
		 * read)
		 */
		if (node > 0) {
			opt_node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			base_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			base_hi = opt_dram_map[node].base_hi =
			    inl(PCI_CONFDATA);
		}
		base_lo = opt_dram_map[node].base_lo = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off_lo);

		if (opt_dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off_hi += 4;	/* high limit register offset */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			limit_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			limit_hi = opt_dram_map[node].limit_hi =
			    inl(PCI_CONFDATA);
		}

		off_lo += 4;	/* low limit register offset */
		limit_lo = opt_dram_map[node].limit_lo = pci_getl_func(bus,
		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Increment device number to next node and register offsets
		 * for DRAM base register of next node
		 */
		off_hi += 4;
		off_lo += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in lgrp_plat_node_memory[]
			 */
			lgrp_plat_node_memory[node].exists = 0;
			lgrp_plat_node_memory[node].start =
			    lgrp_plat_node_memory[node].end = (pfn_t)-1;
			continue;
		}

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMADDR(base_hi, base_lo));

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		lgrp_plat_node_memory[node].exists = 1;

		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMADDR(base_hi, base_lo));

		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMADDR(limit_hi, limit_lo) |
		    OPT_DRAMADDR_LO_MASK_OFF);
	}
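
	/*
	 * Worked example of the register decoding above (illustrative
	 * values): if the node 0 Node ID register read 0x00030030, then
	 * OPT_NODE_CNT() extracts (0x30 >> 4) = 3, so lgrp_plat_node_cnt
	 * becomes 4.  If a node's DRAM base_lo register read 0x00400003,
	 * then both OPT_DRAMBASE_LO_MASK_RE and OPT_DRAMBASE_LO_MASK_WE are
	 * set (memory exists), OPT_DRAMBASE_LO_MASK_INTRLVEN is clear (no
	 * interleaving), and the node starts at physical address 0x40000000.
	 */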

	/*
	 * Restore PCI Extended Configuration Space enable bit
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of keeping
		 * any anomalies in probing from messing up the lgroup
		 * topology, by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.  Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
#endif	/* __xpv */
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered the same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
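
/*
 * Example of the tolerance test (illustrative numbers): with the default
 * shift of 4, two latencies are "the same" when they differ by at most
 * 1/16 of the smaller one.  So if probing measured
 *
 *	lgrp_plat_probe_times[0][1] = 120
 *	lgrp_plat_probe_times[1][0] = 110
 *
 * lgrp_plat_latency_adjust() below makes both 110 (the minimum), and since
 * 120 - 110 = 10 exceeds 110 >> 4 = 6, it also marks both directions
 * suspect in lgrp_plat_probe_suspect[].
 */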

/*
 * Adjust latencies between nodes to be symmetric, normalize latencies
 * between any nodes that are within some tolerance to be the same, and make
 * local latencies the same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is a UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (i.e. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be the same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart, and make the latencies
	 * the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}
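
	/*
	 * Cross-pair example (illustrative numbers): if latency(0, 1) has
	 * settled at 100 and latency(2, 3) at 95, neither yet corrected,
	 * then since 100 - 95 = 5 is within 100 >> 4 = 6, the loop above
	 * snaps both pairs to the minimum, 95, and marks both entries in
	 * lat_corrected[].  Pairs differing by more than the tolerance
	 * (say 100 vs. 80) are left alone, since they presumably represent
	 * genuinely different distances.
	 */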

	/*
	 * Local latencies should be the same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			int	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since latencies were just adjusted
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}
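
/*
 * Local-latency example (illustrative numbers): if the local probe times
 * came back as 80, 85, and 90 for nodes 0-2, the final pass above sets all
 * three to the minimum, 80.  Node 2 is additionally flagged suspect since
 * 90 - 80 = 10 exceeds 80 >> 4 = 5, while node 1's difference of 5 is right
 * at the tolerance and is corrected silently.
 */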

/*
 * Verify the following about latencies between nodes:
 *
 * - Latencies should be symmetric (i.e. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is a UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (i.e. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be the same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * The rest of the checks are not very useful for machines with fewer
	 * than 4 nodes (which means fewer than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether probing is done in order to verify that the
	 * number of latencies is correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if not seen before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}
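
/*
 * Example of the latency-count rule (illustrative): on a 4-node Opteron
 * arranged in a square, each node sees a local latency, a 1-hop latency,
 * and a 2-hop latency, so lgrp_plat_probe_nlatencies should end up as 3,
 * which satisfies all three clauses above (3 is less than the 4 nodes and
 * greater than 2).  Four or more distinct values on such a box would
 * suggest noisy probes and returns -4.
 */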

/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}


/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	int		cnt;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			for (cnt = 0; cnt < lgrp_plat_probe_nreads; cnt++)
				dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = (end - start) / lgrp_plat_probe_nreads;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}

/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if we already have the times
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if we can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fall back to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}
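
/*
 * Probe cost, back of the envelope (default tunables): each call to
 * lgrp_plat_probe() above makes lgrp_plat_probe_nrounds (64) passes over
 * lgrp_plat_node_cnt nodes, and each lgrp_plat_probe_time() call takes
 * lgrp_plat_probe_nsamples (1) samples of lgrp_plat_probe_nreads (256)
 * vendor ID reads.  On a 4-node box that is 64 * 4 * 1 * 256 = 65536
 * config space reads per probing CPU.  Keeping the minimum observed time
 * rather than an average means a single slow, interrupted sample cannot
 * inflate the recorded latency.
 */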

/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes.  We would do this when it is discovered, but can't
	 * because that happens way too early during boot.
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Set up lgroup latencies for 2 level lgroup topology
		 * (i.e. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}

	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  :-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for every sample taken to
	 * determine the lgroup topology
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate one kernel virtual page
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}

/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	lgrp = &lgrp_space[nlgrps_alloc++];
	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
		return (NULL);
	return (lgrp);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	if (pfn > physmax)
		return (LGRP_NULL_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	if (mnode < 0)
		return (LGRP_NULL_HANDLE);

	return (MEM_NODE_2_LGRPHAND(mnode));
}
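
/*
 * Sizing example for the formula below: the pre-topology estimate
 * node_cnt * (node_cnt - 1) + 1 mirrors the static NLGRP sizing near the
 * top of this file.  With lgrp_plat_node_cnt == 4 the estimate is
 * 4 * 3 + 1 = 13, and the static lgrp_space[] array is sized for the
 * worst case, NLGRP == 8 * 7 + 1 == 57.
 */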

/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number
 * of nodes.  Once topology is known it returns the actual maximum number of
 * lgrps created.  Since x86 doesn't support dynamic addition of new nodes,
 * this number may not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}

/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist *phys_avail;
	extern struct memlist *phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t	npgs = (pgcnt_t)0;
	extern struct memlist *phys_avail;
	extern struct memlist *phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = MNODE_PGCNT(mnode);
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}
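
/*
 * Interpretation example for lgrp_plat_latency() below (illustrative): the
 * values it returns are raw minimum probe times (or the synthetic 2 and 3
 * from lgrp_plat_2level_setup()), so only comparisons between them are
 * meaningful.  If latency(0, 0) == 80 and latency(0, 1) == 120, the correct
 * conclusion is "node 1's memory is roughly half again as far from node 0
 * as local memory", not that remote access takes 120 of any particular
 * time unit.
 */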

/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so the platform gets to decide its value.  It would be nice if
 * the number were at least proportional to make comparisons more meaningful
 * though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set
	 * yet and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}