/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/controlregs.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(pg_plat_hw_instance_id(cpu, PGHW_CHIP))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Multiprocessor Opteron machines have Non Uniform Memory Access (NUMA).
 *
 * Until this code supports reading the System Resource Affinity Table (SRAT),
 * we need to examine registers in PCI configuration space to determine how
 * many nodes are in the system and which CPUs and memory are in each node.
 * This could be determined by probing all memory from each CPU, but that is
 * too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is not guaranteed to work on future generations of
 *	 Opteron processors.
 */

/*
 * The Opteron DRAM Address Map in PCI configuration space gives the base and
 * limit of physical memory in each node.  The following constants and macros
 * define their contents, structure, and access.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF	/* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
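
/*
 * Worked example of the address arithmetic above (register values are
 * hypothetical): a node with base_hi = 0x01 and base_lo = 0x00100003 has
 * a base address of
 *
 *	OPT_DRAMADDR_HI(0x01)       = 0x01ULL << 40      = 0x010000000000
 *	OPT_DRAMADDR_LO(0x00100003) = 0x00100000ULL << 8 = 0x000010000000
 *	OPT_DRAMADDR(hi, lo)        = 0x010010000000
 *
 * ie. bits 47-40 of the physical address come from the high register and
 * bits 39-24 from the low register (the 0x3 in base_lo is the read/write
 * enable bits, which are masked off).  The low 24 bits of a limit address
 * are filled in with OPT_DRAMADDR_LO_MASK_OFF.
 */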

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700	/* interleave select */


/*
 * The Opteron Node ID register in PCI configuration space contains the
 * number of nodes in the system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)

/*
 * Macro to setup PCI Extended Configuration Space (ECS) address to give to
 * "in/out" instructions
 *
 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
 *	 other uses should just do MMIO to access PCI ECS.
 *	 Must enable special bit in Northbridge Configuration Register on
 *	 Greyhound for extended CF8 space access to be able to access PCI ECS
 *	 using "in/out" instructions and restore special bit after done
 *	 accessing PCI ECS.
 */
#define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
	(PCI_CONE | (((bus) & 0xff) << 16) | (((device) & 0x1f) << 11) | \
	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
	    ((((reg) >> 8) & 0xf) << 24))
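
/*
 * Worked example of the ECS address layout above (values are hypothetical):
 * bus 0, device 24 (node 0), function 1 (address map), register 0x140 packs
 * as
 *
 *	PCI_CONE | (0 << 16) | (0x18 << 11) | (0x1 << 8) | 0x40 | (0x1 << 24)
 *
 * ie. bus/device/function and the low byte of the register go where normal
 * CF8 config addresses put them, while register bits 11-8 go in address
 * bits 27-24 to reach the extended space.
 */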

/*
 * PCI configuration space registers accessed by specifying a bus, device,
 * function, and offset.  The following constants define the values needed
 * to access Opteron K8 configuration info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */


/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef struct opt_dram_addr_map {
	uint32_t	base_hi;
	uint32_t	base_lo;
	uint32_t	limit_hi;
	uint32_t	limit_lo;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
	int	exists;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t				opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
int				lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t				lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t			lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t			lgrp_plat_flush_cost = 0;
hrtime_t			lgrp_plat_probe_cost = 0;
hrtime_t			lgrp_plat_probe_cost_total = 0;

/*
 * Error code for latency adjustment and verification
 */
int				lgrp_plat_probe_error_code = 0;

/*
 * How much latencies were off from minimum values gotten
 */
hrtime_t			lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t				lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t				lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int				lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;

/*
 * Number of samples to take when probing each node
 */
int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * Number of times to read vendor ID from Northbridge for each probe.
 */
int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;

/*
 * PFN of page in each node for probing
 */
pfn_t			lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (ie. not within tolerance of value that it
 * should match)
 */
int			lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t		lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t		lgrp_plat_probe_time_max = 0;
hrtime_t		lgrp_plat_probe_time_min = -1;
hrtime_t		lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t		lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats lgrp_stats[NLGRP];

/*
 * Supported AMD processor families
 */
#define	AMD_FAMILY_HAMMER	15
#define	AMD_FAMILY_GREYHOUND	16

/*
 * Whether to have is_opteron() return 1 even when processor isn't supported
 */
uint_t	is_opteron_override = 0;

/*
 * AMD processor family for current CPU
 */
uint_t	opt_family = 0;

uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on a supported AMD Opteron, since the node
 * count and DRAM address map registers may have a different format or may not
 * be supported in future processor families
 */
int
is_opteron(void)
{

	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	opt_family = cpuid_getfamily(CPU);
	if (opt_family == AMD_FAMILY_HAMMER ||
	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Skip nodes with no memory
		 */
		if (!lgrp_plat_node_memory[node].exists)
			continue;

		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}

/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start;	/* start addr of subrange */
	pfn_t	cur_end;	/* end addr of subrange */
	pfn_t	start;		/* start addr of whole range */
	pfn_t	end;		/* end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so need to split any memory range
		 * that spans multiple memnodes into subranges that are each
		 * contained within one memnode when feeding them to
		 * mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);

			/*
			 * Panic if DRAM address map registers or SRAT say
			 * memory in node doesn't exist or address from
			 * boot installed memory list entry isn't in this node.
			 * This shouldn't happen and rest of code can't deal
			 * with this if it does.
			 */
			if (node < 0 || node >= lgrp_plat_node_cnt ||
			    !lgrp_plat_node_memory[node].exists ||
			    cur_start < lgrp_plat_node_memory[node].start ||
			    cur_start > lgrp_plat_node_memory[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			if (lgrp_plat_node_memory[node].exists &&
			    cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}
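
/*
 * Example of the splitting done above (PFN ranges are hypothetical): if
 * node 0 owns PFNs 0x0-0xffff and node 1 owns PFNs 0x10000-0x1ffff, a boot
 * memlist entry covering PFNs 0x8000-0x18000 is handed to
 * mem_node_add_slice() as two subranges, 0x8000-0xffff and 0x10000-0x18000.
 */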

/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
	uint_t		bus;
	uint_t		dev;
	uint_t		node;
	uint_t		off_hi;
	uint_t		off_lo;
	uint64_t	nb_cfg_reg;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to the PCI configuration space address register to specify
	 * which configuration register to read, then read/write the PCI
	 * configuration space data register to get/set its contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
	off_lo = OPT_PCS_OFF_DRAMBASE_LO;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	/*
	 * For Greyhound, PCI Extended Configuration Space must be enabled to
	 * read the high DRAM address map base and limit registers
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG,
			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
	}

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		uint32_t	base_hi;
		uint32_t	base_lo;
		uint32_t	limit_hi;
		uint32_t	limit_lo;

		/*
		 * Read node ID register (except for node 0 which we just
		 * read)
		 */
		if (node > 0) {
			opt_node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			base_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			base_hi = opt_dram_map[node].base_hi =
			    inl(PCI_CONFDATA);
		}
		base_lo = opt_dram_map[node].base_lo = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off_lo);

		if (opt_dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off_hi += 4;	/* high limit register offset */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			limit_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			limit_hi = opt_dram_map[node].limit_hi =
			    inl(PCI_CONFDATA);
		}

		off_lo += 4;	/* low limit register offset */
		limit_lo = opt_dram_map[node].limit_lo = pci_getl_func(bus,
		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Increment device number to next node and register offsets
		 * for DRAM base register of next node
		 */
		off_hi += 4;
		off_lo += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in lgrp_plat_node_memory[]
			 */
			lgrp_plat_node_memory[node].exists = 0;
			lgrp_plat_node_memory[node].start =
			    lgrp_plat_node_memory[node].end = (pfn_t)-1;
			continue;
		}

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMADDR(base_hi, base_lo));

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		lgrp_plat_node_memory[node].exists = 1;

		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMADDR(base_hi, base_lo));

		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMADDR(limit_hi, limit_lo) |
		    OPT_DRAMADDR_LO_MASK_OFF);
	}
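
	/*
	 * To illustrate the loop above: node 0 reads its DRAM base at
	 * function 1, offset 0x40 (plus 0x140 for the high half on
	 * Greyhound) and its limit at offset 0x44/0x144 on device 24;
	 * node 1 reads offsets 0x48/0x148 and 0x4c/0x14c on device 25;
	 * and so on, advancing 8 bytes and one device per node.
	 */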

	/*
	 * Restore PCI Extended Configuration Space enable bit
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of avoiding
		 * any anomalies in probing from messing up the lgroup
		 * topology by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.  Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered the same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
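
/*
 * For example, with the default shift of 4 the tolerance is 1/16th (6.25%):
 * probe times of 105 and 100 (units are arbitrary) are close enough to be
 * normalized to a single value, while 120 and 100 are not, and a 120 vs. 100
 * mismatch between symmetric probes would also be flagged as suspect, since
 * 120 - 100 > (100 >> 4).
 */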


/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be same, and make local
 * latencies be same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is an UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart and make the latencies be
	 * the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			int	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since just adjusted latencies
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}


/*
 * Verify the following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is an UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with less than
	 * 4 nodes (which means less than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether done probing in order to verify number of
	 * latencies are correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if haven't seen
					 * before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);
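
	/*
	 * For example (hypothetical topology): 4 nodes connected in a square
	 * should yield 3 unique latencies -- local, one hop, and two hops --
	 * which is greater than 2 and less than the number of nodes, so it
	 * passes the checks above.
	 */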

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}


/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}


/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	int		cnt;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			for (cnt = 0; cnt < lgrp_plat_probe_nreads; cnt++)
				dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = (end - start) / lgrp_plat_probe_nreads;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;
	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}


/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if got times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fall back to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}
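
/*
 * Note on the defaults used above: with lgrp_plat_probe_nrounds == 64 and
 * lgrp_plat_probe_nsamples == 1, each node is probed 64 times from the
 * current CPU and the smallest time seen is kept as the latency, since the
 * minimum should be the measurement least perturbed by interrupts and other
 * system activity.
 */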

/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}

	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for every sample taken to
	 * determine the lgroup topology
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate one kernel virtual page
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}

/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	lgrp = &lgrp_space[nlgrps_alloc++];
	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
		return (NULL);
	return (lgrp);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	if (pfn > physmax)
		return (LGRP_NULL_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	if (mnode < 0)
		return (LGRP_NULL_HANDLE);

	return (MEM_NODE_2_LGRPHAND(mnode));
}

/*
 * Return the maximum number of lgroups supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number
 * of nodes.  Once topology is known it returns the actual maximum number of
 * lgrps created.  Since x86 doesn't support dynamic addition of new nodes,
 * this number will not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}
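
/*
 * For example (node count is hypothetical), a 4 node system yields an
 * estimate of 4 * (4 - 1) + 1 = 13 lgroups: one for the root plus, in the
 * worst case, up to 4 - 1 = 3 lgroups (its leaf lgroup and intermediate
 * latency levels) associated with each of the 4 nodes.
 */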

/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup.
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or when this is the root
 * lgroup
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t npgs = (pgcnt_t)0;
	extern struct memlist *phys_avail;
	extern struct memlist *phys_install;

	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = MNODE_PGCNT(mnode);
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}

/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
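 *
 * (For instance, when the 2-level fallback in lgrp_plat_2level_setup() is
 * used, the values returned here are simply 2 for local and 3 for remote.)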
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}