/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(chip_plat_get_chipid(cpu))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */


/*
 * Multiprocessor Opteron machines have Non-Uniform Memory Access (NUMA).
 *
 * Until the System Resource Affinity Table (SRAT) becomes part of the ACPI
 * standard, we need to examine registers in PCI configuration space to
 * determine how many nodes are in the system and which CPUs and memory are
 * in each node.  This could be determined by probing all memory from each
 * CPU, but that is too expensive to do while booting the kernel.
 *
 * NOTE:	Using these PCI configuration space registers to determine this
 *		locality info is Opteron K8 specific and not guaranteed to work
 *		on the next generation Opteron processor.  Furthermore, we
 *		assume that there is one CPU per node and that CPU 0 is in
 *		node 0, CPU 1 is in node 1, etc., which should be true for
 *		Opteron K8.
 */

/*
 * The Opteron DRAM Address Map in PCI configuration space gives the base and
 * limit of physical memory in each node for Opteron K8.  The following
 * constants and macros define their contents, structure, and access.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_MASK_INTRLVEN	0x700	/* interleave */

#define	OPT_DRAMBASE_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE(reg) \
	(((u_longlong_t)reg & OPT_DRAMBASE_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_MASK_INTRLVSEL	0x70	/* interleave select */
#define	OPT_DRAMLIMIT_MASK_ADDR	0xFFFF0000	/* addr bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT(reg) \
	(((u_longlong_t)reg & OPT_DRAMLIMIT_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
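
/*
 * Worked example (hypothetical register values, for illustration only):
 * suppose node 0's DRAM Address Map registers read back as
 *
 *	base  = 0x00000003	(RE and WE set, address bits 0x0000)
 *	limit = 0x003F0000	(address bits 0x003F, destination node 0)
 *
 * Then OPT_DRAMBASE(base) = 0x0 and OPT_DRAMLIMIT(limit) = 0x3F000000, so
 * the node covers physical addresses 0x0 through
 * (0x3F000000 | OPT_DRAMADDR_MASK_OFF) = 0x3FFFFFFF, ie. the first 1 GB.
 * Similarly, a Node ID register value of 0x10 would give
 * OPT_NODE_CNT(0x10) + 1 = 2 nodes.
 */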

/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */


/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef	struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef	enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef	struct opt_dram_addr_map {
	uint32_t	base;
	uint32_t	limit;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef	struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
	int	exists;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t			opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes, causing MPO to be disabled
 */
int			lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t			lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t		lgrp_plat_flush_cost = 0;
hrtime_t		lgrp_plat_probe_cost = 0;
hrtime_t		lgrp_plat_probe_cost_total = 0;

/*
 * Error code from latency adjustment and verification
 */
int			lgrp_plat_probe_error_code = 0;

/*
 * How much latencies were off from minimum values gotten
 */
hrtime_t		lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t			lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t			lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int			lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
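
/*
 * NOTE: The probe tunables above and below (lgrp_plat_probe_nrounds,
 * lgrp_plat_probe_nsamples, lgrp_plat_probe_op, etc.) are plain kernel
 * globals, so they can presumably be patched at boot via /etc/system, e.g.:
 *
 *	set lgrp_plat_probe_nrounds = 32
 *
 * (Illustrative only; the usual caveats about tuning undocumented kernel
 * variables apply.)
 */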

/*
 * Number of samples to take when probing each node
 */
int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;

/*
 * PFN of page in each node for probing
 */
pfn_t			lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (ie. not within tolerance of value that it
 * should match)
 */
int			lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t		lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t		lgrp_plat_probe_time_max = 0;
hrtime_t		lgrp_plat_probe_time_min = -1;
hrtime_t		lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t		lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats	lgrp_stats[NLGRP];

#define	CPUID_FAMILY_OPTERON	15

uint_t	opt_family = 0;
uint_t	opt_model = 0;
uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on an AMD Opteron K8 machine
 */
int
is_opteron(void)
{
	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	if (cpuid_getfamily(CPU) == CPUID_FAMILY_OPTERON)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Skip nodes with no memory
		 */
		if (!lgrp_plat_node_memory[node].exists)
			continue;

		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}
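
/*
 * Example of the range splitting done by plat_build_mem_nodes() below
 * (hypothetical PFNs, for illustration only): if node 0 owns PFNs
 * [0, 0x3FFFF] and node 1 owns [0x40000, 0x7FFFF], a boot memlist entry
 * spanning PFNs [0x3F000, 0x41000] is handed to mem_node_add_slice() as
 * two subranges, [0x3F000, 0x3FFFF] for node 0 and [0x40000, 0x41000]
 * for node 1.
 */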

/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start;	/* start addr of subrange */
	pfn_t	cur_end;	/* end addr of subrange */
	pfn_t	start;		/* start addr of whole range */
	pfn_t	end;		/* end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so we need to split any memory
		 * range that spans multiple memnodes into subranges that
		 * are each contained within one memnode when feeding them
		 * to mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);

			/*
			 * Panic if DRAM address map registers or SRAT say
			 * memory in node doesn't exist or address from
			 * boot installed memory list entry isn't in this node.
			 * This shouldn't happen and rest of code can't deal
			 * with this if it does.
			 */
			if (node < 0 || node >= lgrp_plat_node_cnt ||
			    !lgrp_plat_node_memory[node].exists ||
			    cur_start < lgrp_plat_node_memory[node].start ||
			    cur_start > lgrp_plat_node_memory[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			if (lgrp_plat_node_memory[node].exists &&
			    cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}


/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
	uint_t	bus;
	uint_t	dev;
	uint_t	node;
	uint_t	off;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to the PCI configuration space address register to specify
	 * which configuration register to read, and read/write the PCI
	 * configuration space data register to get/set its contents.
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off = OPT_PCS_OFF_DRAMBASE;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			opt_node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		opt_dram_map[node].base = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off);
		if (opt_dram_map[node].base & OPT_DRAMBASE_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off += 4;	/* limit register offset */
		opt_dram_map[node].limit = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off);

		/*
		 * Increment device number to next node and register offset
		 * for DRAM base register of next node
		 */
		off += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((opt_dram_map[node].base & OPT_DRAMBASE_MASK_RE) == 0 ||
		    (opt_dram_map[node].base & OPT_DRAMBASE_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in lgrp_plat_node_memory[]
			 */
			lgrp_plat_node_memory[node].exists = 0;
			lgrp_plat_node_memory[node].start =
			    lgrp_plat_node_memory[node].end = (pfn_t)-1;
			continue;
		}

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		lgrp_plat_node_memory[node].exists = 1;
		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));
		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMLIMIT(opt_dram_map[node].limit) |
		    OPT_DRAMADDR_MASK_OFF);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of avoiding
		 * any anomalies in probing from messing up the lgroup
		 * topology by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.
	 * Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered the same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;


/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be the same, and make local
 * latencies be the same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LATENCY_CHANGE;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is an UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart and make the latencies
	 * be the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			int	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(cflag, local, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since we just adjusted latencies
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}


/*
 * Verify the following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is an UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with less than
	 * 4 nodes (which means less than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether we are done probing in order to verify that
	 * the number of latencies is correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if we haven't seen
					 * it before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}


/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}


/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}
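
/*
 * To illustrate what the probing below produces (hypothetical numbers,
 * for illustration only): on a 2 node system, repeated calls to
 * lgrp_plat_probe_time() might settle on
 *
 *	lgrp_plat_probe_times = { {  85, 160 },
 *				  { 155,  90 } }
 *
 * lgrp_plat_latency_adjust() would then make the matrix symmetric using the
 * minimums (both off-diagonal entries become 155) and set both local
 * latencies to the minimum local value (85), since the differences fall
 * within the 1/(2**lgrp_plat_probe_lt_shift) tolerance.
 */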

/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if we got the times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if we can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fall back to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}


/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}

	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for each sample taken to
	 * determine the lgroup topology
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate one kernel virtual page
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}

/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	lgrp = &lgrp_space[nlgrps_alloc++];
	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
		return (NULL);
	return (lgrp);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	return (MEM_NODE_2_LGRPHAND(mnode));
}

/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number
 * of nodes.  Once topology is known it returns the actual maximum number of
 * lgrps created.  Since x86 doesn't support dynamic addition of new nodes,
 * this number may not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}
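
/*
 * For example (illustrative arithmetic): with MAX_NODES = 8, NLGRP is
 * 8 * 7 + 1 = 57, and before the topology is known a 4 node system would
 * report lgrp_plat_max_lgrps() = 4 * 3 + 1 = 13.
 */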

/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup.
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup.
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t	npgs = (pgcnt_t)0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = MNODE_PGCNT(mnode);
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}

/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful
 * though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}