/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>


/*
 * lgroup platform support for x86 platforms.
 */

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

#define	LGRP_PLAT_CPU_TO_NODE(cpu)	(pg_plat_hw_instance_id(cpu, PGHW_CHIP))

#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Multiprocessor Opteron machines have Non Uniform Memory Access (NUMA).
 *
 * Until the System Resource Affinity Table (SRAT) becomes part of the ACPI
 * standard, we need to examine registers in PCI configuration space to
 * determine how many nodes are in the system and which CPUs and memory are
 * in each node.  This could be determined by probing all memory from each
 * CPU, but that is too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *	 locality info is Opteron K8 specific and not guaranteed to work on
 *	 the next generation Opteron processor.  Furthermore, we assume that
 *	 there is one CPU per node and CPU 0 is in node 0, CPU 1 is in node 1,
 *	 etc. which should be true for Opteron K8....
 */

/*
 * Opteron DRAM Address Map in PCI configuration space gives base and limit
 * of physical memory in each node for Opteron K8.  The following constants
 * and macros define their contents, structure, and access.
 */
/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_MASK_INTRLVEN	0x700	/* interleave */

#define	OPT_DRAMBASE_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE(reg) \
	(((u_longlong_t)reg & OPT_DRAMBASE_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_MASK_INTRLVSEL	0x70	/* interleave select */
#define	OPT_DRAMLIMIT_MASK_ADDR	0xFFFF0000	/* addr bits 39-24 */

/*
 * Macros to get values from Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT(reg) \
	(((u_longlong_t)reg & OPT_DRAMLIMIT_MASK_ADDR) << \
	    OPT_DRAMADDR_LSHIFT_ADDR)


/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
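
/*
 * Worked example of the macros above (register values are hypothetical and
 * for illustration only): a DRAM Address Map base register reading
 * 0x00400003 has RE and WE set and address bits 0x0040 in bits 31:16, so
 * OPT_DRAMBASE() yields (0x00400000 << 8) == 0x40000000, ie. the node's
 * memory starts at 1 GB.  A Node ID register reading 0x30 has a node count
 * field of 3 in bits 6:4, so OPT_NODE_CNT() returns 3, meaning a 4 node
 * system (the field holds the number of nodes minus 1).
 */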

/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */


/*
 * Bookkeeping for latencies seen during probing (used for verification)
 */
typedef	struct lgrp_plat_latency_acct {
	hrtime_t	la_value;	/* latency value */
	int		la_count;	/* occurrences */
} lgrp_plat_latency_acct_t;


/*
 * Choices for probing to determine lgroup topology
 */
typedef	enum lgrp_plat_probe_op {
	LGRP_PLAT_PROBE_PGCPY,		/* Use page copy */
	LGRP_PLAT_PROBE_VENDOR		/* Read vendor ID on Northbridge */
} lgrp_plat_probe_op_t;


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef	struct opt_dram_addr_map {
	uint32_t	base;
	uint32_t	limit;
} opt_dram_addr_map_t;


/*
 * Starting and ending page for physical memory in node
 */
typedef	struct phys_addr_map {
	pfn_t	start;
	pfn_t	end;
	int	exists;
} phys_addr_map_t;


/*
 * Opteron DRAM address map for each node
 */
struct opt_dram_addr_map	opt_dram_map[MAX_NODES];

/*
 * Node ID register contents for each node
 */
uint_t				opt_node_info[MAX_NODES];

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
int			lgrp_plat_mem_intrlv = 0;

/*
 * Number of nodes in system
 */
uint_t			lgrp_plat_node_cnt = 1;

/*
 * Physical address range for memory in each node
 */
phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];

/*
 * Probe costs (individual and total) and flush cost
 */
hrtime_t		lgrp_plat_flush_cost = 0;
hrtime_t		lgrp_plat_probe_cost = 0;
hrtime_t		lgrp_plat_probe_cost_total = 0;

/*
 * Error code for latency adjustment and verification
 */
int			lgrp_plat_probe_error_code = 0;

/*
 * How far latencies were off from the minimum values seen
 */
hrtime_t		lgrp_plat_probe_errors[MAX_NODES][MAX_NODES];

/*
 * Unique probe latencies and number of occurrences of each
 */
lgrp_plat_latency_acct_t	lgrp_plat_probe_lat_acct[MAX_NODES];

/*
 * Size of memory buffer in each node for probing
 */
size_t			lgrp_plat_probe_memsize = 0;

/*
 * Virtual address of page in each node for probing
 */
caddr_t			lgrp_plat_probe_memory[MAX_NODES];

/*
 * Number of unique latencies in probe times
 */
int			lgrp_plat_probe_nlatencies = 0;

/*
 * How many rounds of probing to do
 */
int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;

/*
 * Number of samples to take when probing each node
 */
int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;

/*
 * Number of times to read vendor ID from Northbridge for each probe.
 */
int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * How to probe to determine lgroup topology
 */
lgrp_plat_probe_op_t	lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR;

/*
 * PFN of page in each node for probing
 */
pfn_t			lgrp_plat_probe_pfn[MAX_NODES];

/*
 * Whether probe time was suspect (ie. not within tolerance of value that it
 * should match)
 */
int			lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES];

/*
 * How long it takes to access memory from each node
 */
hrtime_t		lgrp_plat_probe_times[MAX_NODES][MAX_NODES];

/*
 * Min and max node memory probe times seen
 */
hrtime_t		lgrp_plat_probe_time_max = 0;
hrtime_t		lgrp_plat_probe_time_min = -1;
hrtime_t		lgrp_plat_probe_max[MAX_NODES][MAX_NODES];
hrtime_t		lgrp_plat_probe_min[MAX_NODES][MAX_NODES];


/*
 * Allocate lgrp and lgrp stat arrays statically.
 */
static lgrp_t	lgrp_space[NLGRP];
static int	nlgrps_alloc;

struct lgrp_stats	lgrp_stats[NLGRP];

#define	CPUID_FAMILY_OPTERON	15

uint_t	opt_family = 0;
uint_t	opt_model = 0;
uint_t	opt_probe_func = OPT_PCS_FUNC_DRAM;


/*
 * Determine whether we're running on an AMD Opteron K8 machine
 */
int
is_opteron(void)
{
	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	if (cpuid_getfamily(CPU) == CPUID_FAMILY_OPTERON)
		return (1);
	else
		return (0);
}

int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}

lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)mnode);
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Skip nodes with no memory
		 */
		if (!lgrp_plat_node_memory[node].exists)
			continue;

		if (pfn >= lgrp_plat_node_memory[node].start &&
		    pfn <= lgrp_plat_node_memory[node].end)
			return (node);
	}

	ASSERT(node < lgrp_plat_node_cnt);
	return (-1);
}
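
/*
 * For example (hypothetical node memory map): if node 0 owns PFNs
 * 0 - 0x3FFFF and node 1 owns PFNs 0x40000 - 0x7FFFF, then
 * plat_pfn_to_mem_node(0x41000) returns 1, while a PFN outside every
 * node's range trips the ASSERT on DEBUG kernels and returns -1 otherwise.
 */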

/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t	cur_start;	/* start addr of subrange */
	pfn_t	cur_end;	/* end addr of subrange */
	pfn_t	start;		/* start addr of whole range */
	pfn_t	end;		/* end addr of whole range */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->address >> PAGESHIFT;
		end = (list->address + list->size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so need to split any memory range
		 * that spans multiple memnodes into subranges that are each
		 * contained within one memnode when feeding them to
		 * mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);

			/*
			 * Panic if DRAM address map registers or SRAT say
			 * memory in node doesn't exist or address from
			 * boot installed memory list entry isn't in this node.
			 * This shouldn't happen and rest of code can't deal
			 * with this if it does.
			 */
			if (node < 0 || node >= lgrp_plat_node_cnt ||
			    !lgrp_plat_node_memory[node].exists ||
			    cur_start < lgrp_plat_node_memory[node].start ||
			    cur_start > lgrp_plat_node_memory[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			if (lgrp_plat_node_memory[node].exists &&
			    cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}


/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(void)
{
	uint_t	bus;
	uint_t	dev;
	uint_t	node;
	uint_t	off;

	extern lgrp_load_t	lgrp_expand_proc_thresh;
	extern lgrp_load_t	lgrp_expand_proc_diff;

	/*
	 * Initialize as a UMA machine if this isn't an Opteron
	 */
	if (!is_opteron() || lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}
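
	/*
	 * Hypothetical example of the register decoding done below: if node
	 * 0's DRAM base register reads 0x00000003 (RE|WE set, base 0) and
	 * its limit register reads 0x003F0000, node 0 owns physical
	 * addresses [0, 0x3FFFFFFF], ie. PFNs 0 - 0x3FFFF with 4K pages.
	 * A node 1 base of 0x00400003 and limit of 0x007F0000 then covers
	 * [0x40000000, 0x7FFFFFFF], ie. PFNs 0x40000 - 0x7FFFF.
	 */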
	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off = OPT_PCS_OFF_DRAMBASE;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1;

	for (node = 0; node < lgrp_plat_node_cnt; node++) {
		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			opt_node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		opt_dram_map[node].base = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off);
		if (opt_dram_map[node].base & OPT_DRAMBASE_MASK_INTRLVEN)
			lgrp_plat_mem_intrlv++;

		off += 4;	/* limit register offset */
		opt_dram_map[node].limit = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off);

		/*
		 * Increment device number to next node and register offset for
		 * DRAM base register of next node
		 */
		off += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((opt_dram_map[node].base & OPT_DRAMBASE_MASK_RE) == 0 ||
		    (opt_dram_map[node].base & OPT_DRAMBASE_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in lgrp_plat_node_memory[]
			 */
			lgrp_plat_node_memory[node].exists = 0;
			lgrp_plat_node_memory[node].start =
			    lgrp_plat_node_memory[node].end = (pfn_t)-1;
			continue;
		}

		/*
		 * Get PFN for first page in each node,
		 * so we can probe memory to determine latency topology
		 */
		lgrp_plat_probe_pfn[node] =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		lgrp_plat_node_memory[node].exists = 1;
		lgrp_plat_node_memory[node].start =
		    btop(OPT_DRAMBASE(opt_dram_map[node].base));
		lgrp_plat_node_memory[node].end =
		    btop(OPT_DRAMLIMIT(opt_dram_map[node].limit) |
		    OPT_DRAMADDR_MASK_OFF);
	}

	/*
	 * Only use one memory node if memory is interleaved between any nodes
	 */
	if (lgrp_plat_mem_intrlv) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
	} else {
		max_mem_nodes = lgrp_plat_node_cnt;

		/*
		 * Probing errors can mess up the lgroup topology and force us
		 * to fall back to a 2 level lgroup topology.  Here we bound
		 * how tall the lgroup topology can grow in hopes of avoiding
		 * any anomalies in probing from messing up the lgroup
		 * topology by limiting the accuracy of the latency topology.
		 *
		 * Assume that nodes will at least be configured in a ring,
		 * so limit height of lgroup topology to be less than number
		 * of nodes on a system with 4 or more nodes
		 */
		if (lgrp_plat_node_cnt >= 4 &&
		    lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default())
			(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
	}

	/*
	 * Lgroups on Opteron architectures have but a single physical
	 * processor.  Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff
	 * so that lgrp_choose() will spread things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;


/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be same, and make local
 * latencies be same
 */
static void
lgrp_plat_latency_adjust(void)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is an UMA machine
	 */
	if (max_mem_nodes == 1)
		return;

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 */
			if (t1 > t2) {
				t = t2;
				lgrp_plat_probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				lgrp_plat_probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					lgrp_plat_probe_suspect[i][j]++;
					lgrp_plat_probe_suspect[j][i]++;
				}
			}

			lgrp_plat_probe_times[i][j] =
			    lgrp_plat_probe_times[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}

	/*
	 * Keep track of which latencies get corrected
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes
	 * which are about the same distance apart and make the latencies be
	 * the same if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lgrp_plat_probe_times[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++)
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lgrp_plat_probe_times[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */
					if (t2 == 0)
						continue;
					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to
					 * get more consistency (if possible),
					 * and remember which latencies were
					 * adjusted for next time
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lgrp_plat_probe_times[i][j] =
					    lgrp_plat_probe_times[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
		}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		t = lgrp_plat_probe_times[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			int	local;

			local = lgrp_plat_probe_times[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				lgrp_plat_probe_suspect[i][i]++;

			lgrp_plat_probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
			lgrp_plat_probe_times[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since just adjusted latencies
	 */
	lgrp_plat_probe_time_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t = lgrp_plat_probe_times[i][j];
			if (t > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = t;
		}
}
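
/*
 * Example of the tolerance rule used above (times are illustrative): with
 * the default lgrp_plat_probe_lt_shift of 4, probe times of 95 and 90
 * differ by 5, which does not exceed 95 >> 4 == 5, so they are close
 * enough to be normalized to one value; probe times of 100 and 90 differ
 * by 10, which exceeds 100 >> 4 == 6, so they are left distinct.
 */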

/*
 * Verify following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is an UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with fewer than
	 * 4 nodes (which means fewer than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether probing is done in order to verify that the
	 * number of latencies is correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if haven't seen
					 * before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}
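
/*
 * For example (illustrative, assuming probing is complete): a 2 node
 * system should show exactly 2 unique latencies (local and remote), while
 * a 4 node system with its HyperTransport links wired as a square should
 * show 3 (local, one hop, and two hops), which is why the checks above
 * expect fewer unique latencies than nodes once there are more than 2
 * nodes.
 */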

/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}


/*
 * Return time needed to probe from current CPU to memory in given node
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	int		cnt;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			for (cnt = 0; cnt < lgrp_plat_probe_nreads; cnt++)
				dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = (end - start) / lgrp_plat_probe_nreads;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}
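
/*
 * NOTE: With the default tunables, each call to lgrp_plat_probe_time()
 * takes one sample consisting of 256 back-to-back vendor ID reads, and
 * lgrp_plat_probe() below repeats this for 64 rounds per node, keeping
 * the minimum time seen as the latency, so transient interference can
 * only inflate individual samples, not the retained value.
 */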

/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if got times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fallback to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}


/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if there is only one node or the
	 * height of the lgroup topology is less than or equal to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 &&
		    lgrp_plat_probe_time_max == 0)
			lgrp_plat_2level_setup();
		return;
	}
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = LGRP_PLAT_CPU_TO_NODE(CPU);
		if (lgrp_plat_probe_times[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for every sample taken so that
	 * each sample copies a different page while determining the lgroup
	 * topology
	 */
	if (lgrp_plat_probe_memsize == 0)
		lgrp_plat_probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_memory[i] = NULL;
			continue;
		}

		/*
		 * Allocate one kernel virtual page
		 */
		lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_memory[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_memory[i],
		    lgrp_plat_probe_memsize,
		    lgrp_plat_probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}

/*
 * Allocate additional space for an lgroup.
 */
/* ARGSUSED */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	lgrp = &lgrp_space[nlgrps_alloc++];
	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
		return (NULL);
	return (lgrp);
}

/*
 * Platform handling for (re)configuration changes
 */
/* ARGSUSED */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
}

/*
 * Return the platform handle for the lgroup containing the given CPU
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id]));
}

/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
/* ARGSUSED */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	if (pfn > physmax)
		return (LGRP_NULL_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	if (mnode < 0)
		return (LGRP_NULL_HANDLE);

	return (MEM_NODE_2_LGRPHAND(mnode));
}

/*
 * Return the maximum number of lgrps supported by the platform.
 * Before the lgrp topology is known, it returns an estimate based on the
 * number of nodes.  Once the topology is known, it returns the actual
 * maximum number of lgrps created.  Since x86 doesn't support dynamic
 * addition of new nodes, this number may not grow during system lifetime.
 */
int
lgrp_plat_max_lgrps()
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}
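
/*
 * For example (illustrative): before the lgroup topology is known, the
 * routine above estimates a 4 node system at 4 * (4 - 1) + 1 == 13
 * potential lgroups, which is the same formula NLGRP uses to size the
 * static lgrp_space[] array for the MAX_NODES worst case (8 * 7 + 1 == 57).
 */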

/*
 * Return the number of free, allocatable, or installed
 * pages in an lgroup
 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
 */
/* ARGSUSED */
static pgcnt_t
lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
{
	struct memlist	*mlist;
	pgcnt_t		npgs = 0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;

	switch (query) {
	case LGRP_MEM_SIZE_FREE:
		return ((pgcnt_t)freemem);
	case LGRP_MEM_SIZE_AVAIL:
		memlist_read_lock();
		for (mlist = phys_avail; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	case LGRP_MEM_SIZE_INSTALL:
		memlist_read_lock();
		for (mlist = phys_install; mlist; mlist = mlist->next)
			npgs += btop(mlist->size);
		memlist_read_unlock();
		return (npgs);
	default:
		return ((pgcnt_t)0);
	}
}

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t	npgs = (pgcnt_t)0;
	extern struct memlist	*phys_avail;
	extern struct memlist	*phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		mnode = plat_lgrphand_to_mem_node(plathand);
		if (mnode >= 0 && mem_node_config[mnode].exists) {
			switch (query) {
			case LGRP_MEM_SIZE_FREE:
				npgs = MNODE_PGCNT(mnode);
				break;
			case LGRP_MEM_SIZE_AVAIL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_avail);
				break;
			case LGRP_MEM_SIZE_INSTALL:
				npgs = mem_node_memlist_pages(mnode,
				    phys_install);
				break;
			default:
				break;
			}
		}
	}
	return (npgs);
}

/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_probe_time_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	if (lgrp_plat_probe_times[src][src] == 0 &&
	    LGRP_PLAT_CPU_TO_NODE(CPU) == src)
		lgrp_plat_probe();

	return (lgrp_plat_probe_times[src][dest]);
}

/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}