xref: /titanic_51/usr/src/uts/i86pc/os/lgrpplat.c (revision 5b98b50905ca1ba7dd1e3c1d6e0a4f8460cc1b9f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 
30 /*
31  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
32  * ================================================================
33  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
34  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
35  * one or more CPUs and some local memory.  The CPUs in each node can access
36  * the memory in the other nodes but at a higher latency than accessing their
37  * local memory.  Typically, a system with only one node has Uniform Memory
38  * Access (UMA), but it may be possible to have a one node system that has
39  * some global memory outside of the node which is higher latency.
40  *
41  * Module Description
42  * ------------------
43  * This module provides a platform interface for determining which CPUs and
44  * which memory (and how much) are in a NUMA node and how far each node is from
45  * each other.  The interface is used by the Virtual Memory (VM) system and the
46  * common lgroup framework.  The VM system uses the plat_*() routines to fill
47  * in its memory node (memnode) array with the physical address range spanned
48  * by each NUMA node to know which memory belongs to which node, so it can
49  * build and manage a physical page free list for each NUMA node and allocate
50  * local memory from each node as needed.  The common lgroup framework uses the
51  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
52  * to each node (leaf lgroup) and how far each node is from each other, so it
53  * can build the latency (lgroup) topology for the machine in order to optimize
54  * for locality.  Also, lgroup platform handles instead of lgroups are used
55  * in the interface with this module, so this module shouldn't need to know
56  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
57  * etc. are in each NUMA node, how far each node is from each other, and to use
58  * a unique lgroup platform handle to refer to each node through the interface.
59  *
60  * Determining NUMA Configuration
61  * ------------------------------
62  * By default, this module will try to determine the NUMA configuration of the
63  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
64  * Locality Information Table (SLIT).  The SRAT contains info to tell which
65  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
66  * is a matrix that gives the distance between each system locality (which is
67  * a NUMA node and should correspond to proximity domains in the SRAT).  For
68  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
69  * specification.
70  *
71  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
72  * examine registers in PCI configuration space to determine how many nodes are
73  * in the system and which CPUs and memory are in each node.
74  * This is done while booting the kernel.
75  *
76  * NOTE: Using these PCI configuration space registers to determine this
77  *       locality info is not guaranteed to work or be compatible across all
78  *	 Opteron processor families.
79  *
80  * If the SLIT does not exist or look right, the kernel will probe to determine
81  * the distance between nodes as long as the NUMA CPU and memory configuration
82  * has been determined (see lgrp_plat_probe() for details).
83  *
84  * Data Structures
85  * ---------------
86  * The main data structures used by this code are the following:
87  *
88  * - lgrp_plat_cpu_node[]		APIC ID to node ID mapping table
89  *					indexed by hashed APIC ID (only used
90  *					for SRAT)
91  *
92  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
93  *					different nodes indexed by node ID
94  *
95  * - lgrp_plat_node_cnt			Number of NUMA nodes in system
96  *
97  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
98  *					table indexed by node ID (only used
99  *					for SRAT)
100  *
101  * - lgrp_plat_node_memory[]		Table with physical address range for
102  *					each node indexed by node ID
103  *
104  * The code is implemented to make the following always be true:
105  *
106  *	lgroup platform handle == node ID == memnode ID
107  *
108  * Moreover, it allows for the proximity domain ID to be equal to all of the
109  * above as long as the proximity domains IDs are numbered from 0 to <number of
110  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
111  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
112  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
113  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
114  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
115  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
116  * to node IDs.  However, the proximity domain IDs may not map to the
117  * equivalent node ID since we want to keep the node IDs numbered from 0 to
118  * <number of nodes - 1> to minimize cost of searching and potentially space.
119  */
120 
121 
122 #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
123 #include <sys/cmn_err.h>
124 #include <sys/controlregs.h>
125 #include <sys/cpupart.h>
126 #include <sys/cpuvar.h>
127 #include <sys/lgrp.h>
128 #include <sys/machsystm.h>
129 #include <sys/memlist.h>
130 #include <sys/memnode.h>
131 #include <sys/mman.h>
132 #include <sys/pci_cfgspace.h>
133 #include <sys/pci_impl.h>
134 #include <sys/param.h>
135 #include <sys/pghw.h>
136 #include <sys/promif.h>		/* for prom_printf() */
137 #include <sys/sysmacros.h>
138 #include <sys/systm.h>
139 #include <sys/thread.h>
140 #include <sys/types.h>
141 #include <sys/var.h>
142 #include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
143 #include <vm/hat_i86.h>
144 #include <vm/seg_kmem.h>
145 #include <vm/vm_dep.h>
146 
147 #include "acpi_fw.h"		/* for SRAT and SLIT */
148 
149 
150 #define	MAX_NODES		8
151 #define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)
152 
153 /*
154  * Constants for configuring probing
155  */
156 #define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
157 #define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
158 #define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */
159 
160 /*
161  * Flags for probing
162  */
163 #define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
164 #define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
165 #define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
166 
167 /*
168  * Hash CPU APIC ID into CPU to node mapping table using max_ncpus
169  * to minimize span of entries used
170  */
171 #define	CPU_NODE_HASH(apicid)		((apicid) % max_ncpus)
172 
173 /*
174  * Hash proximity domain ID into node to domain mapping table using
175  * lgrp_plat_node_cnt to minimize span of entries used
176  */
177 #define	NODE_DOMAIN_HASH(domain)	((domain) % lgrp_plat_node_cnt)
178 
179 
180 /*
181  * CPU APIC ID to node ID mapping structure (only used with SRAT)
182  */
183 typedef	struct cpu_node_map {
184 	int		exists;		/* presumably nonzero once entry is filled in -- TODO confirm */
185 	uint_t		node;		/* node ID that the CPU maps to */
186 	uint32_t	apicid;		/* CPU APIC ID */
187 	uint32_t	prox_domain;	/* presumably SRAT proximity domain ID -- TODO confirm */
188 } cpu_node_map_t;
189 
190 /*
191  * Latency statistics
192  */
193 typedef struct lgrp_plat_latency_stats {
194 	hrtime_t	latencies[MAX_NODES][MAX_NODES];	/* [from][to], by node ID */
195 	hrtime_t	latency_max;	/* reported for root lgroup; set to 0 by lgrp_plat_init() */
196 	hrtime_t	latency_min;	/* set to -1 by lgrp_plat_init() before SLIT/probing */
197 } lgrp_plat_latency_stats_t;
198 
199 /*
200  * Memory configuration for probing
201  */
202 typedef struct lgrp_plat_probe_mem_config {
203 	size_t	probe_memsize;		/* bytes to probe per node (0 = not chosen yet) */
204 	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing (NULL if none) */
205 	pfn_t	probe_pfn[MAX_NODES];	/* first physical page mapped for probing */
206 } lgrp_plat_probe_mem_config_t;
207 
208 /*
209  * Statistics kept for probing
210  */
211 typedef struct lgrp_plat_probe_stats {
212 	hrtime_t	flush_cost;
213 	hrtime_t	probe_cost;
214 	hrtime_t	probe_cost_total;
215 	hrtime_t	probe_error_code;
216 	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];	/* NOTE(review): the 2D tables are presumably [from][to] by node ID -- confirm */
217 	int		probe_suspect[MAX_NODES][MAX_NODES];
218 	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
219 	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
220 } lgrp_plat_probe_stats_t;
221 
222 /*
223  * Node to proximity domain ID mapping structure (only used with SRAT)
224  */
225 typedef	struct node_domain_map {
226 	int		exists;		/* presumably nonzero once slot is assigned -- TODO confirm */
227 	uint32_t	prox_domain;	/* proximity domain ID assigned to this node ID */
228 } node_domain_map_t;
229 
230 /*
231  * Node ID and starting and ending page for physical memory in node
232  */
233 typedef	struct node_phys_addr_map {
234 	pfn_t		start;		/* first page frame number in node */
235 	pfn_t		end;		/* last page frame number in node (inclusive) */
236 	int		exists;		/* nonzero if node has memory */
237 	uint32_t	prox_domain;	/* presumably SRAT proximity domain ID -- TODO confirm */
238 } node_phys_addr_map_t;
239 
240 
241 /*
242  * CPU APIC ID to node ID mapping table (only used for SRAT)
243  */
244 static cpu_node_map_t			lgrp_plat_cpu_node[NCPU];
245 
246 /*
247  * Latency statistics
248  */
249 lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;
250 
251 /*
252  * Whether memory is interleaved across nodes causing MPO to be disabled
253  */
254 static int				lgrp_plat_mem_intrlv = 0;
255 
256 /*
257  * Node ID to proximity domain ID mapping table (only used for SRAT)
258  */
259 static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];
260 
261 /*
262  * Physical address range for memory in each node
263  */
264 static node_phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];
265 
266 /*
267  * Statistics gotten from probing
268  */
269 static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;
270 
271 /*
272  * Memory configuration for probing
273  */
274 static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;
275 
276 /*
277  * Error code from processing ACPI SRAT
278  */
279 static int				lgrp_plat_srat_error = 0;
280 
281 /*
282  * Error code from processing ACPI SLIT
283  */
284 static int				lgrp_plat_slit_error = 0;
285 
286 /*
287  * Allocate lgroup array statically
288  */
289 static lgrp_t				lgrp_space[NLGRP];
290 static int				nlgrps_alloc;
291 
292 
293 /*
294  * Number of nodes in system
295  */
296 uint_t			lgrp_plat_node_cnt = 1;
297 
298 /*
299  * Configuration Parameters for Probing
300  * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
301  *				operation, etc.
302  * - lgrp_plat_probe_nrounds	How many rounds of probing to do
303  * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
304  *				node
305  * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
306  *				Northbridge for each probe
307  */
308 uint_t			lgrp_plat_probe_flags = 0;
309 int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
310 int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
311 int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
312 
313 /*
314  * Enable use of ACPI System Resource Affinity Table (SRAT) and System
315  * Locality Information Table (SLIT)
316  */
317 int			lgrp_plat_srat_enable = 1;
318 int			lgrp_plat_slit_enable = 1;
319 
320 /*
321  * Static array to hold lgroup statistics
322  */
323 struct lgrp_stats	lgrp_stats[NLGRP];
324 
325 
326 /*
327  * Forward declarations of platform interface routines
328  */
329 void		plat_build_mem_nodes(struct memlist *list);
330 
331 int		plat_lgrphand_to_mem_node(lgrp_handle_t hand);
332 
333 lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);
334 
335 int		plat_mnode_xcheck(pfn_t pfncnt);
336 
337 int		plat_pfn_to_mem_node(pfn_t pfn);
338 
339 /*
340  * Forward declarations of lgroup platform interface routines
341  */
342 lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);
343 
344 void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
345 
346 lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
347 
348 void		lgrp_plat_init(void);
349 
350 int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
351 
352 void		lgrp_plat_main_init(void);
353 
354 int		lgrp_plat_max_lgrps(void);
355 
356 pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
357     lgrp_mem_query_t query);
358 
359 lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);
360 
361 void		lgrp_plat_probe(void);
362 
363 lgrp_handle_t	lgrp_plat_root_hand(void);
364 
365 
366 /*
367  * Forward declarations of local routines
368  */
369 static int	is_opteron(void);
370 
371 static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
372 
373 static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
374     uint32_t domain);
375 
376 static void	lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
377     lgrp_plat_latency_stats_t *lat_stats,
378     lgrp_plat_probe_stats_t *probe_stats);
379 
380 static int	lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
381     lgrp_plat_latency_stats_t *lat_stats);
382 
383 static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
384 
385 static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
386     uint32_t domain);
387 
388 static int	lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
389     node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
390     uint32_t domain);
391 
392 static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
393     lgrp_plat_probe_mem_config_t *probe_mem_config,
394     lgrp_plat_latency_stats_t *lat_stats,
395     lgrp_plat_probe_stats_t *probe_stats);
396 
397 static int	lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
398     node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats);
399 
400 static int	lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt,
401     node_domain_map_t *node_domain, cpu_node_map_t *cpu_node,
402     node_phys_addr_map_t *node_memory);
403 
404 static int	lgrp_plat_srat_domains(struct srat *tp);
405 
406 static void	lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
407     lgrp_plat_latency_stats_t *lat_stats);
408 
409 static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
410     node_phys_addr_map_t *node_memory);
411 
412 static hrtime_t	opt_probe_vendor(int dest_node, int nreads);
413 
414 
415 /*
416  * PLATFORM INTERFACE ROUTINES
417  */
418 
419 /*
420  * Configure memory nodes for machines with more than one node (ie NUMA)
421  */
422 void
423 plat_build_mem_nodes(struct memlist *list)
424 {
425 	pfn_t		cur_start;	/* start addr of subrange */
426 	pfn_t		cur_end;	/* end addr of subrange */
427 	pfn_t		start;		/* start addr of whole range */
428 	pfn_t		end;		/* end addr of whole range */
429 
430 	/*
431 	 * Boot install lists are arranged <addr, len>, ...
432 	 */
433 	while (list) {
434 		int	node;
435 
436 		start = list->address >> PAGESHIFT;
437 		end = (list->address + list->size - 1) >> PAGESHIFT;
438 
439 		if (start > physmax) {
440 			list = list->next;
441 			continue;
442 		}
443 		if (end > physmax)
444 			end = physmax;
445 
446 		/*
447 		 * When there is only one memnode, just add memory to memnode
448 		 */
449 		if (max_mem_nodes == 1) {
450 			mem_node_add_slice(start, end);
451 			list = list->next;
452 			continue;
453 		}
454 
455 		/*
456 		 * mem_node_add_slice() expects to get a memory range that
457 		 * is within one memnode, so need to split any memory range
458 		 * that spans multiple memnodes into subranges that are each
459 		 * contained within one memnode when feeding them to
460 		 * mem_node_add_slice()
461 		 */
462 		cur_start = start;
463 		do {
464 			node = plat_pfn_to_mem_node(cur_start);
465 
466 			/*
467 			 * Panic if DRAM address map registers or SRAT say
468 			 * memory in node doesn't exist or address from
469 			 * boot installed memory list entry isn't in this node.
470 			 * This shouldn't happen and rest of code can't deal
471 			 * with this if it does.
472 			 */
473 			if (node < 0 || node >= lgrp_plat_node_cnt ||
474 			    !lgrp_plat_node_memory[node].exists ||
475 			    cur_start < lgrp_plat_node_memory[node].start ||
476 			    cur_start > lgrp_plat_node_memory[node].end) {
477 				cmn_err(CE_PANIC, "Don't know which memnode "
478 				    "to add installed memory address 0x%lx\n",
479 				    cur_start);
480 			}
481 
482 			/*
483 			 * End of current subrange should not span memnodes
484 			 */
485 			cur_end = end;
486 			if (lgrp_plat_node_memory[node].exists &&
487 			    cur_end > lgrp_plat_node_memory[node].end)
488 				cur_end = lgrp_plat_node_memory[node].end;
489 
490 			mem_node_add_slice(cur_start, cur_end);
491 
492 			/*
493 			 * Next subrange starts after end of current one
494 			 */
495 			cur_start = cur_end + 1;
496 		} while (cur_end < end);
497 
498 		list = list->next;
499 	}
500 	mem_node_physalign = 0;
501 	mem_node_pfn_shift = 0;
502 }
503 
504 
505 int
506 plat_lgrphand_to_mem_node(lgrp_handle_t hand)
507 {
508 	if (max_mem_nodes == 1)
509 		return (0);
510 
511 	return ((int)hand);
512 }
513 
514 
515 /*
516  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
517  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
518  * a crossing is found and returns 0 otherwise.
519  */
520 int
521 plat_mnode_xcheck(pfn_t pfncnt)
522 {
523 	int	node, prevnode = -1, basenode;
524 	pfn_t	ea, sa;
525 
526 	for (node = 0; node < lgrp_plat_node_cnt; node++) {
527 
528 		if (lgrp_plat_node_memory[node].exists == 0)
529 			continue;
530 
531 		if (prevnode == -1) {
532 			prevnode = node;
533 			basenode = node;
534 			continue;
535 		}
536 
537 		/* assume x86 node pfn ranges are in increasing order */
538 		ASSERT(lgrp_plat_node_memory[node].start >
539 		    lgrp_plat_node_memory[prevnode].end);
540 
541 		/*
542 		 * continue if the starting address of node is not contiguous
543 		 * with the previous node.
544 		 */
545 
546 		if (lgrp_plat_node_memory[node].start !=
547 		    (lgrp_plat_node_memory[prevnode].end + 1)) {
548 			basenode = node;
549 			prevnode = node;
550 			continue;
551 		}
552 
553 		/* check if the starting address of node is pfncnt aligned */
554 		if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) {
555 
556 			/*
557 			 * at this point, node starts at an unaligned boundary
558 			 * and is contiguous with the previous node(s) to
559 			 * basenode. Check if there is an aligned contiguous
560 			 * range of length pfncnt that crosses this boundary.
561 			 */
562 
563 			sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end,
564 			    pfncnt);
565 			ea = P2ROUNDUP((lgrp_plat_node_memory[node].start),
566 			    pfncnt);
567 
568 			ASSERT((ea - sa) == pfncnt);
569 			if (sa >= lgrp_plat_node_memory[basenode].start &&
570 			    ea <= (lgrp_plat_node_memory[node].end + 1))
571 				return (1);
572 		}
573 		prevnode = node;
574 	}
575 	return (0);
576 }
577 
578 
579 lgrp_handle_t
580 plat_mem_node_to_lgrphand(int mnode)
581 {
582 	if (max_mem_nodes == 1)
583 		return (LGRP_DEFAULT_HANDLE);
584 
585 	return ((lgrp_handle_t)mnode);
586 }
587 
588 
589 int
590 plat_pfn_to_mem_node(pfn_t pfn)
591 {
592 	int	node;
593 
594 	if (max_mem_nodes == 1)
595 		return (0);
596 
597 	for (node = 0; node < lgrp_plat_node_cnt; node++) {
598 		/*
599 		 * Skip nodes with no memory
600 		 */
601 		if (!lgrp_plat_node_memory[node].exists)
602 			continue;
603 
604 		if (pfn >= lgrp_plat_node_memory[node].start &&
605 		    pfn <= lgrp_plat_node_memory[node].end)
606 			return (node);
607 	}
608 
609 	/*
610 	 * Didn't find memnode where this PFN lives which should never happen
611 	 */
612 	ASSERT(node < lgrp_plat_node_cnt);
613 	return (-1);
614 }
615 
616 
617 /*
618  * LGROUP PLATFORM INTERFACE ROUTINES
619  */
620 
621 /*
622  * Allocate additional space for an lgroup.
623  */
624 /* ARGSUSED */
625 lgrp_t *
626 lgrp_plat_alloc(lgrp_id_t lgrpid)
627 {
628 	lgrp_t *lgrp;
629 
630 	lgrp = &lgrp_space[nlgrps_alloc++];
631 	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
632 		return (NULL);
633 	return (lgrp);
634 }
635 
636 
637 /*
638  * Platform handling for (re)configuration changes (currently a no-op)
639  */
640 /* ARGSUSED */
641 void
642 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
643 {
644 }
645 
646 
647 /*
648  * Return the platform handle for the lgroup containing the given CPU
649  */
650 /* ARGSUSED */
651 lgrp_handle_t
652 lgrp_plat_cpu_to_hand(processorid_t id)
653 {
654 	lgrp_handle_t	hand;
655 
656 	if (lgrp_plat_node_cnt == 1)
657 		return (LGRP_DEFAULT_HANDLE);
658 
659 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
660 	    lgrp_plat_cpu_node);
661 
662 	ASSERT(hand != (lgrp_handle_t)-1);
663 	if (hand == (lgrp_handle_t)-1)
664 		return (LGRP_NULL_HANDLE);
665 
666 	return (hand);
667 }
668 
669 
670 /*
671  * Platform-specific initialization of lgroups
672  */
673 void
674 lgrp_plat_init(void)
675 {
676 #if defined(__xpv)
677 	/*
678 	 * XXPV	For now, the hypervisor treats all memory equally.
679 	 */
680 	lgrp_plat_node_cnt = max_mem_nodes = 1;
681 #else	/* __xpv */
682 	uint_t	probe_op;
683 
684 	/*
685 	 * Initialize as a UMA machine
686 	 */
687 	if (lgrp_topo_ht_limit() == 1) {
688 		lgrp_plat_node_cnt = max_mem_nodes = 1;
689 		return;
690 	}
691 
692 	/*
693 	 * Determine which CPUs and memory are local to each other and number
694 	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
695 	 */
696 	lgrp_plat_srat_error = lgrp_plat_process_srat(srat_ptr,
697 	    &lgrp_plat_node_cnt, lgrp_plat_node_domain, lgrp_plat_cpu_node,
698 	    lgrp_plat_node_memory);
699 
700 	/*
701 	 * Try to use PCI config space registers on Opteron if SRAT doesn't
702 	 * exist or there is some error processing the SRAT
703 	 */
704 	if (lgrp_plat_srat_error != 0 && is_opteron())
705 		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
706 		    lgrp_plat_node_memory);
707 
708 	/*
709 	 * Don't bother to setup system for multiple lgroups and only use one
710 	 * memory node when memory is interleaved between any nodes or there is
711 	 * only one NUMA node
712 	 *
713 	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
714 	 *	 when and if it happens for x86/x64
715 	 */
716 	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
717 		lgrp_plat_node_cnt = max_mem_nodes = 1;
718 		(void) lgrp_topo_ht_limit_set(1);
719 		return;
720 	}
721 
722 	/*
723 	 * Leaf lgroups on x86/x64 architectures contain one physical
724 	 * processor chip. Tune lgrp_expand_proc_thresh and
725 	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
726 	 * things out aggressively.
727 	 */
728 	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
729 	lgrp_expand_proc_diff = 0;
730 
731 	/*
732 	 * There should be one memnode (physical page free list(s)) for
733 	 * each node
734 	 */
735 	max_mem_nodes = lgrp_plat_node_cnt;
736 
737 	/*
738 	 * Initialize min and max latency before reading SLIT or probing
739 	 */
740 	lgrp_plat_lat_stats.latency_min = -1;
741 	lgrp_plat_lat_stats.latency_max = 0;
742 
743 	/*
744 	 * Determine how far each NUMA node is from each other by
745 	 * reading ACPI System Locality Information Table (SLIT) if it
746 	 * exists
747 	 */
748 	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
749 	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
750 	    &lgrp_plat_lat_stats);
751 	if (lgrp_plat_slit_error == 0)
752 		return;
753 
754 	/*
755 	 * Probe to determine latency between NUMA nodes when SLIT
756 	 * doesn't exist or make sense
757 	 */
758 	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
759 
760 	/*
761 	 * Specify whether to probe using vendor ID register or page copy
762 	 * if hasn't been specified already or is overspecified
763 	 */
764 	probe_op = lgrp_plat_probe_flags &
765 	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
766 
767 	if (probe_op == 0 ||
768 	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
769 		lgrp_plat_probe_flags &=
770 		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
771 		if (is_opteron())
772 			lgrp_plat_probe_flags |=
773 			    LGRP_PLAT_PROBE_VENDOR;
774 		else
775 			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
776 	}
777 
778 	/*
779 	 * Probing errors can mess up the lgroup topology and
780 	 * force us fall back to a 2 level lgroup topology.
781 	 * Here we bound how tall the lgroup topology can grow
782 	 * in hopes of avoiding any anamolies in probing from
783 	 * messing up the lgroup topology by limiting the
784 	 * accuracy of the latency topology.
785 	 *
786 	 * Assume that nodes will at least be configured in a
787 	 * ring, so limit height of lgroup topology to be less
788 	 * than number of nodes on a system with 4 or more
789 	 * nodes
790 	 */
791 	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
792 	    lgrp_topo_ht_limit_default())
793 		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
794 #endif	/* __xpv */
795 }
796 
797 
798 /*
799  * Return latency between "from" and "to" lgroups
800  *
801  * This latency number can only be used for relative comparison
802  * between lgroups on the running system, cannot be used across platforms,
803  * and may not reflect the actual latency.  It is platform and implementation
804  * specific, so platform gets to decide its value.  It would be nice if the
805  * number was at least proportional to make comparisons more meaningful though.
806  */
807 /* ARGSUSED */
808 int
809 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
810 {
811 	lgrp_handle_t	src, dest;
812 	int		node;
813 
814 	if (max_mem_nodes == 1)
815 		return (0);
816 
817 	/*
818 	 * Return max latency for root lgroup
819 	 */
820 	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
821 		return (lgrp_plat_lat_stats.latency_max);
822 
823 	src = from;
824 	dest = to;
825 
826 	/*
827 	 * Return 0 for nodes (lgroup platform handles) out of range
828 	 */
829 	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
830 		return (0);
831 
832 	/*
833 	 * Probe from current CPU if its lgroup latencies haven't been set yet
834 	 * and we are trying to get latency from current CPU to some node
835 	 */
836 	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
837 	ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
838 	if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src)
839 		lgrp_plat_probe();
840 
841 	return (lgrp_plat_lat_stats.latencies[src][dest]);
842 }
843 
844 
845 /*
846  * Platform-specific initialization
847  */
848 void
849 lgrp_plat_main_init(void)
850 {
851 	int	curnode;
852 	int	ht_limit;
853 	int	i;
854 
855 	/*
856 	 * Print a notice that MPO is disabled when memory is interleaved
857 	 * across nodes....Would do this when it is discovered, but can't
858 	 * because it happens way too early during boot....
859 	 */
860 	if (lgrp_plat_mem_intrlv)
861 		cmn_err(CE_NOTE,
862 		    "MPO disabled because memory is interleaved\n");
863 
864 	/*
865 	 * Don't bother to do any probing if it is disabled, there is only one
866 	 * node, or the height of the lgroup topology less than or equal to 2
867 	 */
868 	ht_limit = lgrp_topo_ht_limit();
869 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
870 	    max_mem_nodes == 1 || ht_limit <= 2) {
871 		/*
872 		 * Setup lgroup latencies for 2 level lgroup topology
873 		 * (ie. local and remote only) if they haven't been set yet
874 		 */
875 		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
876 		    lgrp_plat_lat_stats.latency_max == 0)
877 			lgrp_plat_2level_setup(lgrp_plat_node_memory,
878 			    &lgrp_plat_lat_stats);
879 		return;
880 	}
881 
882 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
883 		/*
884 		 * Should have been able to probe from CPU 0 when it was added
885 		 * to lgroup hierarchy, but may not have been able to then
886 		 * because it happens so early in boot that gethrtime() hasn't
887 		 * been initialized.  (:-(
888 		 */
889 		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
890 		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
891 		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
892 			lgrp_plat_probe();
893 
894 		return;
895 	}
896 
897 	/*
898 	 * When probing memory, use one page for every sample to determine
899 	 * lgroup topology and taking multiple samples
900 	 */
901 	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
902 		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
903 		    lgrp_plat_probe_nsamples;
904 
905 	/*
906 	 * Map memory in each node needed for probing to determine latency
907 	 * topology
908 	 */
909 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
910 		int	mnode;
911 
912 		/*
913 		 * Skip this node and leave its probe page NULL
914 		 * if it doesn't have any memory
915 		 */
916 		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
917 		if (!mem_node_config[mnode].exists) {
918 			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
919 			continue;
920 		}
921 
922 		/*
923 		 * Allocate one kernel virtual page
924 		 */
925 		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
926 		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
927 		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
928 			cmn_err(CE_WARN,
929 			    "lgrp_plat_main_init: couldn't allocate memory");
930 			return;
931 		}
932 
933 		/*
934 		 * Get PFN for first page in each node
935 		 */
936 		lgrp_plat_probe_mem_config.probe_pfn[i] =
937 		    mem_node_config[mnode].physbase;
938 
939 		/*
940 		 * Map virtual page to first page in node
941 		 */
942 		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
943 		    lgrp_plat_probe_mem_config.probe_memsize,
944 		    lgrp_plat_probe_mem_config.probe_pfn[i],
945 		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
946 		    HAT_LOAD_NOCONSIST);
947 	}
948 
949 	/*
950 	 * Probe from current CPU
951 	 */
952 	lgrp_plat_probe();
953 }
954 
955 
956 /*
957  * Return the maximum number of lgrps supported by the platform.
958  * Before lgrp topology is known it returns an estimate based on the number of
959  * nodes. Once topology is known it returns the actual maximim number of lgrps
960  * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and
961  * dynamic addition of new nodes, this number may not grow during system
962  * lifetime (yet).
963  */
964 int
965 lgrp_plat_max_lgrps(void)
966 {
967 	return (lgrp_topo_initialized ?
968 	    lgrp_alloc_max + 1 :
969 	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
970 }
971 
972 
973 /*
974  * Return the number of free pages in an lgroup.
975  *
976  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
977  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
978  * number of allocatable base pagesize pages corresponding to the
979  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
980  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
981  * memory installed, regardless of whether or not it's usable.
982  */
983 pgcnt_t
984 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
985 {
986 	int	mnode;
987 	pgcnt_t npgs = (pgcnt_t)0;
988 	extern struct memlist *phys_avail;
989 	extern struct memlist *phys_install;
990 
991 
992 	if (plathand == LGRP_DEFAULT_HANDLE)
993 		return (lgrp_plat_mem_size_default(plathand, query));
994 
995 	if (plathand != LGRP_NULL_HANDLE) {
996 		mnode = plat_lgrphand_to_mem_node(plathand);
997 		if (mnode >= 0 && mem_node_config[mnode].exists) {
998 			switch (query) {
999 			case LGRP_MEM_SIZE_FREE:
1000 				npgs = MNODE_PGCNT(mnode);
1001 				break;
1002 			case LGRP_MEM_SIZE_AVAIL:
1003 				npgs = mem_node_memlist_pages(mnode,
1004 				    phys_avail);
1005 				break;
1006 			case LGRP_MEM_SIZE_INSTALL:
1007 				npgs = mem_node_memlist_pages(mnode,
1008 				    phys_install);
1009 				break;
1010 			default:
1011 				break;
1012 			}
1013 		}
1014 	}
1015 	return (npgs);
1016 }
1017 
1018 
1019 /*
1020  * Return the platform handle of the lgroup that contains the physical memory
1021  * corresponding to the given page frame number
1022  */
1023 /* ARGSUSED */
1024 lgrp_handle_t
1025 lgrp_plat_pfn_to_hand(pfn_t pfn)
1026 {
1027 	int	mnode;
1028 
1029 	if (max_mem_nodes == 1)
1030 		return (LGRP_DEFAULT_HANDLE);
1031 
1032 	if (pfn > physmax)
1033 		return (LGRP_NULL_HANDLE);
1034 
1035 	mnode = plat_pfn_to_mem_node(pfn);
1036 	if (mnode < 0)
1037 		return (LGRP_NULL_HANDLE);
1038 
1039 	return (MEM_NODE_2_LGRPHAND(mnode));
1040 }
1041 
1042 
1043 /*
1044  * Probe memory in each node from current CPU to determine latency topology
1045  *
1046  * The probing code will probe the vendor ID register on the Northbridge of
1047  * Opteron processors and probe memory for other processors by default.
1048  *
1049  * Since probing is inherently error prone, the code takes laps across all the
1050  * nodes probing from each node to each of the other nodes some number of
1051  * times.  Furthermore, each node is probed some number of times before moving
1052  * onto the next one during each lap.  The minimum latency gotten between nodes
1053  * is kept as the latency between the nodes.
1054  *
1055  * After all that,  the probe times are adjusted by normalizing values that are
1056  * close to each other and local latencies are made the same.  Lastly, the
1057  * latencies are verified to make sure that certain conditions are met (eg.
1058  * local < remote, latency(a, b) == latency(b, a), etc.).
1059  *
1060  * If any of the conditions aren't met, the code will export a NUMA
1061  * configuration with the local CPUs and memory given by the SRAT or PCI config
1062  * space registers and one remote memory latency since it can't tell exactly
1063  * how far each node is from each other.
1064  */
1065 void
1066 lgrp_plat_probe(void)
1067 {
1068 	int				from;
1069 	int				i;
1070 	lgrp_plat_latency_stats_t	*lat_stats;
1071 	hrtime_t			probe_time;
1072 	int				to;
1073 
1074 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
1075 	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
1076 		return;
1077 
1078 	/*
1079 	 * Determine ID of node containing current CPU
1080 	 */
1081 	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
1082 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
1083 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
1084 		ASSERT(lgrp_plat_node_domain[from].exists);
1085 
1086 	/*
1087 	 * Don't need to probe if got times already
1088 	 */
1089 	lat_stats = &lgrp_plat_lat_stats;
1090 	if (lat_stats->latencies[from][from] != 0)
1091 		return;
1092 
1093 	/*
1094 	 * Read vendor ID in Northbridge or read and write page(s)
1095 	 * in each node from current CPU and remember how long it takes,
1096 	 * so we can build latency topology of machine later.
1097 	 * This should approximate the memory latency between each node.
1098 	 */
1099 	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
1100 		for (to = 0; to < lgrp_plat_node_cnt; to++) {
1101 			/*
1102 			 * Get probe time and bail out if can't get it yet
1103 			 */
1104 			probe_time = lgrp_plat_probe_time(to,
1105 			    lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config,
1106 			    &lgrp_plat_lat_stats, &lgrp_plat_probe_stats);
1107 			if (probe_time == 0)
1108 				return;
1109 
1110 			/*
1111 			 * Keep lowest probe time as latency between nodes
1112 			 */
1113 			if (lat_stats->latencies[from][to] == 0 ||
1114 			    probe_time < lat_stats->latencies[from][to])
1115 				lat_stats->latencies[from][to] = probe_time;
1116 
1117 			/*
1118 			 * Update overall minimum and maximum probe times
1119 			 * across all nodes
1120 			 */
1121 			if (probe_time < lat_stats->latency_min ||
1122 			    lat_stats->latency_min == -1)
1123 				lat_stats->latency_min = probe_time;
1124 			if (probe_time > lat_stats->latency_max)
1125 				lat_stats->latency_max = probe_time;
1126 		}
1127 	}
1128 
1129 	/*
1130 	 * - Fix up latencies such that local latencies are same,
1131 	 *   latency(i, j) == latency(j, i), etc. (if possible)
1132 	 *
1133 	 * - Verify that latencies look ok
1134 	 *
1135 	 * - Fallback to just optimizing for local and remote if
1136 	 *   latencies didn't look right
1137 	 */
1138 	lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats,
1139 	    &lgrp_plat_probe_stats);
1140 	lgrp_plat_probe_stats.probe_error_code =
1141 	    lgrp_plat_latency_verify(lgrp_plat_node_memory,
1142 	    &lgrp_plat_lat_stats);
1143 	if (lgrp_plat_probe_stats.probe_error_code)
1144 		lgrp_plat_2level_setup(lgrp_plat_node_memory,
1145 		    &lgrp_plat_lat_stats);
1146 }
1147 
1148 
1149 /*
1150  * Return platform handle for root lgroup
1151  */
1152 lgrp_handle_t
1153 lgrp_plat_root_hand(void)
1154 {
1155 	return (LGRP_DEFAULT_HANDLE);
1156 }
1157 
1158 
1159 /*
1160  * INTERNAL ROUTINES
1161  */
1162 
1163 
1164 /*
1165  * Update CPU to node mapping for given CPU and proximity domain (and returns
1166  * negative numbers for errors and positive ones for success)
1167  */
1168 static int
1169 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
1170     cpu_node_map_t *cpu_node, uint32_t apicid, uint32_t domain)
1171 {
1172 	uint_t	i;
1173 	uint_t	start;
1174 	int	node;
1175 
1176 	/*
1177 	 * Get node number for proximity domain
1178 	 */
1179 	node = lgrp_plat_domain_to_node(node_domain, domain);
1180 	if (node == -1) {
1181 		node = lgrp_plat_node_domain_update(node_domain, domain);
1182 		if (node == -1)
1183 			return (-1);
1184 	}
1185 
1186 	/*
1187 	 * Hash given CPU APIC ID into CPU to node mapping table/array and
1188 	 * enter it and its corresponding node and proximity domain IDs into
1189 	 * first non-existent or matching entry
1190 	 */
1191 	i = start = CPU_NODE_HASH(apicid);
1192 	do {
1193 		if (cpu_node[i].exists) {
1194 			/*
1195 			 * Update already existing entry for CPU
1196 			 */
1197 			if (cpu_node[i].apicid == apicid) {
1198 				/*
1199 				 * Just return when everything same
1200 				 */
1201 				if (cpu_node[i].prox_domain == domain &&
1202 				    cpu_node[i].node == node)
1203 					return (1);
1204 
1205 				/*
1206 				 * Assert that proximity domain and node IDs
1207 				 * should be same and return error on non-debug
1208 				 * kernel
1209 				 */
1210 				ASSERT(cpu_node[i].prox_domain == domain &&
1211 				    cpu_node[i].node == node);
1212 				return (-1);
1213 			}
1214 		} else {
1215 			/*
1216 			 * Create new entry for CPU
1217 			 */
1218 			cpu_node[i].exists = 1;
1219 			cpu_node[i].apicid = apicid;
1220 			cpu_node[i].prox_domain = domain;
1221 			cpu_node[i].node = node;
1222 			return (0);
1223 		}
1224 		i = CPU_NODE_HASH(i + 1);
1225 	} while (i != start);
1226 
1227 	/*
1228 	 * Ran out of supported number of entries which shouldn't happen....
1229 	 */
1230 	ASSERT(i != start);
1231 	return (-1);
1232 }
1233 
1234 
1235 /*
1236  * Get node ID for given CPU ID
1237  */
1238 static int
1239 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node)
1240 {
1241 	uint32_t	apicid;
1242 	uint_t		i;
1243 	uint_t		start;
1244 
1245 	if (cp == NULL)
1246 		return (-1);
1247 
1248 	/*
1249 	 * SRAT doesn't exist, isn't enabled, or there was an error processing
1250 	 * it, so return chip ID for Opteron and -1 otherwise.
1251 	 */
1252 	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
1253 	    lgrp_plat_srat_error) {
1254 		if (is_opteron())
1255 			return (pg_plat_hw_instance_id(cp, PGHW_CHIP));
1256 		return (-1);
1257 	}
1258 
1259 	/*
1260 	 * SRAT does exist, so get APIC ID for given CPU and map that to its
1261 	 * node ID
1262 	 */
1263 	apicid = cpuid_get_apicid(cp);
1264 	i = start = CPU_NODE_HASH(apicid);
1265 	do {
1266 		if (cpu_node[i].apicid == apicid && cpu_node[i].exists)
1267 			return (cpu_node[i].node);
1268 		i = CPU_NODE_HASH(i + 1);
1269 	} while (i != start);
1270 	return (-1);
1271 }
1272 
1273 
1274 /*
1275  * Return node number for given proximity domain/system locality
1276  */
1277 static int
1278 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, uint32_t domain)
1279 {
1280 	uint_t	node;
1281 	uint_t	start;
1282 
1283 	/*
1284 	 * Hash proximity domain ID into node to domain mapping table (array),
1285 	 * search for entry with matching proximity domain ID, and return index
1286 	 * of matching entry as node ID.
1287 	 */
1288 	node = start = NODE_DOMAIN_HASH(domain);
1289 	do {
1290 		if (node_domain[node].prox_domain == domain &&
1291 		    node_domain[node].exists)
1292 			return (node);
1293 		node = NODE_DOMAIN_HASH(node + 1);
1294 	} while (node != start);
1295 	return (-1);
1296 }
1297 
1298 
/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same (ie. within 1/16 with the default shift of 4)
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

/*
 * Shift count actually used by lgrp_plat_latency_adjust(), which shifts a
 * latency right by this amount to get the tolerance for that latency.
 * NOTE(review): presumably kept as a variable rather than using the macro
 * directly so that it can be tuned -- confirm.
 */
int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
1306 
1307 
1308 /*
1309  * Adjust latencies between nodes to be symmetric, normalize latencies between
1310  * any nodes that are within some tolerance to be same, and make local
1311  * latencies be same
1312  */
1313 static void
1314 lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
1315     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1316 {
1317 	int				i;
1318 	int				j;
1319 	int				k;
1320 	int				l;
1321 	u_longlong_t			max;
1322 	u_longlong_t			min;
1323 	u_longlong_t			t;
1324 	u_longlong_t			t1;
1325 	u_longlong_t			t2;
1326 	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
1327 	int				lat_corrected[MAX_NODES][MAX_NODES];
1328 
1329 	/*
1330 	 * Nothing to do when this is an UMA machine or don't have args needed
1331 	 */
1332 	if (max_mem_nodes == 1)
1333 		return;
1334 
1335 	ASSERT(node_memory != NULL && lat_stats != NULL &&
1336 	    probe_stats != NULL);
1337 
1338 	/*
1339 	 * Make sure that latencies are symmetric between any two nodes
1340 	 * (ie. latency(node0, node1) == latency(node1, node0))
1341 	 */
1342 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1343 		if (!node_memory[i].exists)
1344 			continue;
1345 
1346 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1347 			if (!node_memory[j].exists)
1348 				continue;
1349 
1350 			t1 = lat_stats->latencies[i][j];
1351 			t2 = lat_stats->latencies[j][i];
1352 
1353 			if (t1 == 0 || t2 == 0 || t1 == t2)
1354 				continue;
1355 
1356 			/*
1357 			 * Latencies should be same
1358 			 * - Use minimum of two latencies which should be same
1359 			 * - Track suspect probe times not within tolerance of
1360 			 *   min value
1361 			 * - Remember how much values are corrected by
1362 			 */
1363 			if (t1 > t2) {
1364 				t = t2;
1365 				probe_stats->probe_errors[i][j] += t1 - t2;
1366 				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
1367 					probe_stats->probe_suspect[i][j]++;
1368 					probe_stats->probe_suspect[j][i]++;
1369 				}
1370 			} else if (t2 > t1) {
1371 				t = t1;
1372 				probe_stats->probe_errors[j][i] += t2 - t1;
1373 				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
1374 					probe_stats->probe_suspect[i][j]++;
1375 					probe_stats->probe_suspect[j][i]++;
1376 				}
1377 			}
1378 
1379 			lat_stats->latencies[i][j] =
1380 			    lat_stats->latencies[j][i] = t;
1381 			lgrp_config(cflag, t1, t);
1382 			lgrp_config(cflag, t2, t);
1383 		}
1384 	}
1385 
1386 	/*
1387 	 * Keep track of which latencies get corrected
1388 	 */
1389 	for (i = 0; i < MAX_NODES; i++)
1390 		for (j = 0; j < MAX_NODES; j++)
1391 			lat_corrected[i][j] = 0;
1392 
1393 	/*
1394 	 * For every two nodes, see whether there is another pair of nodes which
1395 	 * are about the same distance apart and make the latencies be the same
1396 	 * if they are close enough together
1397 	 */
1398 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1399 		if (!node_memory[i].exists)
1400 			continue;
1401 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1402 			if (!node_memory[j].exists)
1403 				continue;
1404 			/*
1405 			 * Pick one pair of nodes (i, j)
1406 			 * and get latency between them
1407 			 */
1408 			t1 = lat_stats->latencies[i][j];
1409 
1410 			/*
1411 			 * Skip this pair of nodes if there isn't a latency
1412 			 * for it yet
1413 			 */
1414 			if (t1 == 0)
1415 				continue;
1416 
1417 			for (k = 0; k < lgrp_plat_node_cnt; k++) {
1418 				if (!node_memory[k].exists)
1419 					continue;
1420 				for (l = 0; l < lgrp_plat_node_cnt; l++) {
1421 					if (!node_memory[l].exists)
1422 						continue;
1423 					/*
1424 					 * Pick another pair of nodes (k, l)
1425 					 * not same as (i, j) and get latency
1426 					 * between them
1427 					 */
1428 					if (k == i && l == j)
1429 						continue;
1430 
1431 					t2 = lat_stats->latencies[k][l];
1432 
1433 					/*
1434 					 * Skip this pair of nodes if there
1435 					 * isn't a latency for it yet
1436 					 */
1437 
1438 					if (t2 == 0)
1439 						continue;
1440 
1441 					/*
1442 					 * Skip nodes (k, l) if they already
1443 					 * have same latency as (i, j) or
1444 					 * their latency isn't close enough to
1445 					 * be considered/made the same
1446 					 */
1447 					if (t1 == t2 || (t1 > t2 && t1 - t2 >
1448 					    t1 >> lgrp_plat_probe_lt_shift) ||
1449 					    (t2 > t1 && t2 - t1 >
1450 					    t2 >> lgrp_plat_probe_lt_shift))
1451 						continue;
1452 
1453 					/*
1454 					 * Make latency(i, j) same as
1455 					 * latency(k, l), try to use latency
1456 					 * that has been adjusted already to get
1457 					 * more consistency (if possible), and
1458 					 * remember which latencies were
1459 					 * adjusted for next time
1460 					 */
1461 					if (lat_corrected[i][j]) {
1462 						t = t1;
1463 						lgrp_config(cflag, t2, t);
1464 						t2 = t;
1465 					} else if (lat_corrected[k][l]) {
1466 						t = t2;
1467 						lgrp_config(cflag, t1, t);
1468 						t1 = t;
1469 					} else {
1470 						if (t1 > t2)
1471 							t = t2;
1472 						else
1473 							t = t1;
1474 						lgrp_config(cflag, t1, t);
1475 						lgrp_config(cflag, t2, t);
1476 						t1 = t2 = t;
1477 					}
1478 
1479 					lat_stats->latencies[i][j] =
1480 					    lat_stats->latencies[k][l] = t;
1481 
1482 					lat_corrected[i][j] =
1483 					    lat_corrected[k][l] = 1;
1484 				}
1485 			}
1486 		}
1487 	}
1488 
1489 	/*
1490 	 * Local latencies should be same
1491 	 * - Find min and max local latencies
1492 	 * - Make all local latencies be minimum
1493 	 */
1494 	min = -1;
1495 	max = 0;
1496 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1497 		if (!node_memory[i].exists)
1498 			continue;
1499 		t = lat_stats->latencies[i][i];
1500 		if (t == 0)
1501 			continue;
1502 		if (min == -1 || t < min)
1503 			min = t;
1504 		if (t > max)
1505 			max = t;
1506 	}
1507 	if (min != max) {
1508 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1509 			int	local;
1510 
1511 			if (!node_memory[i].exists)
1512 				continue;
1513 
1514 			local = lat_stats->latencies[i][i];
1515 			if (local == 0)
1516 				continue;
1517 
1518 			/*
1519 			 * Track suspect probe times that aren't within
1520 			 * tolerance of minimum local latency and how much
1521 			 * probe times are corrected by
1522 			 */
1523 			if (local - min > min >> lgrp_plat_probe_lt_shift)
1524 				probe_stats->probe_suspect[i][i]++;
1525 
1526 			probe_stats->probe_errors[i][i] += local - min;
1527 
1528 			/*
1529 			 * Make local latencies be minimum
1530 			 */
1531 			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
1532 			lat_stats->latencies[i][i] = min;
1533 		}
1534 	}
1535 
1536 	/*
1537 	 * Determine max probe time again since just adjusted latencies
1538 	 */
1539 	lat_stats->latency_max = 0;
1540 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1541 		if (!node_memory[i].exists)
1542 			continue;
1543 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1544 			if (!node_memory[j].exists)
1545 				continue;
1546 			t = lat_stats->latencies[i][j];
1547 			if (t > lat_stats->latency_max)
1548 				lat_stats->latency_max = t;
1549 		}
1550 	}
1551 }
1552 
1553 
1554 /*
1555  * Verify following about latencies between nodes:
1556  *
1557  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
1558  * - Local latencies same
1559  * - Local < remote
1560  * - Number of latencies seen is reasonable
1561  * - Number of occurrences of a given latency should be more than 1
1562  *
1563  * Returns:
1564  *	0	Success
1565  *	-1	Not symmetric
1566  *	-2	Local latencies not same
1567  *	-3	Local >= remote
1568  */
1569 static int
1570 lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
1571     lgrp_plat_latency_stats_t *lat_stats)
1572 {
1573 	int				i;
1574 	int				j;
1575 	u_longlong_t			t1;
1576 	u_longlong_t			t2;
1577 
1578 	ASSERT(node_memory != NULL && lat_stats != NULL);
1579 
1580 	/*
1581 	 * Nothing to do when this is an UMA machine, lgroup topology is
1582 	 * limited to 2 levels, or there aren't any probe times yet
1583 	 */
1584 	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
1585 	    lat_stats->latencies[0][0] == 0)
1586 		return (0);
1587 
1588 	/*
1589 	 * Make sure that latencies are symmetric between any two nodes
1590 	 * (ie. latency(node0, node1) == latency(node1, node0))
1591 	 */
1592 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1593 		if (!node_memory[i].exists)
1594 			continue;
1595 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1596 			if (!node_memory[j].exists)
1597 				continue;
1598 			t1 = lat_stats->latencies[i][j];
1599 			t2 = lat_stats->latencies[j][i];
1600 
1601 			if (t1 == 0 || t2 == 0 || t1 == t2)
1602 				continue;
1603 
1604 			return (-1);
1605 		}
1606 	}
1607 
1608 	/*
1609 	 * Local latencies should be same
1610 	 */
1611 	t1 = lat_stats->latencies[0][0];
1612 	for (i = 1; i < lgrp_plat_node_cnt; i++) {
1613 		if (!node_memory[i].exists)
1614 			continue;
1615 
1616 		t2 = lat_stats->latencies[i][i];
1617 		if (t2 == 0)
1618 			continue;
1619 
1620 		if (t1 == 0) {
1621 			t1 = t2;
1622 			continue;
1623 		}
1624 
1625 		if (t1 != t2)
1626 			return (-2);
1627 	}
1628 
1629 	/*
1630 	 * Local latencies should be less than remote
1631 	 */
1632 	if (t1) {
1633 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1634 			if (!node_memory[i].exists)
1635 				continue;
1636 			for (j = 0; j < lgrp_plat_node_cnt; j++) {
1637 				if (!node_memory[j].exists)
1638 					continue;
1639 				t2 = lat_stats->latencies[i][j];
1640 				if (i == j || t2 == 0)
1641 					continue;
1642 
1643 				if (t1 >= t2)
1644 					return (-3);
1645 			}
1646 		}
1647 	}
1648 
1649 	return (0);
1650 }
1651 
1652 
1653 /*
1654  * Return the number of free, allocatable, or installed
1655  * pages in an lgroup
1656  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
1657  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
1658  */
1659 /* ARGSUSED */
1660 static pgcnt_t
1661 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
1662 {
1663 	struct memlist *mlist;
1664 	pgcnt_t npgs = 0;
1665 	extern struct memlist *phys_avail;
1666 	extern struct memlist *phys_install;
1667 
1668 	switch (query) {
1669 	case LGRP_MEM_SIZE_FREE:
1670 		return ((pgcnt_t)freemem);
1671 	case LGRP_MEM_SIZE_AVAIL:
1672 		memlist_read_lock();
1673 		for (mlist = phys_avail; mlist; mlist = mlist->next)
1674 			npgs += btop(mlist->size);
1675 		memlist_read_unlock();
1676 		return (npgs);
1677 	case LGRP_MEM_SIZE_INSTALL:
1678 		memlist_read_lock();
1679 		for (mlist = phys_install; mlist; mlist = mlist->next)
1680 			npgs += btop(mlist->size);
1681 		memlist_read_unlock();
1682 		return (npgs);
1683 	default:
1684 		return ((pgcnt_t)0);
1685 	}
1686 }
1687 
1688 
1689 /*
1690  * Update node to proximity domain mappings for given domain and return node ID
1691  */
1692 static int
1693 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, uint32_t domain)
1694 {
1695 	uint_t	node;
1696 	uint_t	start;
1697 
1698 	/*
1699 	 * Hash proximity domain ID into node to domain mapping table (array)
1700 	 * and add entry for it into first non-existent or matching entry found
1701 	 */
1702 	node = start = NODE_DOMAIN_HASH(domain);
1703 	do {
1704 		/*
1705 		 * Entry doesn't exist yet, so create one for this proximity
1706 		 * domain and return node ID which is index into mapping table.
1707 		 */
1708 		if (!node_domain[node].exists) {
1709 			node_domain[node].exists = 1;
1710 			node_domain[node].prox_domain = domain;
1711 			return (node);
1712 		}
1713 
1714 		/*
1715 		 * Entry exists for this proximity domain already, so just
1716 		 * return node ID (index into table).
1717 		 */
1718 		if (node_domain[node].prox_domain == domain)
1719 			return (node);
1720 		node = NODE_DOMAIN_HASH(node + 1);
1721 	} while (node != start);
1722 
1723 	/*
1724 	 * Ran out of supported number of entries which shouldn't happen....
1725 	 */
1726 	ASSERT(node != start);
1727 	return (-1);
1728 }
1729 
1730 
1731 /*
1732  * Update node memory information for given proximity domain with specified
1733  * starting and ending physical address range (and return positive numbers for
1734  * success and negative ones for errors)
1735  */
1736 static int
1737 lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
1738     node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
1739     uint32_t domain)
1740 {
1741 	int	node;
1742 
1743 	/*
1744 	 * Get node number for proximity domain
1745 	 */
1746 	node = lgrp_plat_domain_to_node(node_domain, domain);
1747 	if (node == -1) {
1748 		node = lgrp_plat_node_domain_update(node_domain, domain);
1749 		if (node == -1)
1750 			return (-1);
1751 	}
1752 
1753 	/*
1754 	 * Create entry in table for node if it doesn't exist
1755 	 */
1756 	if (!node_memory[node].exists) {
1757 		node_memory[node].exists = 1;
1758 		node_memory[node].start = btop(start);
1759 		node_memory[node].end = btop(end);
1760 		node_memory[node].prox_domain = domain;
1761 		return (0);
1762 	}
1763 
1764 	/*
1765 	 * Entry already exists for this proximity domain
1766 	 *
1767 	 * There may be more than one SRAT memory entry for a domain, so we may
1768 	 * need to update existing start or end address for the node.
1769 	 */
1770 	if (node_memory[node].prox_domain == domain) {
1771 		if (btop(start) < node_memory[node].start)
1772 			node_memory[node].start = btop(start);
1773 		if (btop(end) > node_memory[node].end)
1774 			node_memory[node].end = btop(end);
1775 		return (1);
1776 	}
1777 	return (-2);
1778 }
1779 
1780 
1781 /*
1782  * Return time needed to probe from current CPU to memory in given node
1783  */
1784 static hrtime_t
1785 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
1786     lgrp_plat_probe_mem_config_t *probe_mem_config,
1787     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1788 {
1789 	caddr_t			buf;
1790 	hrtime_t		elapsed;
1791 	hrtime_t		end;
1792 	int			from;
1793 	int			i;
1794 	int			ipl;
1795 	hrtime_t		max;
1796 	hrtime_t		min;
1797 	hrtime_t		start;
1798 	extern int		use_sse_pagecopy;
1799 
1800 	/*
1801 	 * Determine ID of node containing current CPU
1802 	 */
1803 	from = lgrp_plat_cpu_to_node(CPU, cpu_node);
1804 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
1805 
1806 	/*
1807 	 * Do common work for probing main memory
1808 	 */
1809 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
1810 		/*
1811 		 * Skip probing any nodes without memory and
1812 		 * set probe time to 0
1813 		 */
1814 		if (probe_mem_config->probe_va[to] == NULL) {
1815 			lat_stats->latencies[from][to] = 0;
1816 			return (0);
1817 		}
1818 
1819 		/*
1820 		 * Invalidate caches once instead of once every sample
1821 		 * which should cut cost of probing by a lot
1822 		 */
1823 		probe_stats->flush_cost = gethrtime();
1824 		invalidate_cache();
1825 		probe_stats->flush_cost = gethrtime() -
1826 		    probe_stats->flush_cost;
1827 		probe_stats->probe_cost_total += probe_stats->flush_cost;
1828 	}
1829 
1830 	/*
1831 	 * Probe from current CPU to given memory using specified operation
1832 	 * and take specified number of samples
1833 	 */
1834 	max = 0;
1835 	min = -1;
1836 	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
1837 		probe_stats->probe_cost = gethrtime();
1838 
1839 		/*
1840 		 * Can't measure probe time if gethrtime() isn't working yet
1841 		 */
1842 		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
1843 			return (0);
1844 
1845 		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
1846 			/*
1847 			 * Measure how long it takes to read vendor ID from
1848 			 * Northbridge
1849 			 */
1850 			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
1851 		} else {
1852 			/*
1853 			 * Measure how long it takes to copy page
1854 			 * on top of itself
1855 			 */
1856 			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
1857 
1858 			kpreempt_disable();
1859 			ipl = splhigh();
1860 			start = gethrtime();
1861 			if (use_sse_pagecopy)
1862 				hwblkpagecopy(buf, buf);
1863 			else
1864 				bcopy(buf, buf, PAGESIZE);
1865 			end = gethrtime();
1866 			elapsed = end - start;
1867 			splx(ipl);
1868 			kpreempt_enable();
1869 		}
1870 
1871 		probe_stats->probe_cost = gethrtime() -
1872 		    probe_stats->probe_cost;
1873 		probe_stats->probe_cost_total += probe_stats->probe_cost;
1874 
1875 		if (min == -1 || elapsed < min)
1876 			min = elapsed;
1877 		if (elapsed > max)
1878 			max = elapsed;
1879 	}
1880 
1881 	/*
1882 	 * Update minimum and maximum probe times between
1883 	 * these two nodes
1884 	 */
1885 	if (min < probe_stats->probe_min[from][to] ||
1886 	    probe_stats->probe_min[from][to] == 0)
1887 		probe_stats->probe_min[from][to] = min;
1888 
1889 	if (max > probe_stats->probe_max[from][to])
1890 		probe_stats->probe_max[from][to] = max;
1891 
1892 	return (min);
1893 }
1894 
1895 
1896 /*
1897  * Read ACPI System Locality Information Table (SLIT) to determine how far each
1898  * NUMA node is from each other
1899  */
1900 static int
1901 lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
1902     node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats)
1903 {
1904 	int		i;
1905 	int		j;
1906 	int		localities;
1907 	hrtime_t	max;
1908 	hrtime_t	min;
1909 	int		retval;
1910 	uint8_t		*slit_entries;
1911 
1912 	if (tp == NULL || !lgrp_plat_slit_enable)
1913 		return (1);
1914 
1915 	if (lat_stats == NULL)
1916 		return (2);
1917 
1918 	localities = tp->number;
1919 	if (localities != node_cnt)
1920 		return (3);
1921 
1922 	min = lat_stats->latency_min;
1923 	max = lat_stats->latency_max;
1924 
1925 	/*
1926 	 * Fill in latency matrix based on SLIT entries
1927 	 */
1928 	slit_entries = tp->entry;
1929 	for (i = 0; i < localities; i++) {
1930 		for (j = 0; j < localities; j++) {
1931 			uint8_t	latency;
1932 
1933 			latency = slit_entries[(i * localities) + j];
1934 			lat_stats->latencies[i][j] = latency;
1935 			if (latency < min || min == -1)
1936 				min = latency;
1937 			if (latency > max)
1938 				max = latency;
1939 		}
1940 	}
1941 
1942 	/*
1943 	 * Verify that latencies/distances given in SLIT look reasonable
1944 	 */
1945 	retval = lgrp_plat_latency_verify(node_memory, lat_stats);
1946 
1947 	if (retval) {
1948 		/*
1949 		 * Reinitialize (zero) latency table since SLIT doesn't look
1950 		 * right
1951 		 */
1952 		for (i = 0; i < localities; i++) {
1953 			for (j = 0; j < localities; j++)
1954 				lat_stats->latencies[i][j] = 0;
1955 		}
1956 	} else {
1957 		/*
1958 		 * Update min and max latencies seen since SLIT looks valid
1959 		 */
1960 		lat_stats->latency_min = min;
1961 		lat_stats->latency_max = max;
1962 	}
1963 
1964 	return (retval);
1965 }
1966 
1967 
1968 /*
1969  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
1970  * and memory are local to each other in the same NUMA node
1971  */
1972 static int
1973 lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt,
1974     node_domain_map_t *node_domain, cpu_node_map_t *cpu_node,
1975     node_phys_addr_map_t *node_memory)
1976 {
1977 	struct srat_item	*srat_end;
1978 	int			i;
1979 	struct srat_item	*item;
1980 
1981 	if (tp == NULL || !lgrp_plat_srat_enable)
1982 		return (1);
1983 
1984 	/*
1985 	 * Determine number of nodes by counting number of proximity domains in
1986 	 * SRAT
1987 	 */
1988 	if (node_cnt) {
1989 		int	nodes;
1990 
1991 		nodes = lgrp_plat_srat_domains(tp);
1992 		if (nodes < 0) {
1993 			*node_cnt = 1;
1994 			return (2);
1995 		}
1996 		*node_cnt = nodes;
1997 	}
1998 
1999 	/*
2000 	 * Walk through SRAT, examining each CPU and memory entry to determine
2001 	 * which CPUs and memory belong to which node.
2002 	 */
2003 	item = tp->list;
2004 	srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2005 	while (item < srat_end) {
2006 		uint32_t	apic_id;
2007 		uint32_t	domain;
2008 		uint64_t	end;
2009 		uint64_t	length;
2010 		uint64_t	start;
2011 
2012 		switch (item->type) {
2013 		case SRAT_PROCESSOR:	/* CPU entry */
2014 			if (!(item->i.p.flags & SRAT_ENABLED) ||
2015 			    cpu_node == NULL)
2016 				break;
2017 
2018 			/*
2019 			 * Calculate domain (node) ID and fill in APIC ID to
2020 			 * domain/node mapping table
2021 			 */
2022 			domain = item->i.p.domain1;
2023 			for (i = 0; i < 3; i++) {
2024 				domain += item->i.p.domain2[i] <<
2025 				    ((i + 1) * 8);
2026 			}
2027 			apic_id = item->i.p.apic_id;
2028 
2029 			if (lgrp_plat_cpu_node_update(node_domain, cpu_node,
2030 			    apic_id, domain) < 0)
2031 				return (3);
2032 			break;
2033 
2034 		case SRAT_MEMORY:	/* memory entry */
2035 			if (!(item->i.m.flags & SRAT_ENABLED) ||
2036 			    node_memory == NULL)
2037 				break;
2038 
2039 			/*
2040 			 * Get domain (node) ID and fill in domain/node
2041 			 * to memory mapping table
2042 			 */
2043 			domain = item->i.m.domain;
2044 			start = item->i.m.base_addr;
2045 			length = item->i.m.len;
2046 			end = start + length - 1;
2047 
2048 			if (lgrp_plat_node_memory_update(node_domain,
2049 			    node_memory, start, end, domain) < 0)
2050 				return (4);
2051 			break;
2052 
2053 		default:
2054 			break;
2055 		}
2056 
2057 		item = (struct srat_item *)((uintptr_t)item + item->len);
2058 	}
2059 	return (0);
2060 }
2061 
2062 
2063 /*
2064  * Return number of proximity domains given in ACPI SRAT
2065  */
2066 static int
2067 lgrp_plat_srat_domains(struct srat *tp)
2068 {
2069 	int			domain_cnt;
2070 	struct srat_item	*end;
2071 	int			i;
2072 	struct srat_item	*item;
2073 	node_domain_map_t	node_domain[MAX_NODES];
2074 
2075 
2076 	if (tp == NULL || !lgrp_plat_srat_enable)
2077 		return (1);
2078 
2079 	/*
2080 	 * Walk through SRAT, examining each CPU and memory entry to determine
2081 	 * proximity domain ID for each.
2082 	 */
2083 	domain_cnt = 0;
2084 	item = tp->list;
2085 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2086 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
2087 	while (item < end) {
2088 		uint32_t	domain;
2089 		boolean_t	overflow;
2090 		uint_t		start;
2091 
2092 		switch (item->type) {
2093 		case SRAT_PROCESSOR:	/* CPU entry */
2094 			if (!(item->i.p.flags & SRAT_ENABLED))
2095 				break;
2096 			domain = item->i.p.domain1;
2097 			for (i = 0; i < 3; i++) {
2098 				domain += item->i.p.domain2[i] <<
2099 				    ((i + 1) * 8);
2100 			}
2101 			break;
2102 
2103 		case SRAT_MEMORY:	/* memory entry */
2104 			if (!(item->i.m.flags & SRAT_ENABLED))
2105 				break;
2106 			domain = item->i.m.domain;
2107 			break;
2108 
2109 		default:
2110 			break;
2111 		}
2112 
2113 		/*
2114 		 * Count and keep track of which proximity domain IDs seen
2115 		 */
2116 		start = i = domain % MAX_NODES;
2117 		overflow = B_TRUE;
2118 		do {
2119 			/*
2120 			 * Create entry for proximity domain and increment
2121 			 * count when no entry exists where proximity domain
2122 			 * hashed
2123 			 */
2124 			if (!node_domain[i].exists) {
2125 				node_domain[i].exists = 1;
2126 				node_domain[i].prox_domain = domain;
2127 				domain_cnt++;
2128 				overflow = B_FALSE;
2129 				break;
2130 			}
2131 
2132 			/*
2133 			 * Nothing to do when proximity domain seen already
2134 			 * and its entry exists
2135 			 */
2136 			if (node_domain[i].prox_domain == domain) {
2137 				overflow = B_FALSE;
2138 				break;
2139 			}
2140 
2141 			/*
2142 			 * Entry exists where proximity domain hashed, but for
2143 			 * different proximity domain so keep search for empty
2144 			 * slot to put it or matching entry whichever comes
2145 			 * first.
2146 			 */
2147 			i = (i + 1) % MAX_NODES;
2148 		} while (i != start);
2149 
2150 		/*
2151 		 * Didn't find empty or matching entry which means have more
2152 		 * proximity domains than supported nodes (:-(
2153 		 */
2154 		ASSERT(overflow != B_TRUE);
2155 		if (overflow == B_TRUE)
2156 			return (-1);
2157 
2158 		item = (struct srat_item *)((uintptr_t)item + item->len);
2159 	}
2160 	return (domain_cnt);
2161 }
2162 
2163 
2164 /*
2165  * Set lgroup latencies for 2 level lgroup topology
2166  */
2167 static void
2168 lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
2169     lgrp_plat_latency_stats_t *lat_stats)
2170 {
2171 	int	i;
2172 
2173 	ASSERT(node_memory != NULL && lat_stats != NULL);
2174 
2175 	if (lgrp_plat_node_cnt >= 4)
2176 		cmn_err(CE_NOTE,
2177 		    "MPO only optimizing for local and remote\n");
2178 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2179 		int	j;
2180 
2181 		if (!node_memory[i].exists)
2182 			continue;
2183 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2184 			if (!node_memory[j].exists)
2185 				continue;
2186 			if (i == j)
2187 				lat_stats->latencies[i][j] = 2;
2188 			else
2189 				lat_stats->latencies[i][j] = 3;
2190 		}
2191 	}
2192 	lat_stats->latency_min = 2;
2193 	lat_stats->latency_max = 3;
2194 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
2195 }
2196 
2197 
2198 /*
2199  * The following Opteron specific constants, macros, types, and routines define
2200  * PCI configuration space registers and how to read them to determine the NUMA
2201  * configuration of *supported* Opteron processors.  They provide the same
2202  * information that may be gotten from the ACPI System Resource Affinity Table
2203  * (SRAT) if it exists on the machine of interest.
2204  *
2205  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
2206  * of interest describes all of these registers and their contents.  The main
2207  * registers used by this code to determine the NUMA configuration of the
2208  * machine are the node ID register for the number of NUMA nodes and the DRAM
2209  * address map registers for the physical address range of each node.
2210  *
2211  * NOTE: The format and how to determine the NUMA configuration using PCI
2212  *	 config space registers may change or may not be supported in future
2213  *	 Opteron processor families.
2214  */
2215 
/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 *
 * OPT_DRAMADDR_HI(reg) and OPT_DRAMADDR_LO(reg) each widen the raw register
 * value to 64 bits, keep only its address field, and shift that field into
 * place.  OPT_DRAMADDR(high, low) combines the two into one address.
 * The register argument is parenthesized so that an expression argument is
 * widened and masked as a whole.
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
2255 
2256 /*
2257  * Opteron Node ID register in PCI configuration space contains
2258  * number of nodes in system, etc. for Opteron K8.  The following
2259  * constants and macros define its contents, structure, and access.
2260  */
2261 
2262 /*
2263  * Bit masks defining what's in Opteron Node ID register
2264  */
2265 #define	OPT_NODE_MASK_ID	0x7	/* node ID */
2266 #define	OPT_NODE_MASK_CNT	0x70	/* node count */
2267 #define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
2268 #define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
2269 #define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */
2270 
2271 /*
2272  * How many bits in Opteron Node ID register to shift right to get actual value
2273  */
2274 #define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */
2275 
2276 /*
2277  * Macros to get values from Opteron Node ID register
2278  */
2279 #define	OPT_NODE_CNT(reg) \
2280 	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
2281 
2282 /*
2283  * Macro to setup PCI Extended Configuration Space (ECS) address to give to
2284  * "in/out" instructions
2285  *
2286  * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
2287  *	 other uses should just do MMIO to access PCI ECS.
2288  *	 Must enable special bit in Northbridge Configuration Register on
2289  *	 Greyhound for extended CF8 space access to be able to access PCI ECS
2290  *	 using "in/out" instructions and restore special bit after done
2291  *	 accessing PCI ECS.
2292  */
2293 #define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
2294 	(PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11)  | \
2295 	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
2296 	    ((((reg) >> 8) & 0xf) << 24))
2297 
2298 /*
2299  * PCI configuration space registers accessed by specifying
2300  * a bus, device, function, and offset.  The following constants
2301  * define the values needed to access Opteron K8 configuration
2302  * info to determine its node topology
2303  */
2304 
2305 #define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */
2306 
2307 /*
2308  * Opteron PCI configuration space register function values
2309  */
2310 #define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
2311 #define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
2312 #define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
2313 #define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */
2314 
2315 /*
2316  * PCI Configuration Space register offsets
2317  */
2318 #define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
2319 #define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
2320 #define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
2321 #define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */
2322 
2323 /*
2324  * Opteron PCI Configuration Space device IDs for nodes
2325  */
2326 #define	OPT_PCS_DEV_NODE0		24	/* device number for node 0 */
2327 
2328 
2329 /*
2330  * Opteron DRAM address map gives base and limit for physical memory in a node
2331  */
2332 typedef	struct opt_dram_addr_map {
2333 	uint32_t	base_hi;
2334 	uint32_t	base_lo;
2335 	uint32_t	limit_hi;
2336 	uint32_t	limit_lo;
2337 } opt_dram_addr_map_t;
2338 
2339 
2340 /*
2341  * Supported AMD processor families
2342  */
2343 #define	AMD_FAMILY_HAMMER	15
2344 #define	AMD_FAMILY_GREYHOUND	16
2345 
2346 /*
2347  * Whether to have is_opteron() return 1 even when processor isn't supported
2348  */
2349 uint_t	is_opteron_override = 0;
2350 
2351 /*
2352  * AMD processor family for current CPU
2353  */
2354 uint_t	opt_family = 0;
2355 
2356 
2357 /*
2358  * Determine whether we're running on a supported AMD Opteron since reading
2359  * node count and DRAM address map registers may have different format or
2360  * may not be supported across processor families
2361  */
2362 static int
2363 is_opteron(void)
2364 {
2365 
2366 	if (x86_vendor != X86_VENDOR_AMD)
2367 		return (0);
2368 
2369 	opt_family = cpuid_getfamily(CPU);
2370 	if (opt_family == AMD_FAMILY_HAMMER ||
2371 	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
2372 		return (1);
2373 	else
2374 		return (0);
2375 }
2376 
2377 
2378 /*
2379  * Determine NUMA configuration for Opteron from registers that live in PCI
2380  * configuration space
2381  */
2382 static void
2383 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
2384     node_phys_addr_map_t *node_memory)
2385 {
2386 	uint_t				bus;
2387 	uint_t				dev;
2388 	struct opt_dram_addr_map	dram_map[MAX_NODES];
2389 	uint_t				node;
2390 	uint_t				node_info[MAX_NODES];
2391 	uint_t				off_hi;
2392 	uint_t				off_lo;
2393 	uint64_t			nb_cfg_reg;
2394 
2395 	/*
2396 	 * Read configuration registers from PCI configuration space to
2397 	 * determine node information, which memory is in each node, etc.
2398 	 *
2399 	 * Write to PCI configuration space address register to specify
2400 	 * which configuration register to read and read/write PCI
2401 	 * configuration space data register to get/set contents
2402 	 */
2403 	bus = OPT_PCS_BUS_CONFIG;
2404 	dev = OPT_PCS_DEV_NODE0;
2405 	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
2406 	off_lo = OPT_PCS_OFF_DRAMBASE_LO;
2407 
2408 	/*
2409 	 * Read node ID register for node 0 to get node count
2410 	 */
2411 	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
2412 	    OPT_PCS_OFF_NODEID);
2413 	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
2414 
2415 	/*
2416 	 * If number of nodes is more than maximum supported, then set node
2417 	 * count to 1 and treat system as UMA instead of NUMA.
2418 	 */
2419 	if (*node_cnt > MAX_NODES) {
2420 		*node_cnt = 1;
2421 		return;
2422 	}
2423 
2424 	/*
2425 	 * For Greyhound, PCI Extended Configuration Space must be enabled to
2426 	 * read high DRAM address map base and limit registers
2427 	 */
2428 	if (opt_family == AMD_FAMILY_GREYHOUND) {
2429 		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
2430 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
2431 			wrmsr(MSR_AMD_NB_CFG,
2432 			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
2433 	}
2434 
2435 	for (node = 0; node < *node_cnt; node++) {
2436 		uint32_t	base_hi;
2437 		uint32_t	base_lo;
2438 		uint32_t	limit_hi;
2439 		uint32_t	limit_lo;
2440 
2441 		/*
2442 		 * Read node ID register (except for node 0 which we just read)
2443 		 */
2444 		if (node > 0) {
2445 			node_info[node] = pci_getl_func(bus, dev,
2446 			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
2447 		}
2448 
2449 		/*
2450 		 * Read DRAM base and limit registers which specify
2451 		 * physical memory range of each node
2452 		 */
2453 		if (opt_family != AMD_FAMILY_GREYHOUND)
2454 			base_hi = 0;
2455 		else {
2456 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
2457 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
2458 			base_hi = dram_map[node].base_hi =
2459 			    inl(PCI_CONFDATA);
2460 		}
2461 		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
2462 		    OPT_PCS_FUNC_ADDRMAP, off_lo);
2463 
2464 		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
2465 		    mem_intrlv)
2466 			*mem_intrlv = *mem_intrlv + 1;
2467 
2468 		off_hi += 4;	/* high limit register offset */
2469 		if (opt_family != AMD_FAMILY_GREYHOUND)
2470 			limit_hi = 0;
2471 		else {
2472 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
2473 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
2474 			limit_hi = dram_map[node].limit_hi =
2475 			    inl(PCI_CONFDATA);
2476 		}
2477 
2478 		off_lo += 4;	/* low limit register offset */
2479 		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
2480 		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
2481 
2482 		/*
2483 		 * Increment device number to next node and register offsets
2484 		 * for DRAM base register of next node
2485 		 */
2486 		off_hi += 4;
2487 		off_lo += 4;
2488 		dev++;
2489 
2490 		/*
2491 		 * Both read and write enable bits must be enabled in DRAM
2492 		 * address map base register for physical memory to exist in
2493 		 * node
2494 		 */
2495 		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
2496 		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
2497 			/*
2498 			 * Mark node memory as non-existent and set start and
2499 			 * end addresses to be same in node_memory[]
2500 			 */
2501 			node_memory[node].exists = 0;
2502 			node_memory[node].start = node_memory[node].end =
2503 			    (pfn_t)-1;
2504 			continue;
2505 		}
2506 
2507 		/*
2508 		 * Mark node memory as existing and remember physical address
2509 		 * range of each node for use later
2510 		 */
2511 		node_memory[node].exists = 1;
2512 
2513 		node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
2514 
2515 		node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
2516 		    OPT_DRAMADDR_LO_MASK_OFF);
2517 	}
2518 
2519 	/*
2520 	 * Restore PCI Extended Configuration Space enable bit
2521 	 */
2522 	if (opt_family == AMD_FAMILY_GREYHOUND) {
2523 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
2524 			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
2525 	}
2526 }
2527 
2528 
2529 /*
2530  * Return average amount of time to read vendor ID register on Northbridge
2531  * N times on specified destination node from current CPU
2532  */
2533 static hrtime_t
2534 opt_probe_vendor(int dest_node, int nreads)
2535 {
2536 	int		cnt;
2537 	uint_t		dev;
2538 	/* LINTED: set but not used in function */
2539 	volatile uint_t	dev_vendor;
2540 	hrtime_t	elapsed;
2541 	hrtime_t	end;
2542 	int		ipl;
2543 	hrtime_t	start;
2544 
2545 	dev = OPT_PCS_DEV_NODE0 + dest_node;
2546 	kpreempt_disable();
2547 	ipl = spl8();
2548 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
2549 	    OPT_PCS_OFF_VENDOR));
2550 	start = gethrtime();
2551 	for (cnt = 0; cnt < nreads; cnt++)
2552 		dev_vendor = inl(PCI_CONFDATA);
2553 	end = gethrtime();
2554 	elapsed = (end - start) / nreads;
2555 	splx(ipl);
2556 	kpreempt_enable();
2557 	return (elapsed);
2558 }
2559