xref: /titanic_52/usr/src/uts/i86pc/os/lgrpplat.c (revision 5b9d3151a4426af9ad6ef2c2a178f13476b884b3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright (c) 2010, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 /*
31  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
32  * ================================================================
33  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
34  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
35  * one or more CPUs and some local memory.  The CPUs in each node can access
36  * the memory in the other nodes but at a higher latency than accessing their
37  * local memory.  Typically, a system with only one node has Uniform Memory
38  * Access (UMA), but it may be possible to have a one node system that has
39  * some global memory outside of the node which is higher latency.
40  *
41  * Module Description
42  * ------------------
43  * This module provides a platform interface for determining which CPUs and
44  * which memory (and how much) are in a NUMA node and how far each node is from
45  * each other.  The interface is used by the Virtual Memory (VM) system and the
46  * common lgroup framework.  The VM system uses the plat_*() routines to fill
47  * in its memory node (memnode) array with the physical address range spanned
48  * by each NUMA node to know which memory belongs to which node, so it can
49  * build and manage a physical page free list for each NUMA node and allocate
50  * local memory from each node as needed.  The common lgroup framework uses the
51  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
52  * to each node (leaf lgroup) and how far each node is from each other, so it
53  * can build the latency (lgroup) topology for the machine in order to optimize
54  * for locality.  Also, an lgroup platform handle instead of an lgroup is used
55  * in the interface with this module, so this module shouldn't need to know
56  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
57  * etc. are in each NUMA node, how far each node is from each other, and to use
58  * a unique lgroup platform handle to refer to each node through the interface.
59  *
60  * Determining NUMA Configuration
61  * ------------------------------
62  * By default, this module will try to determine the NUMA configuration of the
63  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
64  * Locality Information Table (SLIT).  The SRAT contains info to tell which
65  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
66  * is a matrix that gives the distance between each system locality (which is
67  * a NUMA node and should correspond to proximity domains in the SRAT).  For
68  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
69  * specification.
70  *
71  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
72  * examine registers in PCI configuration space to determine how many nodes are
73  * in the system and which CPUs and memory are in each node.  Probing memory
74  * from each CPU instead would be too expensive to do while booting the kernel.
75  *
76  * NOTE: Using these PCI configuration space registers to determine this
77  *       locality info is not guaranteed to work or be compatible across all
78  *	 Opteron processor families.
79  *
80  * If the SLIT does not exist or look right, the kernel will probe to determine
81  * the distance between nodes as long as the NUMA CPU and memory configuration
82  * has been determined (see lgrp_plat_probe() for details).
83  *
84  * Data Structures
85  * ---------------
86  * The main data structures used by this code are the following:
87  *
88  * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
89  *					CPU ID (only used for SRAT)
90  *
91  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
92  *					different nodes indexed by node ID
93  *
94  * - lgrp_plat_node_cnt			Number of NUMA nodes in system for
95  *					non-DR-capable systems,
96  *					maximum possible number of NUMA nodes
97  *					in system for DR capable systems.
98  *
99  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
100  *					table indexed by node ID (only used
101  *					for SRAT)
102  *
103  * - lgrp_plat_memnode_info[]		Table with physical address range for
104  *					each memory node indexed by memory node
105  *					ID
106  *
107  * The code is implemented to make the following always be true:
108  *
109  *	lgroup platform handle == node ID == memnode ID
110  *
111  * Moreover, it allows for the proximity domain ID to be equal to all of the
112  * above as long as the proximity domains IDs are numbered from 0 to <number of
113  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
114  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
115  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
116  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
117  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
118  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
119  * to node IDs.  However, the proximity domain IDs may not map to the
120  * equivalent node ID since we want to keep the node IDs numbered from 0 to
121  * <number of nodes - 1> to minimize cost of searching and potentially space.
122  *
123  * With the introduction of support of memory DR operations on x86 platforms,
124  * things get a little complicated. The addresses of hot-added memory may not
125  * be continuous with other memory connected to the same lgrp node. In other
126  * words, memory addresses may get interleaved among lgrp nodes after memory
127  * DR operations. To work around this limitation, we have extended the
128  * relationship between lgrp node and memory node from 1:1 map to 1:N map,
129  * that means there may be multiple memory nodes associated with a lgrp node
130  * after memory DR operations.
131  *
132  * To minimize the code changes to support memory DR operations, the
133  * following policies have been adopted.
134  * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
135  *    node ID and memnode ID is still kept as:
136  *	lgroup platform handle == node ID == memnode ID
137  * 2) For memory present at boot time on DR capable platforms, the relationship
138  *    is still kept as is.
139  *	lgroup platform handle == node ID == memnode ID
140  * 3) For hot-added memory, the relationship between lgrp ID and memnode ID has
141  *    been changed from 1:1 map to 1:N map. Memnode IDs [0 - lgrp_plat_node_cnt)
142  *    are reserved for memory present at boot time, and memnode IDs
143  *    [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate
144  *    memnode ID for hot-added memory.
145  * 4) All boot code having the assumption "node ID == memnode ID" can live as
146  *    is, that's because node ID is always equal to memnode ID at boot time.
147  * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and
148  *    lgrp_plat_mem_size() related logics have been enhanced to deal with
149  *    the 1:N map relationship.
150  * 6) The latency probing related logics, which have the assumption
151  *    "node ID == memnode ID" and may be called at run time, is disabled if
152  *    memory DR operation is enabled.
153  */
154 
155 
156 #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
157 #include <sys/atomic.h>
158 #include <sys/bootconf.h>
159 #include <sys/cmn_err.h>
160 #include <sys/controlregs.h>
161 #include <sys/cpupart.h>
162 #include <sys/cpuvar.h>
163 #include <sys/lgrp.h>
164 #include <sys/machsystm.h>
165 #include <sys/memlist.h>
166 #include <sys/memnode.h>
167 #include <sys/mman.h>
168 #include <sys/note.h>
169 #include <sys/pci_cfgspace.h>
170 #include <sys/pci_impl.h>
171 #include <sys/param.h>
172 #include <sys/pghw.h>
173 #include <sys/promif.h>		/* for prom_printf() */
174 #include <sys/sysmacros.h>
175 #include <sys/systm.h>
176 #include <sys/thread.h>
177 #include <sys/types.h>
178 #include <sys/var.h>
179 #include <sys/x86_archext.h>
180 #include <vm/hat_i86.h>
181 #include <vm/seg_kmem.h>
182 #include <vm/vm_dep.h>
183 
184 #include <sys/acpidev.h>
185 #include "acpi_fw.h"		/* for SRAT, SLIT and MSCT */
186 
187 
188 #define	MAX_NODES		8
189 #define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)
190 
191 /*
192  * Constants for configuring probing
193  */
194 #define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
195 #define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
196 #define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */
197 
198 /*
199  * Flags for probing
200  */
201 #define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
202 #define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
203 #define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
204 
/*
 * Hash proximity domain ID into node to domain mapping table "mod" number of
 * nodes to minimize span of entries used and try to have lowest numbered
 * proximity domain be node 0
 *
 * If no minimum proximity domain ID has been recorded yet
 * (lgrp_plat_prox_domain_min is still UINT32_MAX), the domain ID itself is
 * hashed; otherwise the offset of the domain ID from that minimum is hashed.
 *
 * Both macro arguments are fully parenthesized so that expression arguments
 * (e.g. "cnt + 1") are hashed as a unit.  node_cnt must be nonzero.
 */
#define	NODE_DOMAIN_HASH(domain, node_cnt) \
	((lgrp_plat_prox_domain_min == UINT32_MAX) ? \
	    (domain) % (node_cnt) : \
	    ((domain) - lgrp_plat_prox_domain_min) % (node_cnt))
213 
214 /*
215  * CPU to node ID mapping structure (only used with SRAT)
216  */
217 typedef	struct cpu_node_map {
218 	int		exists;		/* nonzero while entry describes a CPU */
219 	uint_t		node;		/* node ID the CPU is assigned to */
220 	uint32_t	apicid;		/* APIC ID recorded for the CPU */
221 	uint32_t	prox_domain;	/* proximity domain ID of the CPU */
222 } cpu_node_map_t;
223 
224 /*
225  * Latency statistics
226  */
227 typedef struct lgrp_plat_latency_stats {
228 	hrtime_t	latencies[MAX_NODES][MAX_NODES];
229 	hrtime_t	latency_max;
230 	hrtime_t	latency_min;
231 } lgrp_plat_latency_stats_t;
232 
233 /*
234  * Memory configuration for probing
235  */
236 typedef struct lgrp_plat_probe_mem_config {
237 	size_t	probe_memsize;		/* how much memory to probe per node */
238 	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
239 	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
240 } lgrp_plat_probe_mem_config_t;
241 
242 /*
243  * Statistics kept for probing
244  */
245 typedef struct lgrp_plat_probe_stats {
246 	hrtime_t	flush_cost;
247 	hrtime_t	probe_cost;
248 	hrtime_t	probe_cost_total;
249 	hrtime_t	probe_error_code;
250 	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
251 	int		probe_suspect[MAX_NODES][MAX_NODES];
252 	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
253 	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
254 } lgrp_plat_probe_stats_t;
255 
256 /*
257  * Node to proximity domain ID mapping structure (only used with SRAT)
258  */
259 typedef	struct node_domain_map {
260 	int		exists;		/* nonzero once entry is in use */
261 	uint32_t	prox_domain;	/* proximity domain ID for this node */
262 } node_domain_map_t;
263 
264 /*
265  * Node ID and starting and ending page for physical memory in memory node
266  */
267 typedef	struct memnode_phys_addr_map {
268 	pfn_t		start;		/* first page in memnode (inclusive) */
269 	pfn_t		end;		/* last page in memnode (inclusive) */
270 	int		exists;		/* nonzero once entry is in use */
271 	uint32_t	prox_domain;	/* presumably domain of this memory */
272 	uint32_t	device_id;	/* NOTE(review): meaning not shown here */
273 	uint_t		lgrphand;	/* lgroup platform handle of memnode */
274 } memnode_phys_addr_map_t;
275 
276 /*
277  * Number of CPUs for which we got APIC IDs
278  */
279 static int				lgrp_plat_apic_ncpus = 0;
280 
281 /*
282  * CPU to node ID mapping table (only used for SRAT) and its max number of
283  * entries
284  */
285 static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
286 static uint_t				lgrp_plat_cpu_node_nentries = 0;
287 
288 /*
289  * Latency statistics
290  */
291 lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;
292 
293 /*
294  * Whether memory is interleaved across nodes causing MPO to be disabled
295  */
296 static int				lgrp_plat_mem_intrlv = 0;
297 
298 /*
299  * Node ID to proximity domain ID mapping table (only used for SRAT)
300  */
301 static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];
302 
303 /*
304  * Physical address range for memory in each node
305  */
306 static memnode_phys_addr_map_t		lgrp_plat_memnode_info[MAX_MEM_NODES];
307 
308 /*
309  * Statistics gotten from probing
310  */
311 static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;
312 
313 /*
314  * Memory configuration for probing
315  */
316 static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;
317 
318 /*
319  * Lowest proximity domain ID seen in ACPI SRAT
320  */
321 static uint32_t				lgrp_plat_prox_domain_min = UINT32_MAX;
322 
323 /*
324  * Error code from processing ACPI SRAT
325  */
326 static int				lgrp_plat_srat_error = 0;
327 
328 /*
329  * Error code from processing ACPI SLIT
330  */
331 static int				lgrp_plat_slit_error = 0;
332 
333 /*
334  * Whether lgrp topology has been flattened to 2 levels.
335  */
336 static int				lgrp_plat_topo_flatten = 0;
337 
338 
339 /*
340  * Maximum memory node ID in use.
341  */
342 static uint_t				lgrp_plat_max_mem_node;
343 
344 /*
345  * Allocate lgroup array statically
346  */
347 static lgrp_t				lgrp_space[NLGRP];
348 static int				nlgrps_alloc;
349 
350 
351 /*
352  * Enable finding and using minimum proximity domain ID when hashing
353  */
354 int			lgrp_plat_domain_min_enable = 1;
355 
356 /*
357  * Maximum possible number of nodes in system
358  */
359 uint_t			lgrp_plat_node_cnt = 1;
360 
361 /*
362  * Enable sorting nodes in ascending order by starting physical address
363  */
364 int			lgrp_plat_node_sort_enable = 1;
365 
366 /*
367  * Configuration Parameters for Probing
368  * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
369  *				operation, etc.
370  * - lgrp_plat_probe_nrounds	How many rounds of probing to do
371  * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
372  *				node
373  * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
374  *				Northbridge for each probe
375  */
376 uint_t			lgrp_plat_probe_flags = 0;
377 int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
378 int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
379 int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
380 
381 /*
382  * Enable use of ACPI System Resource Affinity Table (SRAT), System
383  * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT)
384  */
385 int			lgrp_plat_srat_enable = 1;
386 int			lgrp_plat_slit_enable = 1;
387 int			lgrp_plat_msct_enable = 1;
388 
389 /*
390  * mnode_xwa: set to non-zero value to initiate workaround if large pages are
391  * found to be crossing memory node boundaries. The workaround will eliminate
392  * a base size page at the end of each memory node boundary to ensure that
393  * a large page with constituent pages that span more than 1 memory node
394  * can never be formed.
395  *
396  */
397 int	mnode_xwa = 1;
398 
399 /*
400  * Static array to hold lgroup statistics
401  */
402 struct lgrp_stats	lgrp_stats[NLGRP];
403 
404 
405 /*
406  * Forward declarations of platform interface routines
407  */
408 void		plat_build_mem_nodes(struct memlist *list);
409 
410 int		plat_mnode_xcheck(pfn_t pfncnt);
411 
412 lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);
413 
414 int		plat_pfn_to_mem_node(pfn_t pfn);
415 
416 /*
417  * Forward declarations of lgroup platform interface routines
418  */
419 lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);
420 
421 void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
422 
423 lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
424 
425 void		lgrp_plat_init(lgrp_init_stages_t stage);
426 
427 int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
428 
429 int		lgrp_plat_max_lgrps(void);
430 
431 pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
432     lgrp_mem_query_t query);
433 
434 lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);
435 
436 void		lgrp_plat_probe(void);
437 
438 lgrp_handle_t	lgrp_plat_root_hand(void);
439 
440 
441 /*
442  * Forward declarations of local routines
443  */
444 static int	is_opteron(void);
445 
446 static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
447     int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
448     uint32_t domain);
449 
450 static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
451     int cpu_node_nentries);
452 
453 static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
454     int node_cnt, uint32_t domain);
455 
456 static void	lgrp_plat_get_numa_config(void);
457 
458 static void	lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
459     lgrp_plat_latency_stats_t *lat_stats,
460     lgrp_plat_probe_stats_t *probe_stats);
461 
462 static int	lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
463     lgrp_plat_latency_stats_t *lat_stats);
464 
465 static void	lgrp_plat_main_init(void);
466 
467 static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
468 
469 static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
470     int node_cnt, uint32_t domain);
471 
472 static int	lgrp_plat_memnode_info_update(node_domain_map_t *node_domain,
473     int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt,
474     uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id);
475 
476 static void	lgrp_plat_node_sort(node_domain_map_t *node_domain,
477     int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
478     memnode_phys_addr_map_t *memnode_info);
479 
480 static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
481     int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
482     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
483 
484 static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
485 
486 static int	lgrp_plat_process_slit(struct slit *tp,
487     node_domain_map_t *node_domain, uint_t node_cnt,
488     memnode_phys_addr_map_t *memnode_info,
489     lgrp_plat_latency_stats_t *lat_stats);
490 
491 static int	lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info,
492     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
493     lgrp_plat_latency_stats_t *lat_stats);
494 
495 static int	lgrp_plat_process_srat(struct srat *tp, struct msct *mp,
496     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
497     cpu_node_map_t *cpu_node, int cpu_count,
498     memnode_phys_addr_map_t *memnode_info);
499 
500 static void	lgrp_plat_release_bootstrap(void);
501 
502 static int	lgrp_plat_srat_domains(struct srat *tp,
503     uint32_t *prox_domain_min);
504 
505 static int	lgrp_plat_msct_domains(struct msct *tp,
506     uint32_t *prox_domain_min);
507 
508 static void	lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats);
509 
510 static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
511     memnode_phys_addr_map_t *memnode_info);
512 
513 static hrtime_t	opt_probe_vendor(int dest_node, int nreads);
514 
515 
516 /*
517  * PLATFORM INTERFACE ROUTINES
518  */
519 
520 /*
521  * Configure memory nodes for machines with more than one node (ie NUMA)
522  */
523 void
524 plat_build_mem_nodes(struct memlist *list)
525 {
526 	pfn_t		cur_start;	/* start addr of subrange */
527 	pfn_t		cur_end;	/* end addr of subrange */
528 	pfn_t		start;		/* start addr of whole range */
529 	pfn_t		end;		/* end addr of whole range */
530 	pgcnt_t		endcnt;		/* pages to sacrifice */
531 
532 	/*
533 	 * Boot install lists are arranged <addr, len>, ...
534 	 */
535 	while (list) {
536 		int	node;
537 
538 		start = list->ml_address >> PAGESHIFT;
539 		end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;
540 
541 		if (start > physmax) {
542 			list = list->ml_next;
543 			continue;
544 		}
545 		if (end > physmax)
546 			end = physmax;
547 
548 		/*
549 		 * When there is only one memnode, just add memory to memnode
550 		 */
551 		if (max_mem_nodes == 1) {
552 			mem_node_add_slice(start, end);
553 			list = list->ml_next;
554 			continue;
555 		}
556 
557 		/*
558 		 * mem_node_add_slice() expects to get a memory range that
559 		 * is within one memnode, so need to split any memory range
560 		 * that spans multiple memnodes into subranges that are each
561 		 * contained within one memnode when feeding them to
562 		 * mem_node_add_slice()
563 		 */
564 		cur_start = start;
565 		do {
566 			node = plat_pfn_to_mem_node(cur_start);
567 
568 			/*
569 			 * Panic if DRAM address map registers or SRAT say
570 			 * memory in node doesn't exist or address from
571 			 * boot installed memory list entry isn't in this node.
572 			 * This shouldn't happen and rest of code can't deal
573 			 * with this if it does.
574 			 */
575 			if (node < 0 || node >= lgrp_plat_max_mem_node ||
576 			    !lgrp_plat_memnode_info[node].exists ||
577 			    cur_start < lgrp_plat_memnode_info[node].start ||
578 			    cur_start > lgrp_plat_memnode_info[node].end) {
579 				cmn_err(CE_PANIC, "Don't know which memnode "
580 				    "to add installed memory address 0x%lx\n",
581 				    cur_start);
582 			}
583 
584 			/*
585 			 * End of current subrange should not span memnodes
586 			 */
587 			cur_end = end;
588 			endcnt = 0;
589 			if (lgrp_plat_memnode_info[node].exists &&
590 			    cur_end > lgrp_plat_memnode_info[node].end) {
591 				cur_end = lgrp_plat_memnode_info[node].end;
592 				if (mnode_xwa > 1) {
593 					/*
594 					 * sacrifice the last page in each
595 					 * node to eliminate large pages
596 					 * that span more than 1 memory node.
597 					 */
598 					endcnt = 1;
599 					physinstalled--;
600 				}
601 			}
602 
603 			mem_node_add_slice(cur_start, cur_end - endcnt);
604 
605 			/*
606 			 * Next subrange starts after end of current one
607 			 */
608 			cur_start = cur_end + 1;
609 		} while (cur_end < end);
610 
611 		list = list->ml_next;
612 	}
613 	mem_node_physalign = 0;
614 	mem_node_pfn_shift = 0;
615 }
616 
617 
618 /*
619  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
620  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
621  * a crossing is found and returns 0 otherwise.
622  */
623 int
624 plat_mnode_xcheck(pfn_t pfncnt)
625 {
626 	int	node, prevnode = -1, basenode;
627 	pfn_t	ea, sa;
628 
629 	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
630 
631 		if (lgrp_plat_memnode_info[node].exists == 0)
632 			continue;
633 
634 		if (prevnode == -1) {
635 			prevnode = node;
636 			basenode = node;
637 			continue;
638 		}
639 
640 		/* assume x86 node pfn ranges are in increasing order */
641 		ASSERT(lgrp_plat_memnode_info[node].start >
642 		    lgrp_plat_memnode_info[prevnode].end);
643 
644 		/*
645 		 * continue if the starting address of node is not contiguous
646 		 * with the previous node.
647 		 */
648 
649 		if (lgrp_plat_memnode_info[node].start !=
650 		    (lgrp_plat_memnode_info[prevnode].end + 1)) {
651 			basenode = node;
652 			prevnode = node;
653 			continue;
654 		}
655 
656 		/* check if the starting address of node is pfncnt aligned */
657 		if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) {
658 
659 			/*
660 			 * at this point, node starts at an unaligned boundary
661 			 * and is contiguous with the previous node(s) to
662 			 * basenode. Check if there is an aligned contiguous
663 			 * range of length pfncnt that crosses this boundary.
664 			 */
665 
666 			sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end,
667 			    pfncnt);
668 			ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start),
669 			    pfncnt);
670 
671 			ASSERT((ea - sa) == pfncnt);
672 			if (sa >= lgrp_plat_memnode_info[basenode].start &&
673 			    ea <= (lgrp_plat_memnode_info[node].end + 1)) {
674 				/*
675 				 * large page found to cross mnode boundary.
676 				 * Return Failure if workaround not enabled.
677 				 */
678 				if (mnode_xwa == 0)
679 					return (1);
680 				mnode_xwa++;
681 			}
682 		}
683 		prevnode = node;
684 	}
685 	return (0);
686 }
687 
688 
689 lgrp_handle_t
690 plat_mem_node_to_lgrphand(int mnode)
691 {
692 	if (max_mem_nodes == 1)
693 		return (LGRP_DEFAULT_HANDLE);
694 
695 	ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node);
696 
697 	return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand));
698 }
699 
700 int
701 plat_pfn_to_mem_node(pfn_t pfn)
702 {
703 	int	node;
704 
705 	if (max_mem_nodes == 1)
706 		return (0);
707 
708 	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
709 		/*
710 		 * Skip nodes with no memory
711 		 */
712 		if (!lgrp_plat_memnode_info[node].exists)
713 			continue;
714 
715 		membar_consumer();
716 		if (pfn >= lgrp_plat_memnode_info[node].start &&
717 		    pfn <= lgrp_plat_memnode_info[node].end)
718 			return (node);
719 	}
720 
721 	/*
722 	 * Didn't find memnode where this PFN lives which should never happen
723 	 */
724 	ASSERT(node < lgrp_plat_max_mem_node);
725 	return (-1);
726 }
727 
728 
729 /*
730  * LGROUP PLATFORM INTERFACE ROUTINES
731  */
732 
733 /*
734  * Allocate additional space for an lgroup.
735  */
736 lgrp_t *
737 lgrp_plat_alloc(lgrp_id_t lgrpid)
738 {
739 	lgrp_t *lgrp;
740 
741 	lgrp = &lgrp_space[nlgrps_alloc++];
742 	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
743 		return (NULL);
744 	return (lgrp);
745 }
746 
747 
748 /*
749  * Platform handling for (re)configuration changes
750  *
751  * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug:
752  * 1) Use cpu_lock to synchronize between lgrp_plat_config() and
753  *    lgrp_plat_cpu_to_hand().
754  * 2) Disable latency probing logic by making sure that the flag
755  *    LGRP_PLAT_PROBE_ENABLE is cleared.
756  *
757  * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug:
758  * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal.
759  * 2) Only expansion to existing entries, no shrinking.
760  * 3) On writing side, DR framework ensures that lgrp_plat_config() is called
761  *    in single-threaded context. And membar_producer() is used to ensure that
762  *    all changes are visible to other CPUs before setting the "exists" flag.
763  * 4) On reading side, membar_consumer() after checking the "exists" flag
764  *    ensures that right values are retrieved.
765  *
766  * Mechanism to protect lgrp_plat_node_domain[] at hotplug:
767  * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal.
768  * 2) On writing side, it's single-threaded and membar_producer() is used to
769  *    ensure all changes are visible to other CPUs before setting the "exists"
770  *    flag.
771  * 3) On reading side, membar_consumer() after checking the "exists" flag
772  *    ensures that right values are retrieved.
773  */
774 void
775 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
776 {
777 #ifdef	__xpv
778 	_NOTE(ARGUNUSED(flag, arg));
779 #else
780 	int	rc, node;
781 	cpu_t	*cp;
782 	void	*hdl = NULL;
783 	uchar_t	*sliptr = NULL;
784 	uint32_t domain, apicid, slicnt = 0;
785 	update_membounds_t *mp;
786 
787 	extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *,
788 	    uint32_t *, uint32_t *, uchar_t **);
789 	extern void acpidev_dr_free_cpu_numa_info(void *);
790 
791 	/*
792 	 * This interface is used to support CPU/memory DR operations.
793 	 * Don't bother here if it's still during boot or only one lgrp node
794 	 * is supported.
795 	 */
796 	if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1)
797 		return;
798 
799 	switch (flag) {
800 	case LGRP_CONFIG_CPU_ADD:
801 		cp = (cpu_t *)arg;
802 		ASSERT(cp != NULL);
803 		ASSERT(MUTEX_HELD(&cpu_lock));
804 
805 		/* Check whether CPU already exists. */
806 		ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists);
807 		if (lgrp_plat_cpu_node[cp->cpu_id].exists) {
808 			cmn_err(CE_WARN,
809 			    "!lgrp: CPU(%d) already exists in cpu_node map.",
810 			    cp->cpu_id);
811 			break;
812 		}
813 
814 		/* Query CPU lgrp information. */
815 		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
816 		    &slicnt, &sliptr);
817 		ASSERT(rc == 0);
818 		if (rc != 0) {
819 			cmn_err(CE_WARN,
820 			    "!lgrp: failed to query lgrp info for CPU(%d).",
821 			    cp->cpu_id);
822 			break;
823 		}
824 
825 		/* Update node to proximity domain mapping */
826 		node = lgrp_plat_domain_to_node(lgrp_plat_node_domain,
827 		    lgrp_plat_node_cnt, domain);
828 		if (node == -1) {
829 			node = lgrp_plat_node_domain_update(
830 			    lgrp_plat_node_domain, lgrp_plat_node_cnt, domain);
831 			ASSERT(node != -1);
832 			if (node == -1) {
833 				acpidev_dr_free_cpu_numa_info(hdl);
834 				cmn_err(CE_WARN, "!lgrp: failed to update "
835 				    "node_domain map for domain(%u).", domain);
836 				break;
837 			}
838 		}
839 
840 		/* Update latency information among lgrps. */
841 		if (slicnt != 0 && sliptr != NULL) {
842 			if (lgrp_plat_process_sli(domain, sliptr, slicnt,
843 			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
844 			    &lgrp_plat_lat_stats) != 0) {
845 				cmn_err(CE_WARN, "!lgrp: failed to update "
846 				    "latency information for domain (%u).",
847 				    domain);
848 			}
849 		}
850 
851 		/* Update CPU to node mapping. */
852 		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain;
853 		lgrp_plat_cpu_node[cp->cpu_id].node = node;
854 		lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid;
855 		lgrp_plat_cpu_node[cp->cpu_id].exists = 1;
856 		lgrp_plat_apic_ncpus++;
857 
858 		acpidev_dr_free_cpu_numa_info(hdl);
859 		break;
860 
861 	case LGRP_CONFIG_CPU_DEL:
862 		cp = (cpu_t *)arg;
863 		ASSERT(cp != NULL);
864 		ASSERT(MUTEX_HELD(&cpu_lock));
865 
866 		/* Check whether CPU exists. */
867 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists);
868 		if (!lgrp_plat_cpu_node[cp->cpu_id].exists) {
869 			cmn_err(CE_WARN,
870 			    "!lgrp: CPU(%d) doesn't exist in cpu_node map.",
871 			    cp->cpu_id);
872 			break;
873 		}
874 
875 		/* Query CPU lgrp information. */
876 		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
877 		    NULL, NULL);
878 		ASSERT(rc == 0);
879 		if (rc != 0) {
880 			cmn_err(CE_WARN,
881 			    "!lgrp: failed to query lgrp info for CPU(%d).",
882 			    cp->cpu_id);
883 			break;
884 		}
885 
886 		/* Update map. */
887 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid);
888 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain);
889 		lgrp_plat_cpu_node[cp->cpu_id].exists = 0;
890 		lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX;
891 		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX;
892 		lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX;
893 		lgrp_plat_apic_ncpus--;
894 
895 		acpidev_dr_free_cpu_numa_info(hdl);
896 		break;
897 
898 	case LGRP_CONFIG_MEM_ADD:
899 		mp = (update_membounds_t *)arg;
900 		ASSERT(mp != NULL);
901 
902 		/* Update latency information among lgrps. */
903 		if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) {
904 			if (lgrp_plat_process_sli(mp->u_domain,
905 			    mp->u_sli_ptr, mp->u_sli_cnt,
906 			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
907 			    &lgrp_plat_lat_stats) != 0) {
908 				cmn_err(CE_WARN, "!lgrp: failed to update "
909 				    "latency information for domain (%u).",
910 				    domain);
911 			}
912 		}
913 
914 		if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain,
915 		    lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes,
916 		    mp->u_base, mp->u_base + mp->u_length,
917 		    mp->u_domain, mp->u_device_id) < 0) {
918 			cmn_err(CE_WARN,
919 			    "!lgrp: failed to update latency  information for "
920 			    "memory (0x%" PRIx64 " - 0x%" PRIx64 ").",
921 			    mp->u_base, mp->u_base + mp->u_length);
922 		}
923 		break;
924 
925 	default:
926 		break;
927 	}
928 #endif	/* __xpv */
929 }
930 
931 
932 /*
933  * Return the platform handle for the lgroup containing the given CPU
934  */
935 lgrp_handle_t
936 lgrp_plat_cpu_to_hand(processorid_t id)
937 {
938 	lgrp_handle_t	hand;
939 
940 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
941 
942 	if (lgrp_plat_node_cnt == 1)
943 		return (LGRP_DEFAULT_HANDLE);
944 
945 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
946 	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
947 
948 	ASSERT(hand != (lgrp_handle_t)-1);
949 	if (hand == (lgrp_handle_t)-1)
950 		return (LGRP_NULL_HANDLE);
951 
952 	return (hand);
953 }
954 
955 
956 /*
957  * Platform-specific initialization of lgroups
958  */
959 void
960 lgrp_plat_init(lgrp_init_stages_t stage)
961 {
962 #if defined(__xpv)
963 #else	/* __xpv */
964 	u_longlong_t	value;
965 #endif	/* __xpv */
966 
967 	switch (stage) {
968 	case LGRP_INIT_STAGE1:
969 #if defined(__xpv)
970 		/*
971 		 * XXPV	For now, the hypervisor treats all memory equally.
972 		 */
973 		lgrp_plat_node_cnt = max_mem_nodes = 1;
974 #else	/* __xpv */
975 
976 		/*
977 		 * Get boot property for lgroup topology height limit
978 		 */
979 		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
980 			(void) lgrp_topo_ht_limit_set((int)value);
981 
982 		/*
983 		 * Get boot property for enabling/disabling SRAT
984 		 */
985 		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
986 			lgrp_plat_srat_enable = (int)value;
987 
988 		/*
989 		 * Get boot property for enabling/disabling SLIT
990 		 */
991 		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
992 			lgrp_plat_slit_enable = (int)value;
993 
994 		/*
995 		 * Get boot property for enabling/disabling MSCT
996 		 */
997 		if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0)
998 			lgrp_plat_msct_enable = (int)value;
999 
1000 		/*
1001 		 * Initialize as a UMA machine
1002 		 */
1003 		if (lgrp_topo_ht_limit() == 1) {
1004 			lgrp_plat_node_cnt = max_mem_nodes = 1;
1005 			lgrp_plat_max_mem_node = 1;
1006 			return;
1007 		}
1008 
1009 		lgrp_plat_get_numa_config();
1010 
1011 		/*
1012 		 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes
1013 		 * to support memory DR operations if memory DR is enabled.
1014 		 */
1015 		lgrp_plat_max_mem_node = lgrp_plat_node_cnt;
1016 		if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) {
1017 			max_mem_nodes = MAX_MEM_NODES_PER_LGROUP *
1018 			    lgrp_plat_node_cnt;
1019 			ASSERT(max_mem_nodes <= MAX_MEM_NODES);
1020 		}
1021 #endif	/* __xpv */
1022 		break;
1023 
1024 	case LGRP_INIT_STAGE3:
1025 		lgrp_plat_probe();
1026 		lgrp_plat_release_bootstrap();
1027 		break;
1028 
1029 	case LGRP_INIT_STAGE4:
1030 		lgrp_plat_main_init();
1031 		break;
1032 
1033 	default:
1034 		break;
1035 	}
1036 }
1037 
1038 
1039 /*
1040  * Return latency between "from" and "to" lgroups
1041  *
1042  * This latency number can only be used for relative comparison
1043  * between lgroups on the running system, cannot be used across platforms,
1044  * and may not reflect the actual latency.  It is platform and implementation
1045  * specific, so platform gets to decide its value.  It would be nice if the
1046  * number was at least proportional to make comparisons more meaningful though.
1047  */
1048 int
1049 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
1050 {
1051 	lgrp_handle_t	src, dest;
1052 	int		node;
1053 
1054 	if (max_mem_nodes == 1)
1055 		return (0);
1056 
1057 	/*
1058 	 * Return max latency for root lgroup
1059 	 */
1060 	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
1061 		return (lgrp_plat_lat_stats.latency_max);
1062 
1063 	src = from;
1064 	dest = to;
1065 
1066 	/*
1067 	 * Return 0 for nodes (lgroup platform handles) out of range
1068 	 */
1069 	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
1070 		return (0);
1071 
1072 	/*
1073 	 * Probe from current CPU if its lgroup latencies haven't been set yet
1074 	 * and we are trying to get latency from current CPU to some node.
1075 	 * Avoid probing if CPU/memory DR is enabled.
1076 	 */
1077 	if (lgrp_plat_lat_stats.latencies[src][src] == 0) {
1078 		/*
1079 		 * Latency information should be updated by lgrp_plat_config()
1080 		 * for DR operations. Something is wrong if reaches here.
1081 		 * For safety, flatten lgrp topology to two levels.
1082 		 */
1083 		if (plat_dr_support_cpu() || plat_dr_support_memory()) {
1084 			ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
1085 			cmn_err(CE_WARN,
1086 			    "lgrp: failed to get latency information, "
1087 			    "fall back to two-level topology.");
1088 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1089 		} else {
1090 			node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1091 			    lgrp_plat_cpu_node_nentries);
1092 			ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
1093 			if (node == src)
1094 				lgrp_plat_probe();
1095 		}
1096 	}
1097 
1098 	return (lgrp_plat_lat_stats.latencies[src][dest]);
1099 }
1100 
1101 
1102 /*
1103  * Return the maximum number of lgrps supported by the platform.
1104  * Before lgrp topology is known it returns an estimate based on the number of
1105  * nodes. Once topology is known it returns:
1106  * 1) the actual maximim number of lgrps created if CPU/memory DR operations
1107  *    are not suppported.
1108  * 2) the maximum possible number of lgrps if CPU/memory DR operations are
1109  *    supported.
1110  */
1111 int
1112 lgrp_plat_max_lgrps(void)
1113 {
1114 	if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
1115 	    plat_dr_support_memory()) {
1116 		return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
1117 	} else {
1118 		return (lgrp_alloc_max + 1);
1119 	}
1120 }
1121 
1122 
/*
 * Add to the running page count (_t) the number of memory pages in memnode
 * (_n) that match the query type (_q).  Nothing is added when the memnode
 * does not exist or the query type is not one of the three handled below.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement; it must be followed by a semicolon at the call site and
 * can safely be used as the body of an if/else.
 */
#define	_LGRP_PLAT_MEM_SIZE(_n, _q, _t)					\
	do {								\
		if (mem_node_config[_n].exists) {			\
			switch (_q) {					\
			case LGRP_MEM_SIZE_FREE:			\
				(_t) += MNODE_PGCNT(_n);		\
				break;					\
			case LGRP_MEM_SIZE_AVAIL:			\
				(_t) += mem_node_memlist_pages(_n,	\
				    phys_avail);			\
				break;					\
			case LGRP_MEM_SIZE_INSTALL:			\
				(_t) += mem_node_memlist_pages(_n,	\
				    phys_install);			\
				break;					\
			default:					\
				break;					\
			}						\
		}							\
	} while (0)
1142 
1143 /*
1144  * Return the number of free pages in an lgroup.
1145  *
1146  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
1147  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
1148  * number of allocatable base pagesize pages corresponding to the
1149  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
1150  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
1151  * memory installed, regardless of whether or not it's usable.
1152  */
1153 pgcnt_t
1154 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
1155 {
1156 	int	mnode;
1157 	pgcnt_t npgs = (pgcnt_t)0;
1158 	extern struct memlist *phys_avail;
1159 	extern struct memlist *phys_install;
1160 
1161 
1162 	if (plathand == LGRP_DEFAULT_HANDLE)
1163 		return (lgrp_plat_mem_size_default(plathand, query));
1164 
1165 	if (plathand != LGRP_NULL_HANDLE) {
1166 		/* Count memory node present at boot. */
1167 		mnode = (int)plathand;
1168 		ASSERT(mnode < lgrp_plat_node_cnt);
1169 		_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1170 
1171 		/* Count possible hot-added memory nodes. */
1172 		for (mnode = lgrp_plat_node_cnt;
1173 		    mnode < lgrp_plat_max_mem_node; mnode++) {
1174 			if (lgrp_plat_memnode_info[mnode].lgrphand == plathand)
1175 				_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1176 		}
1177 	}
1178 
1179 	return (npgs);
1180 }
1181 
1182 
1183 /*
1184  * Return the platform handle of the lgroup that contains the physical memory
1185  * corresponding to the given page frame number
1186  */
1187 lgrp_handle_t
1188 lgrp_plat_pfn_to_hand(pfn_t pfn)
1189 {
1190 	int	mnode;
1191 
1192 	if (max_mem_nodes == 1)
1193 		return (LGRP_DEFAULT_HANDLE);
1194 
1195 	if (pfn > physmax)
1196 		return (LGRP_NULL_HANDLE);
1197 
1198 	mnode = plat_pfn_to_mem_node(pfn);
1199 	if (mnode < 0)
1200 		return (LGRP_NULL_HANDLE);
1201 
1202 	return (MEM_NODE_2_LGRPHAND(mnode));
1203 }
1204 
1205 
1206 /*
1207  * Probe memory in each node from current CPU to determine latency topology
1208  *
1209  * The probing code will probe the vendor ID register on the Northbridge of
1210  * Opteron processors and probe memory for other processors by default.
1211  *
1212  * Since probing is inherently error prone, the code takes laps across all the
1213  * nodes probing from each node to each of the other nodes some number of
1214  * times.  Furthermore, each node is probed some number of times before moving
1215  * onto the next one during each lap.  The minimum latency gotten between nodes
1216  * is kept as the latency between the nodes.
1217  *
1218  * After all that,  the probe times are adjusted by normalizing values that are
1219  * close to each other and local latencies are made the same.  Lastly, the
1220  * latencies are verified to make sure that certain conditions are met (eg.
1221  * local < remote, latency(a, b) == latency(b, a), etc.).
1222  *
1223  * If any of the conditions aren't met, the code will export a NUMA
1224  * configuration with the local CPUs and memory given by the SRAT or PCI config
1225  * space registers and one remote memory latency since it can't tell exactly
1226  * how far each node is from each other.
1227  */
1228 void
1229 lgrp_plat_probe(void)
1230 {
1231 	int				from;
1232 	int				i;
1233 	lgrp_plat_latency_stats_t	*lat_stats;
1234 	boolean_t			probed;
1235 	hrtime_t			probe_time;
1236 	int				to;
1237 
1238 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
1239 	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
1240 		return;
1241 
1242 	/* SRAT and SLIT should be enabled if DR operations are enabled. */
1243 	if (plat_dr_support_cpu() || plat_dr_support_memory())
1244 		return;
1245 
1246 	/*
1247 	 * Determine ID of node containing current CPU
1248 	 */
1249 	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1250 	    lgrp_plat_cpu_node_nentries);
1251 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
1252 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
1253 		ASSERT(lgrp_plat_node_domain[from].exists);
1254 
1255 	/*
1256 	 * Don't need to probe if got times already
1257 	 */
1258 	lat_stats = &lgrp_plat_lat_stats;
1259 	if (lat_stats->latencies[from][from] != 0)
1260 		return;
1261 
1262 	/*
1263 	 * Read vendor ID in Northbridge or read and write page(s)
1264 	 * in each node from current CPU and remember how long it takes,
1265 	 * so we can build latency topology of machine later.
1266 	 * This should approximate the memory latency between each node.
1267 	 */
1268 	probed = B_FALSE;
1269 	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
1270 		for (to = 0; to < lgrp_plat_node_cnt; to++) {
1271 			/*
1272 			 * Get probe time and skip over any nodes that can't be
1273 			 * probed yet or don't have memory
1274 			 */
1275 			probe_time = lgrp_plat_probe_time(to,
1276 			    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
1277 			    &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
1278 			    &lgrp_plat_probe_stats);
1279 			if (probe_time == 0)
1280 				continue;
1281 
1282 			probed = B_TRUE;
1283 
1284 			/*
1285 			 * Keep lowest probe time as latency between nodes
1286 			 */
1287 			if (lat_stats->latencies[from][to] == 0 ||
1288 			    probe_time < lat_stats->latencies[from][to])
1289 				lat_stats->latencies[from][to] = probe_time;
1290 
1291 			/*
1292 			 * Update overall minimum and maximum probe times
1293 			 * across all nodes
1294 			 */
1295 			if (probe_time < lat_stats->latency_min ||
1296 			    lat_stats->latency_min == -1)
1297 				lat_stats->latency_min = probe_time;
1298 			if (probe_time > lat_stats->latency_max)
1299 				lat_stats->latency_max = probe_time;
1300 		}
1301 	}
1302 
1303 	/*
1304 	 * Bail out if weren't able to probe any nodes from current CPU
1305 	 */
1306 	if (probed == B_FALSE)
1307 		return;
1308 
1309 	/*
1310 	 * - Fix up latencies such that local latencies are same,
1311 	 *   latency(i, j) == latency(j, i), etc. (if possible)
1312 	 *
1313 	 * - Verify that latencies look ok
1314 	 *
1315 	 * - Fallback to just optimizing for local and remote if
1316 	 *   latencies didn't look right
1317 	 */
1318 	lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats,
1319 	    &lgrp_plat_probe_stats);
1320 	lgrp_plat_probe_stats.probe_error_code =
1321 	    lgrp_plat_latency_verify(lgrp_plat_memnode_info,
1322 	    &lgrp_plat_lat_stats);
1323 	if (lgrp_plat_probe_stats.probe_error_code)
1324 		lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1325 }
1326 
1327 
1328 /*
1329  * Return platform handle for root lgroup
1330  */
1331 lgrp_handle_t
1332 lgrp_plat_root_hand(void)
1333 {
1334 	return (LGRP_DEFAULT_HANDLE);
1335 }
1336 
1337 
1338 /*
1339  * INTERNAL ROUTINES
1340  */
1341 
1342 
1343 /*
1344  * Update CPU to node mapping for given CPU and proximity domain.
1345  * Return values:
1346  * 	- zero for success
1347  *	- positive numbers for warnings
1348  *	- negative numbers for errors
1349  */
1350 static int
1351 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
1352     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
1353 {
1354 	uint_t	i;
1355 	int	node;
1356 
1357 	/*
1358 	 * Get node number for proximity domain
1359 	 */
1360 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
1361 	if (node == -1) {
1362 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
1363 		    domain);
1364 		if (node == -1)
1365 			return (-1);
1366 	}
1367 
1368 	/*
1369 	 * Search for entry with given APIC ID and fill in its node and
1370 	 * proximity domain IDs (if they haven't been set already)
1371 	 */
1372 	for (i = 0; i < nentries; i++) {
1373 		/*
1374 		 * Skip nonexistent entries and ones without matching APIC ID
1375 		 */
1376 		if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
1377 			continue;
1378 
1379 		/*
1380 		 * Just return if entry completely and correctly filled in
1381 		 * already
1382 		 */
1383 		if (cpu_node[i].prox_domain == domain &&
1384 		    cpu_node[i].node == node)
1385 			return (1);
1386 
1387 		/*
1388 		 * It's invalid to have more than one entry with the same
1389 		 * local APIC ID in SRAT table.
1390 		 */
1391 		if (cpu_node[i].node != UINT_MAX)
1392 			return (-2);
1393 
1394 		/*
1395 		 * Fill in node and proximity domain IDs
1396 		 */
1397 		cpu_node[i].prox_domain = domain;
1398 		cpu_node[i].node = node;
1399 
1400 		return (0);
1401 	}
1402 
1403 	/*
1404 	 * It's possible that an apicid doesn't exist in the cpu_node map due
1405 	 * to user limits number of CPUs powered on at boot by specifying the
1406 	 * boot_ncpus kernel option.
1407 	 */
1408 	return (2);
1409 }
1410 
1411 
1412 /*
1413  * Get node ID for given CPU
1414  */
1415 static int
1416 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
1417     int cpu_node_nentries)
1418 {
1419 	processorid_t	cpuid;
1420 
1421 	if (cp == NULL)
1422 		return (-1);
1423 
1424 	cpuid = cp->cpu_id;
1425 	if (cpuid < 0 || cpuid >= max_ncpus)
1426 		return (-1);
1427 
1428 	/*
1429 	 * SRAT doesn't exist, isn't enabled, or there was an error processing
1430 	 * it, so return node ID for Opteron and -1 otherwise.
1431 	 */
1432 	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
1433 	    lgrp_plat_srat_error) {
1434 		if (is_opteron())
1435 			return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
1436 		return (-1);
1437 	}
1438 
1439 	/*
1440 	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
1441 	 * CPU
1442 	 */
1443 	if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
1444 		return (-1);
1445 
1446 	return (cpu_node[cpuid].node);
1447 }
1448 
1449 
1450 /*
1451  * Return node number for given proximity domain/system locality
1452  */
1453 static int
1454 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
1455     uint32_t domain)
1456 {
1457 	uint_t	node;
1458 	uint_t	start;
1459 
1460 	/*
1461 	 * Hash proximity domain ID into node to domain mapping table (array),
1462 	 * search for entry with matching proximity domain ID, and return index
1463 	 * of matching entry as node ID.
1464 	 */
1465 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
1466 	do {
1467 		if (node_domain[node].exists) {
1468 			membar_consumer();
1469 			if (node_domain[node].prox_domain == domain)
1470 				return (node);
1471 		}
1472 		node = (node + 1) % node_cnt;
1473 	} while (node != start);
1474 	return (-1);
1475 }
1476 
1477 
1478 /*
1479  * Get NUMA configuration of machine
1480  */
1481 static void
1482 lgrp_plat_get_numa_config(void)
1483 {
1484 	uint_t		probe_op;
1485 
1486 	/*
1487 	 * Read boot property with CPU to APIC ID mapping table/array to
1488 	 * determine number of CPUs
1489 	 */
1490 	lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
1491 
1492 	/*
1493 	 * Determine which CPUs and memory are local to each other and number
1494 	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
1495 	 */
1496 	if (lgrp_plat_apic_ncpus > 0) {
1497 		int	retval;
1498 
1499 		/* Reserve enough resources if CPU DR is enabled. */
1500 		if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus)
1501 			lgrp_plat_cpu_node_nentries = max_ncpus;
1502 		else
1503 			lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
1504 
1505 		/*
1506 		 * Temporarily allocate boot memory to use for CPU to node
1507 		 * mapping since kernel memory allocator isn't alive yet
1508 		 */
1509 		lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
1510 		    NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t),
1511 		    sizeof (int));
1512 
1513 		ASSERT(lgrp_plat_cpu_node != NULL);
1514 		if (lgrp_plat_cpu_node) {
1515 			bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
1516 			    sizeof (cpu_node_map_t));
1517 		} else {
1518 			lgrp_plat_cpu_node_nentries = 0;
1519 		}
1520 
1521 		/*
1522 		 * Fill in CPU to node ID mapping table with APIC ID for each
1523 		 * CPU
1524 		 */
1525 		(void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
1526 
1527 		retval = lgrp_plat_process_srat(srat_ptr, msct_ptr,
1528 		    &lgrp_plat_prox_domain_min,
1529 		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
1530 		    lgrp_plat_apic_ncpus, lgrp_plat_memnode_info);
1531 		if (retval <= 0) {
1532 			lgrp_plat_srat_error = retval;
1533 			lgrp_plat_node_cnt = 1;
1534 		} else {
1535 			lgrp_plat_srat_error = 0;
1536 			lgrp_plat_node_cnt = retval;
1537 		}
1538 	}
1539 
1540 	/*
1541 	 * Try to use PCI config space registers on Opteron if there's an error
1542 	 * processing CPU to APIC ID mapping or SRAT
1543 	 */
1544 	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
1545 	    is_opteron())
1546 		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
1547 		    lgrp_plat_memnode_info);
1548 
1549 	/*
1550 	 * Don't bother to setup system for multiple lgroups and only use one
1551 	 * memory node when memory is interleaved between any nodes or there is
1552 	 * only one NUMA node
1553 	 */
1554 	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
1555 		lgrp_plat_node_cnt = max_mem_nodes = 1;
1556 		(void) lgrp_topo_ht_limit_set(1);
1557 		return;
1558 	}
1559 
1560 	/*
1561 	 * Leaf lgroups on x86/x64 architectures contain one physical
1562 	 * processor chip. Tune lgrp_expand_proc_thresh and
1563 	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
1564 	 * things out aggressively.
1565 	 */
1566 	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
1567 	lgrp_expand_proc_diff = 0;
1568 
1569 	/*
1570 	 * There should be one memnode (physical page free list(s)) for
1571 	 * each node if memory DR is disabled.
1572 	 */
1573 	max_mem_nodes = lgrp_plat_node_cnt;
1574 
1575 	/*
1576 	 * Initialize min and max latency before reading SLIT or probing
1577 	 */
1578 	lgrp_plat_lat_stats.latency_min = -1;
1579 	lgrp_plat_lat_stats.latency_max = 0;
1580 
1581 	/*
1582 	 * Determine how far each NUMA node is from each other by
1583 	 * reading ACPI System Locality Information Table (SLIT) if it
1584 	 * exists
1585 	 */
1586 	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
1587 	    lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info,
1588 	    &lgrp_plat_lat_stats);
1589 
1590 	/*
1591 	 * Disable support of CPU/memory DR operations if multiple locality
1592 	 * domains exist in system and either of following is true.
1593 	 * 1) Failed to process SLIT table.
1594 	 * 2) Latency probing is enabled by user.
1595 	 */
1596 	if (lgrp_plat_node_cnt > 1 &&
1597 	    (plat_dr_support_cpu() || plat_dr_support_memory())) {
1598 		if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 ||
1599 		    !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 ||
1600 		    lgrp_plat_apic_ncpus <= 0) {
1601 			cmn_err(CE_CONT,
1602 			    "?lgrp: failed to process ACPI SRAT/SLIT table, "
1603 			    "disable support of CPU/memory DR operations.");
1604 			plat_dr_disable_cpu();
1605 			plat_dr_disable_memory();
1606 		} else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) {
1607 			cmn_err(CE_CONT,
1608 			    "?lgrp: latency probing enabled by user, "
1609 			    "disable support of CPU/memory DR operations.");
1610 			plat_dr_disable_cpu();
1611 			plat_dr_disable_memory();
1612 		}
1613 	}
1614 
1615 	/* Done if succeeded to process SLIT table. */
1616 	if (lgrp_plat_slit_error == 0)
1617 		return;
1618 
1619 	/*
1620 	 * Probe to determine latency between NUMA nodes when SLIT
1621 	 * doesn't exist or make sense
1622 	 */
1623 	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
1624 
1625 	/*
1626 	 * Specify whether to probe using vendor ID register or page copy
1627 	 * if hasn't been specified already or is overspecified
1628 	 */
1629 	probe_op = lgrp_plat_probe_flags &
1630 	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1631 
1632 	if (probe_op == 0 ||
1633 	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
1634 		lgrp_plat_probe_flags &=
1635 		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1636 		if (is_opteron())
1637 			lgrp_plat_probe_flags |=
1638 			    LGRP_PLAT_PROBE_VENDOR;
1639 		else
1640 			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
1641 	}
1642 
1643 	/*
1644 	 * Probing errors can mess up the lgroup topology and
1645 	 * force us fall back to a 2 level lgroup topology.
1646 	 * Here we bound how tall the lgroup topology can grow
1647 	 * in hopes of avoiding any anamolies in probing from
1648 	 * messing up the lgroup topology by limiting the
1649 	 * accuracy of the latency topology.
1650 	 *
1651 	 * Assume that nodes will at least be configured in a
1652 	 * ring, so limit height of lgroup topology to be less
1653 	 * than number of nodes on a system with 4 or more
1654 	 * nodes
1655 	 */
1656 	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
1657 	    lgrp_topo_ht_limit_default())
1658 		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
1659 }
1660 
1661 
/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

/*
 * Tolerance shift actually applied by the latency adjustment code, which
 * right-shifts a latency by this variable to get the allowed difference.
 * Initialized from LGRP_LAT_TOLERANCE_SHIFT.
 */
int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
1669 
1670 
1671 /*
1672  * Adjust latencies between nodes to be symmetric, normalize latencies between
1673  * any nodes that are within some tolerance to be same, and make local
1674  * latencies be same
1675  */
1676 static void
1677 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
1678     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1679 {
1680 	int				i;
1681 	int				j;
1682 	int				k;
1683 	int				l;
1684 	u_longlong_t			max;
1685 	u_longlong_t			min;
1686 	u_longlong_t			t;
1687 	u_longlong_t			t1;
1688 	u_longlong_t			t2;
1689 	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
1690 	int				lat_corrected[MAX_NODES][MAX_NODES];
1691 
1692 	/*
1693 	 * Nothing to do when this is an UMA machine or don't have args needed
1694 	 */
1695 	if (max_mem_nodes == 1)
1696 		return;
1697 
1698 	ASSERT(memnode_info != NULL && lat_stats != NULL &&
1699 	    probe_stats != NULL);
1700 
1701 	/*
1702 	 * Make sure that latencies are symmetric between any two nodes
1703 	 * (ie. latency(node0, node1) == latency(node1, node0))
1704 	 */
1705 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1706 		if (!memnode_info[i].exists)
1707 			continue;
1708 
1709 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1710 			if (!memnode_info[j].exists)
1711 				continue;
1712 
1713 			t1 = lat_stats->latencies[i][j];
1714 			t2 = lat_stats->latencies[j][i];
1715 
1716 			if (t1 == 0 || t2 == 0 || t1 == t2)
1717 				continue;
1718 
1719 			/*
1720 			 * Latencies should be same
1721 			 * - Use minimum of two latencies which should be same
1722 			 * - Track suspect probe times not within tolerance of
1723 			 *   min value
1724 			 * - Remember how much values are corrected by
1725 			 */
1726 			if (t1 > t2) {
1727 				t = t2;
1728 				probe_stats->probe_errors[i][j] += t1 - t2;
1729 				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
1730 					probe_stats->probe_suspect[i][j]++;
1731 					probe_stats->probe_suspect[j][i]++;
1732 				}
1733 			} else if (t2 > t1) {
1734 				t = t1;
1735 				probe_stats->probe_errors[j][i] += t2 - t1;
1736 				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
1737 					probe_stats->probe_suspect[i][j]++;
1738 					probe_stats->probe_suspect[j][i]++;
1739 				}
1740 			}
1741 
1742 			lat_stats->latencies[i][j] =
1743 			    lat_stats->latencies[j][i] = t;
1744 			lgrp_config(cflag, t1, t);
1745 			lgrp_config(cflag, t2, t);
1746 		}
1747 	}
1748 
1749 	/*
1750 	 * Keep track of which latencies get corrected
1751 	 */
1752 	for (i = 0; i < MAX_NODES; i++)
1753 		for (j = 0; j < MAX_NODES; j++)
1754 			lat_corrected[i][j] = 0;
1755 
1756 	/*
1757 	 * For every two nodes, see whether there is another pair of nodes which
1758 	 * are about the same distance apart and make the latencies be the same
1759 	 * if they are close enough together
1760 	 */
1761 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1762 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1763 			if (!memnode_info[j].exists)
1764 				continue;
1765 			/*
1766 			 * Pick one pair of nodes (i, j)
1767 			 * and get latency between them
1768 			 */
1769 			t1 = lat_stats->latencies[i][j];
1770 
1771 			/*
1772 			 * Skip this pair of nodes if there isn't a latency
1773 			 * for it yet
1774 			 */
1775 			if (t1 == 0)
1776 				continue;
1777 
1778 			for (k = 0; k < lgrp_plat_node_cnt; k++) {
1779 				for (l = 0; l < lgrp_plat_node_cnt; l++) {
1780 					if (!memnode_info[l].exists)
1781 						continue;
1782 					/*
1783 					 * Pick another pair of nodes (k, l)
1784 					 * not same as (i, j) and get latency
1785 					 * between them
1786 					 */
1787 					if (k == i && l == j)
1788 						continue;
1789 
1790 					t2 = lat_stats->latencies[k][l];
1791 
1792 					/*
1793 					 * Skip this pair of nodes if there
1794 					 * isn't a latency for it yet
1795 					 */
1796 
1797 					if (t2 == 0)
1798 						continue;
1799 
1800 					/*
1801 					 * Skip nodes (k, l) if they already
1802 					 * have same latency as (i, j) or
1803 					 * their latency isn't close enough to
1804 					 * be considered/made the same
1805 					 */
1806 					if (t1 == t2 || (t1 > t2 && t1 - t2 >
1807 					    t1 >> lgrp_plat_probe_lt_shift) ||
1808 					    (t2 > t1 && t2 - t1 >
1809 					    t2 >> lgrp_plat_probe_lt_shift))
1810 						continue;
1811 
1812 					/*
1813 					 * Make latency(i, j) same as
1814 					 * latency(k, l), try to use latency
1815 					 * that has been adjusted already to get
1816 					 * more consistency (if possible), and
1817 					 * remember which latencies were
1818 					 * adjusted for next time
1819 					 */
1820 					if (lat_corrected[i][j]) {
1821 						t = t1;
1822 						lgrp_config(cflag, t2, t);
1823 						t2 = t;
1824 					} else if (lat_corrected[k][l]) {
1825 						t = t2;
1826 						lgrp_config(cflag, t1, t);
1827 						t1 = t;
1828 					} else {
1829 						if (t1 > t2)
1830 							t = t2;
1831 						else
1832 							t = t1;
1833 						lgrp_config(cflag, t1, t);
1834 						lgrp_config(cflag, t2, t);
1835 						t1 = t2 = t;
1836 					}
1837 
1838 					lat_stats->latencies[i][j] =
1839 					    lat_stats->latencies[k][l] = t;
1840 
1841 					lat_corrected[i][j] =
1842 					    lat_corrected[k][l] = 1;
1843 				}
1844 			}
1845 		}
1846 	}
1847 
1848 	/*
1849 	 * Local latencies should be same
1850 	 * - Find min and max local latencies
1851 	 * - Make all local latencies be minimum
1852 	 */
1853 	min = -1;
1854 	max = 0;
1855 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1856 		if (!memnode_info[i].exists)
1857 			continue;
1858 		t = lat_stats->latencies[i][i];
1859 		if (t == 0)
1860 			continue;
1861 		if (min == -1 || t < min)
1862 			min = t;
1863 		if (t > max)
1864 			max = t;
1865 	}
1866 	if (min != max) {
1867 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1868 			int	local;
1869 
1870 			if (!memnode_info[i].exists)
1871 				continue;
1872 
1873 			local = lat_stats->latencies[i][i];
1874 			if (local == 0)
1875 				continue;
1876 
1877 			/*
1878 			 * Track suspect probe times that aren't within
1879 			 * tolerance of minimum local latency and how much
1880 			 * probe times are corrected by
1881 			 */
1882 			if (local - min > min >> lgrp_plat_probe_lt_shift)
1883 				probe_stats->probe_suspect[i][i]++;
1884 
1885 			probe_stats->probe_errors[i][i] += local - min;
1886 
1887 			/*
1888 			 * Make local latencies be minimum
1889 			 */
1890 			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
1891 			lat_stats->latencies[i][i] = min;
1892 		}
1893 	}
1894 
1895 	/*
1896 	 * Determine max probe time again since just adjusted latencies
1897 	 */
1898 	lat_stats->latency_max = 0;
1899 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1900 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1901 			if (!memnode_info[j].exists)
1902 				continue;
1903 			t = lat_stats->latencies[i][j];
1904 			if (t > lat_stats->latency_max)
1905 				lat_stats->latency_max = t;
1906 		}
1907 	}
1908 }
1909 
1910 
1911 /*
1912  * Verify following about latencies between nodes:
1913  *
1914  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
1915  * - Local latencies same
1916  * - Local < remote
1917  * - Number of latencies seen is reasonable
1918  * - Number of occurrences of a given latency should be more than 1
1919  *
1920  * Returns:
1921  *	0	Success
1922  *	-1	Not symmetric
1923  *	-2	Local latencies not same
1924  *	-3	Local >= remote
1925  */
1926 static int
1927 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
1928     lgrp_plat_latency_stats_t *lat_stats)
1929 {
1930 	int				i;
1931 	int				j;
1932 	u_longlong_t			t1;
1933 	u_longlong_t			t2;
1934 
1935 	ASSERT(memnode_info != NULL && lat_stats != NULL);
1936 
1937 	/*
1938 	 * Nothing to do when this is an UMA machine, lgroup topology is
1939 	 * limited to 2 levels, or there aren't any probe times yet
1940 	 */
1941 	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
1942 	    lat_stats->latencies[0][0] == 0)
1943 		return (0);
1944 
1945 	/*
1946 	 * Make sure that latencies are symmetric between any two nodes
1947 	 * (ie. latency(node0, node1) == latency(node1, node0))
1948 	 */
1949 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1950 		if (!memnode_info[i].exists)
1951 			continue;
1952 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1953 			if (!memnode_info[j].exists)
1954 				continue;
1955 			t1 = lat_stats->latencies[i][j];
1956 			t2 = lat_stats->latencies[j][i];
1957 
1958 			if (t1 == 0 || t2 == 0 || t1 == t2)
1959 				continue;
1960 
1961 			return (-1);
1962 		}
1963 	}
1964 
1965 	/*
1966 	 * Local latencies should be same
1967 	 */
1968 	t1 = lat_stats->latencies[0][0];
1969 	for (i = 1; i < lgrp_plat_node_cnt; i++) {
1970 		if (!memnode_info[i].exists)
1971 			continue;
1972 
1973 		t2 = lat_stats->latencies[i][i];
1974 		if (t2 == 0)
1975 			continue;
1976 
1977 		if (t1 == 0) {
1978 			t1 = t2;
1979 			continue;
1980 		}
1981 
1982 		if (t1 != t2)
1983 			return (-2);
1984 	}
1985 
1986 	/*
1987 	 * Local latencies should be less than remote
1988 	 */
1989 	if (t1) {
1990 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1991 			for (j = 0; j < lgrp_plat_node_cnt; j++) {
1992 				if (!memnode_info[j].exists)
1993 					continue;
1994 				t2 = lat_stats->latencies[i][j];
1995 				if (i == j || t2 == 0)
1996 					continue;
1997 
1998 				if (t1 >= t2)
1999 					return (-3);
2000 			}
2001 		}
2002 	}
2003 
2004 	return (0);
2005 }
2006 
2007 
2008 /*
2009  * Platform-specific initialization
2010  */
2011 static void
2012 lgrp_plat_main_init(void)
2013 {
2014 	int	curnode;
2015 	int	ht_limit;
2016 	int	i;
2017 
2018 	/*
2019 	 * Print a notice that MPO is disabled when memory is interleaved
2020 	 * across nodes....Would do this when it is discovered, but can't
2021 	 * because it happens way too early during boot....
2022 	 */
2023 	if (lgrp_plat_mem_intrlv)
2024 		cmn_err(CE_NOTE,
2025 		    "MPO disabled because memory is interleaved\n");
2026 
2027 	/*
2028 	 * Don't bother to do any probing if it is disabled, there is only one
2029 	 * node, or the height of the lgroup topology less than or equal to 2
2030 	 */
2031 	ht_limit = lgrp_topo_ht_limit();
2032 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2033 	    max_mem_nodes == 1 || ht_limit <= 2) {
2034 		/*
2035 		 * Setup lgroup latencies for 2 level lgroup topology
2036 		 * (ie. local and remote only) if they haven't been set yet
2037 		 */
2038 		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
2039 		    lgrp_plat_lat_stats.latency_max == 0)
2040 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
2041 		return;
2042 	}
2043 
2044 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2045 		/*
2046 		 * Should have been able to probe from CPU 0 when it was added
2047 		 * to lgroup hierarchy, but may not have been able to then
2048 		 * because it happens so early in boot that gethrtime() hasn't
2049 		 * been initialized.  (:-(
2050 		 */
2051 		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
2052 		    lgrp_plat_cpu_node_nentries);
2053 		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
2054 		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
2055 			lgrp_plat_probe();
2056 
2057 		return;
2058 	}
2059 
2060 	/*
2061 	 * When probing memory, use one page for every sample to determine
2062 	 * lgroup topology and taking multiple samples
2063 	 */
2064 	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
2065 		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
2066 		    lgrp_plat_probe_nsamples;
2067 
2068 	/*
2069 	 * Map memory in each node needed for probing to determine latency
2070 	 * topology
2071 	 */
2072 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2073 		int	mnode;
2074 
2075 		/*
2076 		 * Skip this node and leave its probe page NULL
2077 		 * if it doesn't have any memory
2078 		 */
2079 		mnode = i;
2080 		if (!mem_node_config[mnode].exists) {
2081 			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
2082 			continue;
2083 		}
2084 
2085 		/*
2086 		 * Allocate one kernel virtual page
2087 		 */
2088 		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
2089 		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
2090 		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
2091 			cmn_err(CE_WARN,
2092 			    "lgrp_plat_main_init: couldn't allocate memory");
2093 			return;
2094 		}
2095 
2096 		/*
2097 		 * Get PFN for first page in each node
2098 		 */
2099 		lgrp_plat_probe_mem_config.probe_pfn[i] =
2100 		    mem_node_config[mnode].physbase;
2101 
2102 		/*
2103 		 * Map virtual page to first page in node
2104 		 */
2105 		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
2106 		    lgrp_plat_probe_mem_config.probe_memsize,
2107 		    lgrp_plat_probe_mem_config.probe_pfn[i],
2108 		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
2109 		    HAT_LOAD_NOCONSIST);
2110 	}
2111 
2112 	/*
2113 	 * Probe from current CPU
2114 	 */
2115 	lgrp_plat_probe();
2116 }
2117 
2118 
2119 /*
2120  * Return the number of free, allocatable, or installed
2121  * pages in an lgroup
2122  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
2123  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
2124  */
2125 static pgcnt_t
2126 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
2127 {
2128 	_NOTE(ARGUNUSED(lgrphand));
2129 
2130 	struct memlist *mlist;
2131 	pgcnt_t npgs = 0;
2132 	extern struct memlist *phys_avail;
2133 	extern struct memlist *phys_install;
2134 
2135 	switch (query) {
2136 	case LGRP_MEM_SIZE_FREE:
2137 		return ((pgcnt_t)freemem);
2138 	case LGRP_MEM_SIZE_AVAIL:
2139 		memlist_read_lock();
2140 		for (mlist = phys_avail; mlist; mlist = mlist->ml_next)
2141 			npgs += btop(mlist->ml_size);
2142 		memlist_read_unlock();
2143 		return (npgs);
2144 	case LGRP_MEM_SIZE_INSTALL:
2145 		memlist_read_lock();
2146 		for (mlist = phys_install; mlist; mlist = mlist->ml_next)
2147 			npgs += btop(mlist->ml_size);
2148 		memlist_read_unlock();
2149 		return (npgs);
2150 	default:
2151 		return ((pgcnt_t)0);
2152 	}
2153 }
2154 
2155 
2156 /*
2157  * Update node to proximity domain mappings for given domain and return node ID
2158  */
2159 static int
2160 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
2161     uint32_t domain)
2162 {
2163 	uint_t	node;
2164 	uint_t	start;
2165 
2166 	/*
2167 	 * Hash proximity domain ID into node to domain mapping table (array)
2168 	 * and add entry for it into first non-existent or matching entry found
2169 	 */
2170 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
2171 	do {
2172 		/*
2173 		 * Entry doesn't exist yet, so create one for this proximity
2174 		 * domain and return node ID which is index into mapping table.
2175 		 */
2176 		if (!node_domain[node].exists) {
2177 			node_domain[node].prox_domain = domain;
2178 			membar_producer();
2179 			node_domain[node].exists = 1;
2180 			return (node);
2181 		}
2182 
2183 		/*
2184 		 * Entry exists for this proximity domain already, so just
2185 		 * return node ID (index into table).
2186 		 */
2187 		if (node_domain[node].prox_domain == domain)
2188 			return (node);
2189 		node = NODE_DOMAIN_HASH(node + 1, node_cnt);
2190 	} while (node != start);
2191 
2192 	/*
2193 	 * Ran out of supported number of entries which shouldn't happen....
2194 	 */
2195 	ASSERT(node != start);
2196 	return (-1);
2197 }
2198 
2199 /*
2200  * Update node memory information for given proximity domain with specified
2201  * starting and ending physical address range (and return positive numbers for
2202  * success and negative ones for errors)
2203  */
2204 static int
2205 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt,
2206     memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start,
2207     uint64_t end, uint32_t domain, uint32_t device_id)
2208 {
2209 	int	node, mnode;
2210 
2211 	/*
2212 	 * Get node number for proximity domain
2213 	 */
2214 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
2215 	if (node == -1) {
2216 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
2217 		    domain);
2218 		if (node == -1)
2219 			return (-1);
2220 	}
2221 
2222 	/*
2223 	 * This function is called during boot if device_id is
2224 	 * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for
2225 	 * memory DR operations.
2226 	 */
2227 	if (device_id != ACPI_MEMNODE_DEVID_BOOT) {
2228 		ASSERT(lgrp_plat_max_mem_node <= memnode_cnt);
2229 
2230 		for (mnode = lgrp_plat_node_cnt;
2231 		    mnode < lgrp_plat_max_mem_node; mnode++) {
2232 			if (memnode_info[mnode].exists &&
2233 			    memnode_info[mnode].prox_domain == domain &&
2234 			    memnode_info[mnode].device_id == device_id) {
2235 				if (btop(start) < memnode_info[mnode].start)
2236 					memnode_info[mnode].start = btop(start);
2237 				if (btop(end) > memnode_info[mnode].end)
2238 					memnode_info[mnode].end = btop(end);
2239 				return (1);
2240 			}
2241 		}
2242 
2243 		if (lgrp_plat_max_mem_node >= memnode_cnt) {
2244 			return (-3);
2245 		} else {
2246 			lgrp_plat_max_mem_node++;
2247 			memnode_info[mnode].start = btop(start);
2248 			memnode_info[mnode].end = btop(end);
2249 			memnode_info[mnode].prox_domain = domain;
2250 			memnode_info[mnode].device_id = device_id;
2251 			memnode_info[mnode].lgrphand = node;
2252 			membar_producer();
2253 			memnode_info[mnode].exists = 1;
2254 			return (0);
2255 		}
2256 	}
2257 
2258 	/*
2259 	 * Create entry in table for node if it doesn't exist
2260 	 */
2261 	ASSERT(node < memnode_cnt);
2262 	if (!memnode_info[node].exists) {
2263 		memnode_info[node].start = btop(start);
2264 		memnode_info[node].end = btop(end);
2265 		memnode_info[node].prox_domain = domain;
2266 		memnode_info[node].device_id = device_id;
2267 		memnode_info[node].lgrphand = node;
2268 		membar_producer();
2269 		memnode_info[node].exists = 1;
2270 		return (0);
2271 	}
2272 
2273 	/*
2274 	 * Entry already exists for this proximity domain
2275 	 *
2276 	 * There may be more than one SRAT memory entry for a domain, so we may
2277 	 * need to update existing start or end address for the node.
2278 	 */
2279 	if (memnode_info[node].prox_domain == domain) {
2280 		if (btop(start) < memnode_info[node].start)
2281 			memnode_info[node].start = btop(start);
2282 		if (btop(end) > memnode_info[node].end)
2283 			memnode_info[node].end = btop(end);
2284 		return (1);
2285 	}
2286 	return (-2);
2287 }
2288 
2289 
2290 /*
2291  * Have to sort nodes by starting physical address because plat_mnode_xcheck()
2292  * assumes and expects memnodes to be sorted in ascending order by physical
2293  * address.
2294  */
2295 static void
2296 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
2297     cpu_node_map_t *cpu_node, int cpu_count,
2298     memnode_phys_addr_map_t *memnode_info)
2299 {
2300 	boolean_t	found;
2301 	int		i;
2302 	int		j;
2303 	int		n;
2304 	boolean_t	sorted;
2305 	boolean_t	swapped;
2306 
2307 	if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
2308 	    node_domain == NULL || memnode_info == NULL)
2309 		return;
2310 
2311 	/*
2312 	 * Sorted already?
2313 	 */
2314 	sorted = B_TRUE;
2315 	for (i = 0; i < node_cnt - 1; i++) {
2316 		/*
2317 		 * Skip entries that don't exist
2318 		 */
2319 		if (!memnode_info[i].exists)
2320 			continue;
2321 
2322 		/*
2323 		 * Try to find next existing entry to compare against
2324 		 */
2325 		found = B_FALSE;
2326 		for (j = i + 1; j < node_cnt; j++) {
2327 			if (memnode_info[j].exists) {
2328 				found = B_TRUE;
2329 				break;
2330 			}
2331 		}
2332 
2333 		/*
2334 		 * Done if no more existing entries to compare against
2335 		 */
2336 		if (found == B_FALSE)
2337 			break;
2338 
2339 		/*
2340 		 * Not sorted if starting address of current entry is bigger
2341 		 * than starting address of next existing entry
2342 		 */
2343 		if (memnode_info[i].start > memnode_info[j].start) {
2344 			sorted = B_FALSE;
2345 			break;
2346 		}
2347 	}
2348 
2349 	/*
2350 	 * Don't need to sort if sorted already
2351 	 */
2352 	if (sorted == B_TRUE)
2353 		return;
2354 
2355 	/*
2356 	 * Just use bubble sort since number of nodes is small
2357 	 */
2358 	n = node_cnt;
2359 	do {
2360 		swapped = B_FALSE;
2361 		n--;
2362 		for (i = 0; i < n; i++) {
2363 			/*
2364 			 * Skip entries that don't exist
2365 			 */
2366 			if (!memnode_info[i].exists)
2367 				continue;
2368 
2369 			/*
2370 			 * Try to find next existing entry to compare against
2371 			 */
2372 			found = B_FALSE;
2373 			for (j = i + 1; j <= n; j++) {
2374 				if (memnode_info[j].exists) {
2375 					found = B_TRUE;
2376 					break;
2377 				}
2378 			}
2379 
2380 			/*
2381 			 * Done if no more existing entries to compare against
2382 			 */
2383 			if (found == B_FALSE)
2384 				break;
2385 
2386 			if (memnode_info[i].start > memnode_info[j].start) {
2387 				memnode_phys_addr_map_t	save_addr;
2388 				node_domain_map_t	save_node;
2389 
2390 				/*
2391 				 * Swap node to proxmity domain ID assignments
2392 				 */
2393 				bcopy(&node_domain[i], &save_node,
2394 				    sizeof (node_domain_map_t));
2395 				bcopy(&node_domain[j], &node_domain[i],
2396 				    sizeof (node_domain_map_t));
2397 				bcopy(&save_node, &node_domain[j],
2398 				    sizeof (node_domain_map_t));
2399 
2400 				/*
2401 				 * Swap node to physical memory assignments
2402 				 */
2403 				bcopy(&memnode_info[i], &save_addr,
2404 				    sizeof (memnode_phys_addr_map_t));
2405 				bcopy(&memnode_info[j], &memnode_info[i],
2406 				    sizeof (memnode_phys_addr_map_t));
2407 				bcopy(&save_addr, &memnode_info[j],
2408 				    sizeof (memnode_phys_addr_map_t));
2409 				swapped = B_TRUE;
2410 			}
2411 		}
2412 	} while (swapped == B_TRUE);
2413 
2414 	/*
2415 	 * Check to make sure that CPUs assigned to correct node IDs now since
2416 	 * node to proximity domain ID assignments may have been changed above
2417 	 */
2418 	if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
2419 		return;
2420 	for (i = 0; i < cpu_count; i++) {
2421 		int		node;
2422 
2423 		node = lgrp_plat_domain_to_node(node_domain, node_cnt,
2424 		    cpu_node[i].prox_domain);
2425 		if (cpu_node[i].node != node)
2426 			cpu_node[i].node = node;
2427 	}
2428 
2429 }
2430 
2431 
2432 /*
2433  * Return time needed to probe from current CPU to memory in given node
2434  */
2435 static hrtime_t
2436 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
2437     lgrp_plat_probe_mem_config_t *probe_mem_config,
2438     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2439 {
2440 	caddr_t			buf;
2441 	hrtime_t		elapsed;
2442 	hrtime_t		end;
2443 	int			from;
2444 	int			i;
2445 	int			ipl;
2446 	hrtime_t		max;
2447 	hrtime_t		min;
2448 	hrtime_t		start;
2449 	extern int		use_sse_pagecopy;
2450 
2451 	/*
2452 	 * Determine ID of node containing current CPU
2453 	 */
2454 	from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
2455 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
2456 
2457 	/*
2458 	 * Do common work for probing main memory
2459 	 */
2460 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
2461 		/*
2462 		 * Skip probing any nodes without memory and
2463 		 * set probe time to 0
2464 		 */
2465 		if (probe_mem_config->probe_va[to] == NULL) {
2466 			lat_stats->latencies[from][to] = 0;
2467 			return (0);
2468 		}
2469 
2470 		/*
2471 		 * Invalidate caches once instead of once every sample
2472 		 * which should cut cost of probing by a lot
2473 		 */
2474 		probe_stats->flush_cost = gethrtime();
2475 		invalidate_cache();
2476 		probe_stats->flush_cost = gethrtime() -
2477 		    probe_stats->flush_cost;
2478 		probe_stats->probe_cost_total += probe_stats->flush_cost;
2479 	}
2480 
2481 	/*
2482 	 * Probe from current CPU to given memory using specified operation
2483 	 * and take specified number of samples
2484 	 */
2485 	max = 0;
2486 	min = -1;
2487 	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
2488 		probe_stats->probe_cost = gethrtime();
2489 
2490 		/*
2491 		 * Can't measure probe time if gethrtime() isn't working yet
2492 		 */
2493 		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
2494 			return (0);
2495 
2496 		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2497 			/*
2498 			 * Measure how long it takes to read vendor ID from
2499 			 * Northbridge
2500 			 */
2501 			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
2502 		} else {
2503 			/*
2504 			 * Measure how long it takes to copy page
2505 			 * on top of itself
2506 			 */
2507 			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
2508 
2509 			kpreempt_disable();
2510 			ipl = splhigh();
2511 			start = gethrtime();
2512 			if (use_sse_pagecopy)
2513 				hwblkpagecopy(buf, buf);
2514 			else
2515 				bcopy(buf, buf, PAGESIZE);
2516 			end = gethrtime();
2517 			elapsed = end - start;
2518 			splx(ipl);
2519 			kpreempt_enable();
2520 		}
2521 
2522 		probe_stats->probe_cost = gethrtime() -
2523 		    probe_stats->probe_cost;
2524 		probe_stats->probe_cost_total += probe_stats->probe_cost;
2525 
2526 		if (min == -1 || elapsed < min)
2527 			min = elapsed;
2528 		if (elapsed > max)
2529 			max = elapsed;
2530 	}
2531 
2532 	/*
2533 	 * Update minimum and maximum probe times between
2534 	 * these two nodes
2535 	 */
2536 	if (min < probe_stats->probe_min[from][to] ||
2537 	    probe_stats->probe_min[from][to] == 0)
2538 		probe_stats->probe_min[from][to] = min;
2539 
2540 	if (max > probe_stats->probe_max[from][to])
2541 		probe_stats->probe_max[from][to] = max;
2542 
2543 	return (min);
2544 }
2545 
2546 
2547 /*
2548  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
2549  * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
2550  * and return number of CPU APIC IDs.
2551  *
2552  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
2553  *       in in cpu_apicid_array boot property which is based on and follows
2554  *	 same ordering as processor list in ACPI MADT.  If the code in
2555  *	 usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
2556  *	 CPU IDs ever changes, then this code will need to change too....
2557  */
2558 static int
2559 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
2560 {
2561 	int	boot_prop_len;
2562 	char	*boot_prop_name = BP_CPU_APICID_ARRAY;
2563 	uint8_t	cpu_apicid_array[UINT8_MAX + 1];
2564 	int	i;
2565 	int	n;
2566 
2567 	/*
2568 	 * Check length of property value
2569 	 */
2570 	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
2571 	if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array))
2572 		return (-1);
2573 
2574 	/*
2575 	 * Calculate number of entries in array and return when the system is
2576 	 * not very interesting for NUMA. It's not interesting for NUMA if
2577 	 * system has only one CPU and doesn't support CPU hotplug.
2578 	 */
2579 	n = boot_prop_len / sizeof (uint8_t);
2580 	if (n == 1 && !plat_dr_support_cpu())
2581 		return (-2);
2582 
2583 	/*
2584 	 * Get CPU to APIC ID property value
2585 	 */
2586 	if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
2587 		return (-3);
2588 
2589 	/*
2590 	 * Just return number of CPU APIC IDs if CPU to node mapping table is
2591 	 * NULL
2592 	 */
2593 	if (cpu_node == NULL) {
2594 		if (plat_dr_support_cpu() && n >= boot_ncpus) {
2595 			return (boot_ncpus);
2596 		} else {
2597 			return (n);
2598 		}
2599 	}
2600 
2601 	/*
2602 	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
2603 	 */
2604 	for (i = 0; i < n; i++) {
2605 		/* Only add boot CPUs into the map if CPU DR is enabled. */
2606 		if (plat_dr_support_cpu() && i >= boot_ncpus)
2607 			break;
2608 		cpu_node[i].exists = 1;
2609 		cpu_node[i].apicid = cpu_apicid_array[i];
2610 		cpu_node[i].prox_domain = UINT32_MAX;
2611 		cpu_node[i].node = UINT_MAX;
2612 	}
2613 
2614 	/*
2615 	 * Return number of CPUs based on number of APIC IDs
2616 	 */
2617 	return (i);
2618 }
2619 
2620 
2621 /*
2622  * Read ACPI System Locality Information Table (SLIT) to determine how far each
2623  * NUMA node is from each other
2624  */
2625 static int
2626 lgrp_plat_process_slit(struct slit *tp,
2627     node_domain_map_t *node_domain, uint_t node_cnt,
2628     memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats)
2629 {
2630 	int		i;
2631 	int		j;
2632 	int		src;
2633 	int		dst;
2634 	int		localities;
2635 	hrtime_t	max;
2636 	hrtime_t	min;
2637 	int		retval;
2638 	uint8_t		*slit_entries;
2639 
2640 	if (tp == NULL || !lgrp_plat_slit_enable)
2641 		return (1);
2642 
2643 	if (lat_stats == NULL)
2644 		return (2);
2645 
2646 	localities = tp->number;
2647 
2648 	min = lat_stats->latency_min;
2649 	max = lat_stats->latency_max;
2650 
2651 	/*
2652 	 * Fill in latency matrix based on SLIT entries
2653 	 */
2654 	slit_entries = tp->entry;
2655 	for (i = 0; i < localities; i++) {
2656 		src = lgrp_plat_domain_to_node(node_domain,
2657 		    node_cnt, i);
2658 		if (src == -1)
2659 			continue;
2660 
2661 		for (j = 0; j < localities; j++) {
2662 			uint8_t	latency;
2663 
2664 			dst = lgrp_plat_domain_to_node(node_domain,
2665 			    node_cnt, j);
2666 			if (dst == -1)
2667 				continue;
2668 
2669 			latency = slit_entries[(i * localities) + j];
2670 			lat_stats->latencies[src][dst] = latency;
2671 			if (latency < min || min == -1)
2672 				min = latency;
2673 			if (latency > max)
2674 				max = latency;
2675 		}
2676 	}
2677 
2678 	/*
2679 	 * Verify that latencies/distances given in SLIT look reasonable
2680 	 */
2681 	retval = lgrp_plat_latency_verify(memnode_info, lat_stats);
2682 
2683 	if (retval) {
2684 		/*
2685 		 * Reinitialize (zero) latency table since SLIT doesn't look
2686 		 * right
2687 		 */
2688 		for (i = 0; i < localities; i++) {
2689 			for (j = 0; j < localities; j++)
2690 				lat_stats->latencies[i][j] = 0;
2691 		}
2692 	} else {
2693 		/*
2694 		 * Update min and max latencies seen since SLIT looks valid
2695 		 */
2696 		lat_stats->latency_min = min;
2697 		lat_stats->latency_max = max;
2698 	}
2699 
2700 	return (retval);
2701 }
2702 
2703 
2704 /*
2705  * Update lgrp latencies according to information returned by ACPI _SLI method.
2706  */
2707 static int
2708 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
2709     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
2710     lgrp_plat_latency_stats_t *lat_stats)
2711 {
2712 	int		i;
2713 	int		src, dst;
2714 	uint8_t		latency;
2715 	hrtime_t	max, min;
2716 
2717 	if (lat_stats == NULL || sli_info == NULL ||
2718 	    sli_cnt == 0 || domain_id >= sli_cnt)
2719 		return (-1);
2720 
2721 	src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id);
2722 	if (src == -1) {
2723 		src = lgrp_plat_node_domain_update(node_domain, node_cnt,
2724 		    domain_id);
2725 		if (src == -1)
2726 			return (-1);
2727 	}
2728 
2729 	/*
2730 	 * Don't update latency info if topology has been flattened to 2 levels.
2731 	 */
2732 	if (lgrp_plat_topo_flatten != 0) {
2733 		return (0);
2734 	}
2735 
2736 	/*
2737 	 * Latency information for proximity domain is ready.
2738 	 * TODO: support adjusting latency information at runtime.
2739 	 */
2740 	if (lat_stats->latencies[src][src] != 0) {
2741 		return (0);
2742 	}
2743 
2744 	/* Validate latency information. */
2745 	for (i = 0; i < sli_cnt; i++) {
2746 		if (i == domain_id) {
2747 			if (sli_info[i] != ACPI_SLIT_SELF_LATENCY ||
2748 			    sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) {
2749 				return (-1);
2750 			}
2751 		} else {
2752 			if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY ||
2753 			    sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY ||
2754 			    sli_info[i] != sli_info[sli_cnt + i]) {
2755 				return (-1);
2756 			}
2757 		}
2758 	}
2759 
2760 	min = lat_stats->latency_min;
2761 	max = lat_stats->latency_max;
2762 	for (i = 0; i < sli_cnt; i++) {
2763 		dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i);
2764 		if (dst == -1)
2765 			continue;
2766 
2767 		ASSERT(sli_info[i] == sli_info[sli_cnt + i]);
2768 
2769 		/* Update row in latencies matrix. */
2770 		latency = sli_info[i];
2771 		lat_stats->latencies[src][dst] = latency;
2772 		if (latency < min || min == -1)
2773 			min = latency;
2774 		if (latency > max)
2775 			max = latency;
2776 
2777 		/* Update column in latencies matrix. */
2778 		latency = sli_info[sli_cnt + i];
2779 		lat_stats->latencies[dst][src] = latency;
2780 		if (latency < min || min == -1)
2781 			min = latency;
2782 		if (latency > max)
2783 			max = latency;
2784 	}
2785 	lat_stats->latency_min = min;
2786 	lat_stats->latency_max = max;
2787 
2788 	return (0);
2789 }
2790 
2791 
2792 /*
2793  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
2794  * and memory are local to each other in the same NUMA node and return number
2795  * of nodes
2796  */
2797 static int
2798 lgrp_plat_process_srat(struct srat *tp, struct msct *mp,
2799     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
2800     cpu_node_map_t *cpu_node, int cpu_count,
2801     memnode_phys_addr_map_t *memnode_info)
2802 {
2803 	struct srat_item	*srat_end;
2804 	int			i;
2805 	struct srat_item	*item;
2806 	int			node_cnt;
2807 	int			proc_entry_count;
2808 	int			rc;
2809 
2810 	/*
2811 	 * Nothing to do when no SRAT or disabled
2812 	 */
2813 	if (tp == NULL || !lgrp_plat_srat_enable)
2814 		return (-1);
2815 
2816 	/*
2817 	 * Try to get domain information from MSCT table.
2818 	 * ACPI4.0: OSPM will use information provided by the MSCT only
2819 	 * when the System Resource Affinity Table (SRAT) exists.
2820 	 */
2821 	node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min);
2822 	if (node_cnt <= 0) {
2823 		/*
2824 		 * Determine number of nodes by counting number of proximity
2825 		 * domains in SRAT.
2826 		 */
2827 		node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
2828 	}
2829 	/*
2830 	 * Return if number of nodes is 1 or less since don't need to read SRAT.
2831 	 */
2832 	if (node_cnt == 1)
2833 		return (1);
2834 	else if (node_cnt <= 0)
2835 		return (-2);
2836 
2837 	/*
2838 	 * Walk through SRAT, examining each CPU and memory entry to determine
2839 	 * which CPUs and memory belong to which node.
2840 	 */
2841 	item = tp->list;
2842 	srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2843 	proc_entry_count = 0;
2844 	while (item < srat_end) {
2845 		uint32_t	apic_id;
2846 		uint32_t	domain;
2847 		uint64_t	end;
2848 		uint64_t	length;
2849 		uint64_t	start;
2850 
2851 		switch (item->type) {
2852 		case SRAT_PROCESSOR:	/* CPU entry */
2853 			if (!(item->i.p.flags & SRAT_ENABLED) ||
2854 			    cpu_node == NULL)
2855 				break;
2856 
2857 			/*
2858 			 * Calculate domain (node) ID and fill in APIC ID to
2859 			 * domain/node mapping table
2860 			 */
2861 			domain = item->i.p.domain1;
2862 			for (i = 0; i < 3; i++) {
2863 				domain += item->i.p.domain2[i] <<
2864 				    ((i + 1) * 8);
2865 			}
2866 			apic_id = item->i.p.apic_id;
2867 
2868 			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2869 			    cpu_node, cpu_count, apic_id, domain);
2870 			if (rc < 0)
2871 				return (-3);
2872 			else if (rc == 0)
2873 				proc_entry_count++;
2874 			break;
2875 
2876 		case SRAT_MEMORY:	/* memory entry */
2877 			if (!(item->i.m.flags & SRAT_ENABLED) ||
2878 			    memnode_info == NULL)
2879 				break;
2880 
2881 			/*
2882 			 * Get domain (node) ID and fill in domain/node
2883 			 * to memory mapping table
2884 			 */
2885 			domain = item->i.m.domain;
2886 			start = item->i.m.base_addr;
2887 			length = item->i.m.len;
2888 			end = start + length - 1;
2889 
2890 			/*
2891 			 * According to ACPI 4.0, both ENABLE and HOTPLUG flags
2892 			 * may be set for memory address range entries in SRAT
2893 			 * table which are reserved for memory hot plug.
2894 			 * We intersect memory address ranges in SRAT table
2895 			 * with memory ranges in physinstalled to filter out
2896 			 * memory address ranges reserved for hot plug.
2897 			 */
2898 			if (item->i.m.flags & SRAT_HOT_PLUG) {
2899 				uint64_t	rstart = UINT64_MAX;
2900 				uint64_t	rend = 0;
2901 				struct memlist	*ml;
2902 				extern struct bootops	*bootops;
2903 
2904 				memlist_read_lock();
2905 				for (ml = bootops->boot_mem->physinstalled;
2906 				    ml; ml = ml->ml_next) {
2907 					uint64_t tstart = ml->ml_address;
2908 					uint64_t tend;
2909 
2910 					tend = ml->ml_address + ml->ml_size;
2911 					if (tstart > end || tend < start)
2912 						continue;
2913 					if (start > tstart)
2914 						tstart = start;
2915 					if (rstart > tstart)
2916 						rstart = tstart;
2917 					if (end < tend)
2918 						tend = end;
2919 					if (rend < tend)
2920 						rend = tend;
2921 				}
2922 				memlist_read_unlock();
2923 				start = rstart;
2924 				end = rend;
2925 				/* Skip this entry if no memory installed. */
2926 				if (start > end)
2927 					break;
2928 			}
2929 
2930 			if (lgrp_plat_memnode_info_update(node_domain,
2931 			    node_cnt, memnode_info, node_cnt,
2932 			    start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0)
2933 				return (-4);
2934 			break;
2935 
2936 		case SRAT_X2APIC:	/* x2apic CPU entry */
2937 			if (!(item->i.xp.flags & SRAT_ENABLED) ||
2938 			    cpu_node == NULL)
2939 				break;
2940 
2941 			/*
2942 			 * Calculate domain (node) ID and fill in APIC ID to
2943 			 * domain/node mapping table
2944 			 */
2945 			domain = item->i.xp.domain;
2946 			apic_id = item->i.xp.x2apic_id;
2947 
2948 			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2949 			    cpu_node, cpu_count, apic_id, domain);
2950 			if (rc < 0)
2951 				return (-3);
2952 			else if (rc == 0)
2953 				proc_entry_count++;
2954 			break;
2955 
2956 		default:
2957 			break;
2958 		}
2959 
2960 		item = (struct srat_item *)((uintptr_t)item + item->len);
2961 	}
2962 
2963 	/*
2964 	 * Should have seen at least as many SRAT processor entries as CPUs
2965 	 */
2966 	if (proc_entry_count < cpu_count)
2967 		return (-5);
2968 
2969 	/*
2970 	 * Need to sort nodes by starting physical address since VM system
2971 	 * assumes and expects memnodes to be sorted in ascending order by
2972 	 * physical address
2973 	 */
2974 	lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
2975 	    memnode_info);
2976 
2977 	return (node_cnt);
2978 }
2979 
2980 
2981 /*
2982  * Allocate permanent memory for any temporary memory that we needed to
2983  * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
2984  * initialized and copy everything from temporary to permanent memory since
2985  * temporary boot memory will eventually be released during boot
2986  */
2987 static void
2988 lgrp_plat_release_bootstrap(void)
2989 {
2990 	void	*buf;
2991 	size_t	size;
2992 
2993 	if (lgrp_plat_cpu_node_nentries > 0) {
2994 		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
2995 		buf = kmem_alloc(size, KM_SLEEP);
2996 		bcopy(lgrp_plat_cpu_node, buf, size);
2997 		lgrp_plat_cpu_node = buf;
2998 	}
2999 }
3000 
3001 
3002 /*
3003  * Return number of proximity domains given in ACPI SRAT
3004  */
3005 static int
3006 lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min)
3007 {
3008 	int			domain_cnt;
3009 	uint32_t		domain_min;
3010 	struct srat_item	*end;
3011 	int			i;
3012 	struct srat_item	*item;
3013 	node_domain_map_t	node_domain[MAX_NODES];
3014 
3015 
3016 	if (tp == NULL || !lgrp_plat_srat_enable)
3017 		return (1);
3018 
3019 	/*
3020 	 * Walk through SRAT to find minimum proximity domain ID
3021 	 */
3022 	domain_min = UINT32_MAX;
3023 	item = tp->list;
3024 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
3025 	while (item < end) {
3026 		uint32_t	domain;
3027 
3028 		switch (item->type) {
3029 		case SRAT_PROCESSOR:	/* CPU entry */
3030 			if (!(item->i.p.flags & SRAT_ENABLED)) {
3031 				item = (struct srat_item *)((uintptr_t)item +
3032 				    item->len);
3033 				continue;
3034 			}
3035 			domain = item->i.p.domain1;
3036 			for (i = 0; i < 3; i++) {
3037 				domain += item->i.p.domain2[i] <<
3038 				    ((i + 1) * 8);
3039 			}
3040 			break;
3041 
3042 		case SRAT_MEMORY:	/* memory entry */
3043 			if (!(item->i.m.flags & SRAT_ENABLED)) {
3044 				item = (struct srat_item *)((uintptr_t)item +
3045 				    item->len);
3046 				continue;
3047 			}
3048 			domain = item->i.m.domain;
3049 			break;
3050 
3051 		case SRAT_X2APIC:	/* x2apic CPU entry */
3052 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
3053 				item = (struct srat_item *)((uintptr_t)item +
3054 				    item->len);
3055 				continue;
3056 			}
3057 			domain = item->i.xp.domain;
3058 			break;
3059 
3060 		default:
3061 			item = (struct srat_item *)((uintptr_t)item +
3062 			    item->len);
3063 			continue;
3064 		}
3065 
3066 		/*
3067 		 * Keep track of minimum proximity domain ID
3068 		 */
3069 		if (domain < domain_min)
3070 			domain_min = domain;
3071 
3072 		item = (struct srat_item *)((uintptr_t)item + item->len);
3073 	}
3074 	if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
3075 		*prox_domain_min = domain_min;
3076 
3077 	/*
3078 	 * Walk through SRAT, examining each CPU and memory entry to determine
3079 	 * proximity domain ID for each.
3080 	 */
3081 	domain_cnt = 0;
3082 	item = tp->list;
3083 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
3084 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
3085 	while (item < end) {
3086 		uint32_t	domain;
3087 		boolean_t	overflow;
3088 		uint_t		start;
3089 
3090 		switch (item->type) {
3091 		case SRAT_PROCESSOR:	/* CPU entry */
3092 			if (!(item->i.p.flags & SRAT_ENABLED)) {
3093 				item = (struct srat_item *)((uintptr_t)item +
3094 				    item->len);
3095 				continue;
3096 			}
3097 			domain = item->i.p.domain1;
3098 			for (i = 0; i < 3; i++) {
3099 				domain += item->i.p.domain2[i] <<
3100 				    ((i + 1) * 8);
3101 			}
3102 			break;
3103 
3104 		case SRAT_MEMORY:	/* memory entry */
3105 			if (!(item->i.m.flags & SRAT_ENABLED)) {
3106 				item = (struct srat_item *)((uintptr_t)item +
3107 				    item->len);
3108 				continue;
3109 			}
3110 			domain = item->i.m.domain;
3111 			break;
3112 
3113 		case SRAT_X2APIC:	/* x2apic CPU entry */
3114 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
3115 				item = (struct srat_item *)((uintptr_t)item +
3116 				    item->len);
3117 				continue;
3118 			}
3119 			domain = item->i.xp.domain;
3120 			break;
3121 
3122 		default:
3123 			item = (struct srat_item *)((uintptr_t)item +
3124 			    item->len);
3125 			continue;
3126 		}
3127 
3128 		/*
3129 		 * Count and keep track of which proximity domain IDs seen
3130 		 */
3131 		start = i = domain % MAX_NODES;
3132 		overflow = B_TRUE;
3133 		do {
3134 			/*
3135 			 * Create entry for proximity domain and increment
3136 			 * count when no entry exists where proximity domain
3137 			 * hashed
3138 			 */
3139 			if (!node_domain[i].exists) {
3140 				node_domain[i].exists = 1;
3141 				node_domain[i].prox_domain = domain;
3142 				domain_cnt++;
3143 				overflow = B_FALSE;
3144 				break;
3145 			}
3146 
3147 			/*
3148 			 * Nothing to do when proximity domain seen already
3149 			 * and its entry exists
3150 			 */
3151 			if (node_domain[i].prox_domain == domain) {
3152 				overflow = B_FALSE;
3153 				break;
3154 			}
3155 
3156 			/*
3157 			 * Entry exists where proximity domain hashed, but for
3158 			 * different proximity domain so keep search for empty
3159 			 * slot to put it or matching entry whichever comes
3160 			 * first.
3161 			 */
3162 			i = (i + 1) % MAX_NODES;
3163 		} while (i != start);
3164 
3165 		/*
3166 		 * Didn't find empty or matching entry which means have more
3167 		 * proximity domains than supported nodes (:-(
3168 		 */
3169 		ASSERT(overflow != B_TRUE);
3170 		if (overflow == B_TRUE)
3171 			return (-1);
3172 
3173 		item = (struct srat_item *)((uintptr_t)item + item->len);
3174 	}
3175 	return (domain_cnt);
3176 }
3177 
3178 
3179 /*
3180  * Parse domain information in ACPI Maximum System Capability Table (MSCT).
3181  * MSCT table has been verified in function process_msct() in fakebop.c.
3182  */
3183 static int
3184 lgrp_plat_msct_domains(struct msct *tp, uint32_t *prox_domain_min)
3185 {
3186 	int last_seen = 0;
3187 	uint32_t proxmin = UINT32_MAX;
3188 	struct msct_proximity_domain *item, *end;
3189 
3190 	if (tp == NULL || lgrp_plat_msct_enable == 0)
3191 		return (-1);
3192 
3193 	if (tp->maximum_proximity_domains >= MAX_NODES) {
3194 		cmn_err(CE_CONT,
3195 		    "?lgrp: too many proximity domains (%d), max %d supported, "
3196 		    "disable support of CPU/memory DR operations.",
3197 		    tp->maximum_proximity_domains + 1, MAX_NODES);
3198 		plat_dr_disable_cpu();
3199 		plat_dr_disable_memory();
3200 		return (-1);
3201 	}
3202 
3203 	if (prox_domain_min != NULL) {
3204 		end = (void *)(tp->hdr.len + (uintptr_t)tp);
3205 		for (item = (void *)((uintptr_t)tp +
3206 		    tp->proximity_domain_offset); item < end;
3207 		    item = (void *)(item->length + (uintptr_t)item)) {
3208 			if (item->domain_min < proxmin) {
3209 				proxmin = item->domain_min;
3210 			}
3211 
3212 			last_seen = item->domain_max - item->domain_min + 1;
3213 			/*
3214 			 * Break out if all proximity domains have been
3215 			 * processed. Some BIOSes may have unused items
3216 			 * at the end of MSCT table.
3217 			 */
3218 			if (last_seen > tp->maximum_proximity_domains) {
3219 				break;
3220 			}
3221 		}
3222 		*prox_domain_min = proxmin;
3223 	}
3224 
3225 	return (tp->maximum_proximity_domains + 1);
3226 }
3227 
3228 
3229 /*
3230  * Set lgroup latencies for 2 level lgroup topology
3231  */
3232 static void
3233 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
3234 {
3235 	int	i, j;
3236 
3237 	ASSERT(lat_stats != NULL);
3238 
3239 	if (lgrp_plat_node_cnt >= 4)
3240 		cmn_err(CE_NOTE,
3241 		    "MPO only optimizing for local and remote\n");
3242 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
3243 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
3244 			if (i == j)
3245 				lat_stats->latencies[i][j] = 2;
3246 			else
3247 				lat_stats->latencies[i][j] = 3;
3248 		}
3249 	}
3250 	lat_stats->latency_min = 2;
3251 	lat_stats->latency_max = 3;
3252 	/* TODO: check it. */
3253 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
3254 	lgrp_plat_topo_flatten = 1;
3255 }
3256 
3257 
3258 /*
3259  * The following Opteron specific constants, macros, types, and routines define
3260  * PCI configuration space registers and how to read them to determine the NUMA
3261  * configuration of *supported* Opteron processors.  They provide the same
3262  * information that may be gotten from the ACPI System Resource Affinity Table
3263  * (SRAT) if it exists on the machine of interest.
3264  *
3265  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
3266  * of interest describes all of these registers and their contents.  The main
3267  * registers used by this code to determine the NUMA configuration of the
3268  * machine are the node ID register for the number of NUMA nodes and the DRAM
3269  * address map registers for the physical address range of each node.
3270  *
3271  * NOTE: The format and how to determine the NUMA configuration using PCI
3272  *	 config space registers may change or may not be supported in future
3273  *	 Opteron processor families.
3274  */
3275 
/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 *
 * OPT_DRAMADDR_HI() and OPT_DRAMADDR_LO() each take the raw contents of the
 * high/low base or limit register and yield that register's contribution to
 * the 64-bit address; OPT_DRAMADDR() combines the two.  The register argument
 * is parenthesized so that the cast and the mask apply to the whole argument
 * even when it is an expression.
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
3314 
3315 
/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 *
 * OPT_NODE_CNT() yields the raw node count field.  The register argument is
 * parenthesized so that the mask applies to the whole argument even when it
 * is an expression.
 */
#define	OPT_NODE_CNT(reg) \
	(((reg) & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
3341 
3342 /*
3343  * Macro to setup PCI Extended Configuration Space (ECS) address to give to
3344  * "in/out" instructions
3345  *
3346  * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
3347  *	 other uses should just do MMIO to access PCI ECS.
3348  *	 Must enable special bit in Northbridge Configuration Register on
3349  *	 Greyhound for extended CF8 space access to be able to access PCI ECS
3350  *	 using "in/out" instructions and restore special bit after done
3351  *	 accessing PCI ECS.
3352  */
3353 #define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
3354 	(PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11)  | \
3355 	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
3356 	    ((((reg) >> 8) & 0xf) << 24))
3357 
/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 *
 * The two DRAM base offsets name the high and low halves of the first DRAM
 * base register.  opt_get_numa_config() adds 4 to get to the matching limit
 * register and another 4 to get to the base register of the next node.  The
 * high half lives above offset 0xff and is therefore read through an
 * extended configuration space address (see OPT_PCI_ECS_ADDR()).
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 *
 * Node N is addressed as device OPT_PCS_DEV_NODE0 + N.
 */
#define	OPT_PCS_DEV_NODE0		24	/* device number for node 0 */
3387 
3388 
/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 *
 * The fields hold raw register contents; OPT_DRAMADDR() turns a high/low
 * pair into an address.
 */
typedef	struct opt_dram_addr_map {
	uint32_t	base_hi;	/* high DRAM base register */
	uint32_t	base_lo;	/* low DRAM base register */
	uint32_t	limit_hi;	/* high DRAM limit register */
	uint32_t	limit_lo;	/* low DRAM limit register */
} opt_dram_addr_map_t;
3398 
3399 
/*
 * Supported AMD processor families (as returned by cpuid_getfamily())
 */
#define	AMD_FAMILY_HAMMER	15
#define	AMD_FAMILY_GREYHOUND	16

/*
 * Whether to have is_opteron() return 1 even when processor isn't supported
 * (the processor must still be from AMD for the override to apply)
 */
uint_t	is_opteron_override = 0;

/*
 * AMD processor family for current CPU; filled in by is_opteron() and
 * remains 0 until is_opteron() has seen an AMD processor
 */
uint_t	opt_family = 0;
3415 
3416 
3417 /*
3418  * Determine whether we're running on a supported AMD Opteron since reading
3419  * node count and DRAM address map registers may have different format or
3420  * may not be supported across processor families
3421  */
3422 static int
3423 is_opteron(void)
3424 {
3425 
3426 	if (x86_vendor != X86_VENDOR_AMD)
3427 		return (0);
3428 
3429 	opt_family = cpuid_getfamily(CPU);
3430 	if (opt_family == AMD_FAMILY_HAMMER ||
3431 	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
3432 		return (1);
3433 	else
3434 		return (0);
3435 }
3436 
3437 
/*
 * Determine NUMA configuration for Opteron from registers that live in PCI
 * configuration space
 *
 * node_cnt	set to the number of nodes (node count field of node 0's Node
 *		ID register plus 1), or to 1 if that would exceed MAX_NODES,
 *		in which case nothing else is read or filled in
 * mem_intrlv	if not NULL, incremented once for every node whose DRAM base
 *		register has any of the interleave enable bits set
 * memnode_info	array with room for at least *node_cnt entries; for each node
 *		"exists" is set, and "start"/"end" get the first and last page
 *		frame number of the node's memory, or (pfn_t)-1 for both when
 *		the node has no readable and writable memory
 *
 * Relies on opt_family, so is_opteron() is expected to have run already.
 */
static void
opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    memnode_phys_addr_map_t *memnode_info)
{
	uint_t				bus;
	uint_t				dev;
	/* raw register values; recorded here but not consumed any further */
	struct opt_dram_addr_map	dram_map[MAX_NODES];
	uint_t				node;
	/* Node ID register of each node; only entry 0 is evaluated */
	uint_t				node_info[MAX_NODES];
	uint_t				off_hi;
	uint_t				off_lo;
	/* only set and used on Greyhound */
	uint64_t			nb_cfg_reg;

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
	off_lo = OPT_PCS_OFF_DRAMBASE_LO;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;

	/*
	 * If number of nodes is more than maximum supported, then set node
	 * count to 1 and treat system as UMA instead of NUMA.
	 */
	if (*node_cnt > MAX_NODES) {
		*node_cnt = 1;
		return;
	}

	/*
	 * For Greyhound, PCI Extended Configuration Space must be enabled to
	 * read high DRAM address map base and limit registers
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG,
			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
	}

	/*
	 * One pass per node.  "dev" selects the node being queried and
	 * "off_hi"/"off_lo" select the DRAM base/limit register pair being
	 * read; all three are advanced at the end of each pass.
	 */
	for (node = 0; node < *node_cnt; node++) {
		uint32_t	base_hi;
		uint32_t	base_lo;
		uint32_t	limit_hi;
		uint32_t	limit_lo;

		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 *
		 * The high registers are only read on Greyhound, through
		 * the extended configuration space address/data ports;
		 * on other families the high halves are taken to be 0.
		 */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			base_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			base_hi = dram_map[node].base_hi =
			    inl(PCI_CONFDATA);
		}
		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Count nodes that have memory interleaving enabled, if the
		 * caller asked for it
		 */
		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
		    mem_intrlv)
			*mem_intrlv = *mem_intrlv + 1;

		off_hi += 4;	/* high limit register offset */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			limit_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			limit_hi = dram_map[node].limit_hi =
			    inl(PCI_CONFDATA);
		}

		off_lo += 4;	/* low limit register offset */
		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Increment device number to next node and register offsets
		 * for DRAM base register of next node
		 *
		 * This has to happen before the "continue" below so that
		 * nodes without memory are stepped over as well.
		 */
		off_hi += 4;
		off_lo += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in memnode_info[]
			 */
			memnode_info[node].exists = 0;
			memnode_info[node].start = memnode_info[node].end =
			    (pfn_t)-1;
			continue;
		}

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		memnode_info[node].exists = 1;

		memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));

		/*
		 * The limit register only supplies the upper address bits, so
		 * fill in the low bits to get the last byte of the range
		 */
		memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
		    OPT_DRAMADDR_LO_MASK_OFF);
	}

	/*
	 * Restore PCI Extended Configuration Space enable bit
	 * (only needed if it was off before and got turned on above)
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
	}
}
3587 
3588 
3589 /*
3590  * Return average amount of time to read vendor ID register on Northbridge
3591  * N times on specified destination node from current CPU
3592  */
3593 static hrtime_t
3594 opt_probe_vendor(int dest_node, int nreads)
3595 {
3596 	int		cnt;
3597 	uint_t		dev;
3598 	/* LINTED: set but not used in function */
3599 	volatile uint_t	dev_vendor;
3600 	hrtime_t	elapsed;
3601 	hrtime_t	end;
3602 	int		ipl;
3603 	hrtime_t	start;
3604 
3605 	dev = OPT_PCS_DEV_NODE0 + dest_node;
3606 	kpreempt_disable();
3607 	ipl = spl8();
3608 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
3609 	    OPT_PCS_OFF_VENDOR));
3610 	start = gethrtime();
3611 	for (cnt = 0; cnt < nreads; cnt++)
3612 		dev_vendor = inl(PCI_CONFDATA);
3613 	end = gethrtime();
3614 	elapsed = (end - start) / nreads;
3615 	splx(ipl);
3616 	kpreempt_enable();
3617 	return (elapsed);
3618 }
3619