xref: /illumos-gate/usr/src/uts/i86pc/os/lgrpplat.c (revision d0f40dc6a997c84bacf5f9ba83d57a95495c399b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /*
27  * Copyright (c) 2010, Intel Corporation.
28  * All rights reserved.
29  */
30 
31 /*
32  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
33  * ================================================================
34  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
35  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
36  * one or more CPUs and some local memory.  The CPUs in each node can access
37  * the memory in the other nodes but at a higher latency than accessing their
38  * local memory.  Typically, a system with only one node has Uniform Memory
39  * Access (UMA), but it may be possible to have a one node system that has
40  * some global memory outside of the node which is higher latency.
41  *
42  * Module Description
43  * ------------------
44  * This module provides a platform interface for determining which CPUs and
45  * which memory (and how much) are in a NUMA node and how far each node is from
46  * each other.  The interface is used by the Virtual Memory (VM) system and the
47  * common lgroup framework.  The VM system uses the plat_*() routines to fill
48  * in its memory node (memnode) array with the physical address range spanned
49  * by each NUMA node to know which memory belongs to which node, so it can
50  * build and manage a physical page free list for each NUMA node and allocate
51  * local memory from each node as needed.  The common lgroup framework uses the
52  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
53  * to each node (leaf lgroup) and how far each node is from each other, so it
54  * can build the latency (lgroup) topology for the machine in order to optimize
 * for locality.  Also, lgroup platform handles are used instead of lgroups
 * in the interface with this module, so this module shouldn't need to know
57  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
58  * etc. are in each NUMA node, how far each node is from each other, and to use
59  * a unique lgroup platform handle to refer to each node through the interface.
60  *
61  * Determining NUMA Configuration
62  * ------------------------------
63  * By default, this module will try to determine the NUMA configuration of the
64  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
65  * Locality Information Table (SLIT).  The SRAT contains info to tell which
66  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
67  * is a matrix that gives the distance between each system locality (which is
68  * a NUMA node and should correspond to proximity domains in the SRAT).  For
69  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
70  * specification.
71  *
72  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
73  * examine registers in PCI configuration space to determine how many nodes are
 * in the system and which CPUs and memory are in each node.  This examination
 * is done while booting the kernel.
76  *
77  * NOTE: Using these PCI configuration space registers to determine this
78  *       locality info is not guaranteed to work or be compatible across all
79  *	 Opteron processor families.
80  *
81  * If the SLIT does not exist or look right, the kernel will probe to determine
82  * the distance between nodes as long as the NUMA CPU and memory configuration
83  * has been determined (see lgrp_plat_probe() for details).
84  *
85  * Data Structures
86  * ---------------
87  * The main data structures used by this code are the following:
88  *
89  * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
90  *					CPU ID (only used for SRAT)
91  *
92  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
93  *					different nodes indexed by node ID
94  *
95  * - lgrp_plat_node_cnt			Number of NUMA nodes in system for
96  *					non-DR-capable systems,
97  *					maximum possible number of NUMA nodes
98  *					in system for DR capable systems.
99  *
100  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
101  *					table indexed by node ID (only used
102  *					for SRAT)
103  *
104  * - lgrp_plat_memnode_info[]		Table with physical address range for
105  *					each memory node indexed by memory node
106  *					ID
107  *
108  * The code is implemented to make the following always be true:
109  *
110  *	lgroup platform handle == node ID == memnode ID
111  *
112  * Moreover, it allows for the proximity domain ID to be equal to all of the
113  * above as long as the proximity domains IDs are numbered from 0 to <number of
114  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
115  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
116  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
117  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
118  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
119  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
120  * to node IDs.  However, the proximity domain IDs may not map to the
121  * equivalent node ID since we want to keep the node IDs numbered from 0 to
122  * <number of nodes - 1> to minimize cost of searching and potentially space.
123  *
124  * With the introduction of support of memory DR operations on x86 platforms,
125  * things get a little complicated. The addresses of hot-added memory may not
126  * be continuous with other memory connected to the same lgrp node. In other
127  * words, memory addresses may get interleaved among lgrp nodes after memory
128  * DR operations. To work around this limitation, we have extended the
129  * relationship between lgrp node and memory node from 1:1 map to 1:N map,
130  * that means there may be multiple memory nodes associated with a lgrp node
131  * after memory DR operations.
132  *
133  * To minimize the code changes to support memory DR operations, the
134  * following policies have been adopted.
135  * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
136  *    node ID and memnode ID is still kept as:
137  *	lgroup platform handle == node ID == memnode ID
138  * 2) For memory present at boot time on DR capable platforms, the relationship
139  *    is still kept as is.
140  *	lgroup platform handle == node ID == memnode ID
 * 3) For hot-added memory, the relationship between lgrp ID and memnode ID has
142  *    been changed from 1:1 map to 1:N map. Memnode IDs [0 - lgrp_plat_node_cnt)
143  *    are reserved for memory present at boot time, and memnode IDs
144  *    [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate
145  *    memnode ID for hot-added memory.
146  * 4) All boot code having the assumption "node ID == memnode ID" can live as
147  *    is, that's because node ID is always equal to memnode ID at boot time.
148  * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and
149  *    lgrp_plat_mem_size() related logics have been enhanced to deal with
150  *    the 1:N map relationship.
151  * 6) The latency probing related logics, which have the assumption
152  *    "node ID == memnode ID" and may be called at run time, is disabled if
153  *    memory DR operation is enabled.
154  */
155 
156 
157 #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
158 #include <sys/atomic.h>
159 #include <sys/bootconf.h>
160 #include <sys/cmn_err.h>
161 #include <sys/controlregs.h>
162 #include <sys/cpupart.h>
163 #include <sys/cpuvar.h>
164 #include <sys/lgrp.h>
165 #include <sys/machsystm.h>
166 #include <sys/memlist.h>
167 #include <sys/memnode.h>
168 #include <sys/mman.h>
169 #include <sys/note.h>
170 #include <sys/pci_cfgspace.h>
171 #include <sys/pci_impl.h>
172 #include <sys/param.h>
173 #include <sys/pghw.h>
174 #include <sys/promif.h>		/* for prom_printf() */
175 #include <sys/sysmacros.h>
176 #include <sys/systm.h>
177 #include <sys/thread.h>
178 #include <sys/types.h>
179 #include <sys/var.h>
180 #include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
181 #include <vm/hat_i86.h>
182 #include <vm/seg_kmem.h>
183 #include <vm/vm_dep.h>
184 
185 #include <sys/acpidev.h>
186 #include "acpi_fw.h"		/* for SRAT, SLIT and MSCT */
187 
188 
/*
 * MAX_NODES sizes the node-indexed tables in this file (for example
 * lgrp_plat_node_domain[] and the latency/probe matrices).
 * NLGRP sizes the statically allocated lgroup array (lgrp_space[]) and the
 * lgroup statistics array (lgrp_stats[]).
 */
#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing
 *
 * These are the initial values of the lgrp_plat_probe_nrounds,
 * lgrp_plat_probe_nsamples and lgrp_plat_probe_nreads variables.
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing (bit values, presumably combined in
 * lgrp_plat_probe_flags -- confirm against the probing code)
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
205 
/*
 * Hash proximity domain ID into node to domain mapping table "mod" number of
 * nodes to minimize span of entries used and try to have lowest numbered
 * proximity domain be node 0
 *
 * If no minimum proximity domain ID has been recorded yet
 * (lgrp_plat_prox_domain_min is still UINT32_MAX), the domain ID itself is
 * hashed; otherwise the offset of the domain ID from that minimum is hashed.
 *
 * Both arguments are fully parenthesized so that expressions such as
 * "cnt + 1" may safely be passed as node_cnt.
 */
#define	NODE_DOMAIN_HASH(domain, node_cnt) \
	((lgrp_plat_prox_domain_min == UINT32_MAX) ? \
	    (domain) % (node_cnt) : \
	    ((domain) - lgrp_plat_prox_domain_min) % (node_cnt))
214 
/*
 * CPU to node ID mapping structure (only used with SRAT)
 *
 * lgrp_plat_config() indexes the table of these entries
 * (lgrp_plat_cpu_node[]) by CPU ID.  When a CPU is deleted, it clears
 * "exists" and resets the other fields to UINT_MAX/UINT32_MAX.
 */
typedef	struct cpu_node_map {
	int		exists;		/* nonzero when entry is in use */
	uint_t		node;		/* node ID assigned to the CPU */
	uint32_t	apicid;		/* APIC ID of the CPU */
	uint32_t	prox_domain;	/* proximity domain ID of the CPU */
} cpu_node_map_t;
224 
/*
 * Latency statistics
 *
 * latencies[][] holds the latency between same and different nodes and is
 * indexed by node ID.
 */
typedef struct lgrp_plat_latency_stats {
	hrtime_t	latencies[MAX_NODES][MAX_NODES];
	/* NOTE(review): presumably largest latency in table -- confirm */
	hrtime_t	latency_max;
	/* NOTE(review): presumably smallest latency in table -- confirm */
	hrtime_t	latency_min;
} lgrp_plat_latency_stats_t;
233 
/*
 * Memory configuration for probing
 *
 * Both arrays have MAX_NODES entries (presumably one per node ID -- confirm
 * against the probing code).
 */
typedef struct lgrp_plat_probe_mem_config {
	size_t	probe_memsize;		/* how much memory to probe per node */
	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
} lgrp_plat_probe_mem_config_t;
242 
/*
 * Statistics kept for probing
 *
 * The two-dimensional members are MAX_NODES x MAX_NODES matrices.
 * NOTE(review): they are presumably indexed [from node][to node] like the
 * latency table, and the scalar members presumably record probing overhead
 * and the last error; none of the users are visible here -- confirm.
 */
typedef struct lgrp_plat_probe_stats {
	hrtime_t	flush_cost;
	hrtime_t	probe_cost;
	hrtime_t	probe_cost_total;
	hrtime_t	probe_error_code;
	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
	int		probe_suspect[MAX_NODES][MAX_NODES];
	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
} lgrp_plat_probe_stats_t;
256 
/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 *
 * Per the hotplug notes on lgrp_plat_config(), entries are only ever
 * inserted, and "exists" is set last (after membar_producer()) so readers
 * that check "exists" and then issue membar_consumer() see a valid
 * prox_domain.
 */
typedef	struct node_domain_map {
	int		exists;		/* nonzero when entry is in use */
	uint32_t	prox_domain;	/* proximity domain ID for this node */
} node_domain_map_t;
264 
/*
 * Node ID and starting and ending page for physical memory in memory node
 *
 * "start" and "end" are page frame numbers and the range is inclusive at
 * both ends (plat_pfn_to_mem_node() matches start <= pfn <= end).
 */
typedef	struct memnode_phys_addr_map {
	pfn_t		start;		/* first PFN in memnode */
	pfn_t		end;		/* last PFN in memnode (inclusive) */
	int		exists;		/* nonzero when entry is in use */
	/* NOTE(review): presumably SRAT proximity domain of memory -- confirm */
	uint32_t	prox_domain;
	/* NOTE(review): presumably ID of backing memory device -- confirm */
	uint32_t	device_id;
	/* value returned by plat_mem_node_to_lgrphand() for this memnode */
	uint_t		lgrphand;
} memnode_phys_addr_map_t;
276 
/*
 * Number of CPUs for which we got APIC IDs
 *
 * Also adjusted by lgrp_plat_config() when a CPU is added or deleted.
 */
static int				lgrp_plat_apic_ncpus = 0;

/*
 * CPU to node ID mapping table (only used for SRAT) and its max number of
 * entries
 *
 * The table is indexed by CPU ID (see lgrp_plat_config()).
 */
static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
static uint_t				lgrp_plat_cpu_node_nentries = 0;

/*
 * Latency statistics
 */
lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
static int				lgrp_plat_mem_intrlv = 0;

/*
 * Node ID to proximity domain ID mapping table (only used for SRAT)
 */
static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];

/*
 * Physical address range for memory in each node
 *
 * Indexed by memnode ID; ranges are stored as inclusive PFN ranges.
 */
static memnode_phys_addr_map_t		lgrp_plat_memnode_info[MAX_MEM_NODES];

/*
 * Statistics gotten from probing
 */
static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;

/*
 * Memory configuration for probing
 */
static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;

/*
 * Lowest proximity domain ID seen in ACPI SRAT
 *
 * UINT32_MAX means no minimum has been recorded; NODE_DOMAIN_HASH() then
 * hashes the raw proximity domain ID.
 */
static uint32_t				lgrp_plat_prox_domain_min = UINT32_MAX;

/*
 * Error code from processing ACPI SRAT
 */
static int				lgrp_plat_srat_error = 0;

/*
 * Error code from processing ACPI SLIT
 */
static int				lgrp_plat_slit_error = 0;

/*
 * Whether lgrp topology has been flattened to 2 levels.
 */
static int				lgrp_plat_topo_flatten = 0;


/*
 * Maximum memory node ID in use.
 *
 * NOTE: the code in this file uses this value as an exclusive upper bound
 * when scanning lgrp_plat_memnode_info[] (loops run while
 * "node < lgrp_plat_max_mem_node"), so every valid memnode ID is strictly
 * less than it.
 */
static uint_t				lgrp_plat_max_mem_node;

/*
 * Allocate lgroup array statically
 *
 * nlgrps_alloc is the index of the next lgrp_space[] entry that
 * lgrp_plat_alloc() will hand out.
 */
static lgrp_t				lgrp_space[NLGRP];
static int				nlgrps_alloc;
350 
351 
/*
 * Enable finding and using minimum proximity domain ID when hashing
 */
int			lgrp_plat_domain_min_enable = 1;

/*
 * Maximum possible number of nodes in system
 *
 * A value of 1 makes lgrp_plat_config() a no-op and makes
 * lgrp_plat_cpu_to_hand() return LGRP_DEFAULT_HANDLE.
 */
uint_t			lgrp_plat_node_cnt = 1;

/*
 * Enable sorting nodes in ascending order by starting physical address
 */
int			lgrp_plat_node_sort_enable = 1;

/*
 * Configuration Parameters for Probing
 * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
 *				operation, etc.
 * - lgrp_plat_probe_nrounds	How many rounds of probing to do
 * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
 *				node
 * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
 *				Northbridge for each probe
 */
uint_t			lgrp_plat_probe_flags = 0;
int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * Enable use of ACPI System Resource Affinity Table (SRAT), System
 * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT)
 */
int			lgrp_plat_srat_enable = 1;
int			lgrp_plat_slit_enable = 1;
int			lgrp_plat_msct_enable = 1;

/*
 * mnode_xwa: set to non-zero value to initiate workaround if large pages are
 * found to be crossing memory node boundaries. The workaround will eliminate
 * a base size page at the end of each memory node boundary to ensure that
 * a large page with constituent pages that span more than 1 memory node
 * can never be formed.
 *
 * Value protocol as implemented in this file:
 *	0	workaround disabled; plat_mnode_xcheck() returns 1 when it
 *		finds a crossing
 *	1	workaround enabled, no crossing found (so far)
 *	> 1	plat_mnode_xcheck() found at least one crossing (it increments
 *		the value per crossing); plat_build_mem_nodes() then drops the
 *		last page of a memnode whenever it has to split a range there
 */
int	mnode_xwa = 1;

/*
 * Static array to hold lgroup statistics
 */
struct lgrp_stats	lgrp_stats[NLGRP];
404 
405 
/*
 * Forward declarations of platform interface routines
 *
 * These plat_*() routines are the interface used by the VM system (see the
 * module description at the top of this file).
 */
void		plat_build_mem_nodes(struct memlist *list);

int		plat_mnode_xcheck(pfn_t pfncnt);

lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);

int		plat_pfn_to_mem_node(pfn_t pfn);

/*
 * Forward declarations of lgroup platform interface routines
 *
 * These lgrp_plat_*() routines are the interface used by the common lgroup
 * framework.
 */
lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);

void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);

lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);

void		lgrp_plat_init(lgrp_init_stages_t stage);

int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);

int		lgrp_plat_max_lgrps(void);

pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
    lgrp_mem_query_t query);

lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);

void		lgrp_plat_probe(void);

lgrp_handle_t	lgrp_plat_root_hand(void);
440 
441 
/*
 * Forward declarations of local routines
 *
 * All of these are file-local (static) helpers.  Most take the relevant
 * mapping tables (node_domain, cpu_node, memnode_info) and their sizes as
 * explicit arguments rather than referring to the globals directly.
 */
static int	is_opteron(void);

static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
    int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
    uint32_t domain);

static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
    int cpu_node_nentries);

static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
    int node_cnt, uint32_t domain);

static void	lgrp_plat_get_numa_config(void);

static void	lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
    lgrp_plat_latency_stats_t *lat_stats,
    lgrp_plat_probe_stats_t *probe_stats);

static int	lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
    lgrp_plat_latency_stats_t *lat_stats);

static void	lgrp_plat_main_init(void);

static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);

static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
    int node_cnt, uint32_t domain);

static int	lgrp_plat_memnode_info_update(node_domain_map_t *node_domain,
    int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt,
    uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id);

static void	lgrp_plat_node_sort(node_domain_map_t *node_domain,
    int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
    memnode_phys_addr_map_t *memnode_info);

static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
    int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
    lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);

static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);

static int	lgrp_plat_process_slit(struct slit *tp,
    node_domain_map_t *node_domain, uint_t node_cnt,
    memnode_phys_addr_map_t *memnode_info,
    lgrp_plat_latency_stats_t *lat_stats);

static int	lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info,
    uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
    lgrp_plat_latency_stats_t *lat_stats);

static int	lgrp_plat_process_srat(struct srat *tp, struct msct *mp,
    uint32_t *prox_domain_min, node_domain_map_t *node_domain,
    cpu_node_map_t *cpu_node, int cpu_count,
    memnode_phys_addr_map_t *memnode_info);

static void	lgrp_plat_release_bootstrap(void);

static int	lgrp_plat_srat_domains(struct srat *tp,
    uint32_t *prox_domain_min);

static int	lgrp_plat_msct_domains(struct msct *tp,
    uint32_t *prox_domain_min);

static void	lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats);

static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    memnode_phys_addr_map_t *memnode_info);

static hrtime_t	opt_probe_vendor(int dest_node, int nreads);
515 
516 
517 /*
518  * PLATFORM INTERFACE ROUTINES
519  */
520 
521 /*
522  * Configure memory nodes for machines with more than one node (ie NUMA)
523  */
524 void
525 plat_build_mem_nodes(struct memlist *list)
526 {
527 	pfn_t		cur_start;	/* start addr of subrange */
528 	pfn_t		cur_end;	/* end addr of subrange */
529 	pfn_t		start;		/* start addr of whole range */
530 	pfn_t		end;		/* end addr of whole range */
531 	pgcnt_t		endcnt;		/* pages to sacrifice */
532 
533 	/*
534 	 * Boot install lists are arranged <addr, len>, ...
535 	 */
536 	while (list) {
537 		int	node;
538 
539 		start = list->ml_address >> PAGESHIFT;
540 		end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;
541 
542 		if (start > physmax) {
543 			list = list->ml_next;
544 			continue;
545 		}
546 		if (end > physmax)
547 			end = physmax;
548 
549 		/*
550 		 * When there is only one memnode, just add memory to memnode
551 		 */
552 		if (max_mem_nodes == 1) {
553 			mem_node_add_slice(start, end);
554 			list = list->ml_next;
555 			continue;
556 		}
557 
558 		/*
559 		 * mem_node_add_slice() expects to get a memory range that
560 		 * is within one memnode, so need to split any memory range
561 		 * that spans multiple memnodes into subranges that are each
562 		 * contained within one memnode when feeding them to
563 		 * mem_node_add_slice()
564 		 */
565 		cur_start = start;
566 		do {
567 			node = plat_pfn_to_mem_node(cur_start);
568 
569 			/*
570 			 * Panic if DRAM address map registers or SRAT say
571 			 * memory in node doesn't exist or address from
572 			 * boot installed memory list entry isn't in this node.
573 			 * This shouldn't happen and rest of code can't deal
574 			 * with this if it does.
575 			 */
576 			if (node < 0 || node >= lgrp_plat_max_mem_node ||
577 			    !lgrp_plat_memnode_info[node].exists ||
578 			    cur_start < lgrp_plat_memnode_info[node].start ||
579 			    cur_start > lgrp_plat_memnode_info[node].end) {
580 				cmn_err(CE_PANIC, "Don't know which memnode "
581 				    "to add installed memory address 0x%lx\n",
582 				    cur_start);
583 			}
584 
585 			/*
586 			 * End of current subrange should not span memnodes
587 			 */
588 			cur_end = end;
589 			endcnt = 0;
590 			if (lgrp_plat_memnode_info[node].exists &&
591 			    cur_end > lgrp_plat_memnode_info[node].end) {
592 				cur_end = lgrp_plat_memnode_info[node].end;
593 				if (mnode_xwa > 1) {
594 					/*
595 					 * sacrifice the last page in each
596 					 * node to eliminate large pages
597 					 * that span more than 1 memory node.
598 					 */
599 					endcnt = 1;
600 					physinstalled--;
601 				}
602 			}
603 
604 			mem_node_add_slice(cur_start, cur_end - endcnt);
605 
606 			/*
607 			 * Next subrange starts after end of current one
608 			 */
609 			cur_start = cur_end + 1;
610 		} while (cur_end < end);
611 
612 		list = list->ml_next;
613 	}
614 	mem_node_physalign = 0;
615 	mem_node_pfn_shift = 0;
616 }
617 
618 
619 /*
620  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
621  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
622  * a crossing is found and returns 0 otherwise.
623  */
624 int
625 plat_mnode_xcheck(pfn_t pfncnt)
626 {
627 	int	node, prevnode = -1, basenode;
628 	pfn_t	ea, sa;
629 
630 	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
631 
632 		if (lgrp_plat_memnode_info[node].exists == 0)
633 			continue;
634 
635 		if (prevnode == -1) {
636 			prevnode = node;
637 			basenode = node;
638 			continue;
639 		}
640 
641 		/* assume x86 node pfn ranges are in increasing order */
642 		ASSERT(lgrp_plat_memnode_info[node].start >
643 		    lgrp_plat_memnode_info[prevnode].end);
644 
645 		/*
646 		 * continue if the starting address of node is not contiguous
647 		 * with the previous node.
648 		 */
649 
650 		if (lgrp_plat_memnode_info[node].start !=
651 		    (lgrp_plat_memnode_info[prevnode].end + 1)) {
652 			basenode = node;
653 			prevnode = node;
654 			continue;
655 		}
656 
657 		/* check if the starting address of node is pfncnt aligned */
658 		if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) {
659 
660 			/*
661 			 * at this point, node starts at an unaligned boundary
662 			 * and is contiguous with the previous node(s) to
663 			 * basenode. Check if there is an aligned contiguous
664 			 * range of length pfncnt that crosses this boundary.
665 			 */
666 
667 			sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end,
668 			    pfncnt);
669 			ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start),
670 			    pfncnt);
671 
672 			ASSERT((ea - sa) == pfncnt);
673 			if (sa >= lgrp_plat_memnode_info[basenode].start &&
674 			    ea <= (lgrp_plat_memnode_info[node].end + 1)) {
675 				/*
676 				 * large page found to cross mnode boundary.
677 				 * Return Failure if workaround not enabled.
678 				 */
679 				if (mnode_xwa == 0)
680 					return (1);
681 				mnode_xwa++;
682 			}
683 		}
684 		prevnode = node;
685 	}
686 	return (0);
687 }
688 
689 
690 lgrp_handle_t
691 plat_mem_node_to_lgrphand(int mnode)
692 {
693 	if (max_mem_nodes == 1)
694 		return (LGRP_DEFAULT_HANDLE);
695 
696 	ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node);
697 
698 	return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand));
699 }
700 
701 int
702 plat_pfn_to_mem_node(pfn_t pfn)
703 {
704 	int	node;
705 
706 	if (max_mem_nodes == 1)
707 		return (0);
708 
709 	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
710 		/*
711 		 * Skip nodes with no memory
712 		 */
713 		if (!lgrp_plat_memnode_info[node].exists)
714 			continue;
715 
716 		membar_consumer();
717 		if (pfn >= lgrp_plat_memnode_info[node].start &&
718 		    pfn <= lgrp_plat_memnode_info[node].end)
719 			return (node);
720 	}
721 
722 	/*
723 	 * Didn't find memnode where this PFN lives which should never happen
724 	 */
725 	ASSERT(node < lgrp_plat_max_mem_node);
726 	return (-1);
727 }
728 
729 
730 /*
731  * LGROUP PLATFORM INTERFACE ROUTINES
732  */
733 
734 /*
735  * Allocate additional space for an lgroup.
736  */
737 lgrp_t *
738 lgrp_plat_alloc(lgrp_id_t lgrpid)
739 {
740 	lgrp_t *lgrp;
741 
742 	lgrp = &lgrp_space[nlgrps_alloc++];
743 	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
744 		return (NULL);
745 	return (lgrp);
746 }
747 
748 
749 /*
750  * Platform handling for (re)configuration changes
751  *
752  * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug:
753  * 1) Use cpu_lock to synchronize between lgrp_plat_config() and
754  *    lgrp_plat_cpu_to_hand().
755  * 2) Disable latency probing logic by making sure that the flag
756  *    LGRP_PLAT_PROBE_ENABLE is cleared.
757  *
758  * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug:
759  * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal.
760  * 2) Only expansion to existing entries, no shrinking.
761  * 3) On writing side, DR framework ensures that lgrp_plat_config() is called
762  *    in single-threaded context. And membar_producer() is used to ensure that
763  *    all changes are visible to other CPUs before setting the "exists" flag.
764  * 4) On reading side, membar_consumer() after checking the "exists" flag
765  *    ensures that right values are retrieved.
766  *
767  * Mechanism to protect lgrp_plat_node_domain[] at hotplug:
768  * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal.
769  * 2) On writing side, it's single-threaded and membar_producer() is used to
770  *    ensure all changes are visible to other CPUs before setting the "exists"
771  *    flag.
772  * 3) On reading side, membar_consumer() after checking the "exists" flag
773  *    ensures that right values are retrieved.
774  */
775 void
776 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
777 {
778 #ifdef	__xpv
779 	_NOTE(ARGUNUSED(flag, arg));
780 #else
781 	int	rc, node;
782 	cpu_t	*cp;
783 	void	*hdl = NULL;
784 	uchar_t	*sliptr = NULL;
785 	uint32_t domain, apicid, slicnt = 0;
786 	update_membounds_t *mp;
787 
788 	extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *,
789 	    uint32_t *, uint32_t *, uchar_t **);
790 	extern void acpidev_dr_free_cpu_numa_info(void *);
791 
792 	/*
793 	 * This interface is used to support CPU/memory DR operations.
794 	 * Don't bother here if it's still during boot or only one lgrp node
795 	 * is supported.
796 	 */
797 	if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1)
798 		return;
799 
800 	switch (flag) {
801 	case LGRP_CONFIG_CPU_ADD:
802 		cp = (cpu_t *)arg;
803 		ASSERT(cp != NULL);
804 		ASSERT(MUTEX_HELD(&cpu_lock));
805 
806 		/* Check whether CPU already exists. */
807 		ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists);
808 		if (lgrp_plat_cpu_node[cp->cpu_id].exists) {
809 			cmn_err(CE_WARN,
810 			    "!lgrp: CPU(%d) already exists in cpu_node map.",
811 			    cp->cpu_id);
812 			break;
813 		}
814 
815 		/* Query CPU lgrp information. */
816 		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
817 		    &slicnt, &sliptr);
818 		ASSERT(rc == 0);
819 		if (rc != 0) {
820 			cmn_err(CE_WARN,
821 			    "!lgrp: failed to query lgrp info for CPU(%d).",
822 			    cp->cpu_id);
823 			break;
824 		}
825 
826 		/* Update node to proximity domain mapping */
827 		node = lgrp_plat_domain_to_node(lgrp_plat_node_domain,
828 		    lgrp_plat_node_cnt, domain);
829 		if (node == -1) {
830 			node = lgrp_plat_node_domain_update(
831 			    lgrp_plat_node_domain, lgrp_plat_node_cnt, domain);
832 			ASSERT(node != -1);
833 			if (node == -1) {
834 				acpidev_dr_free_cpu_numa_info(hdl);
835 				cmn_err(CE_WARN, "!lgrp: failed to update "
836 				    "node_domain map for domain(%u).", domain);
837 				break;
838 			}
839 		}
840 
841 		/* Update latency information among lgrps. */
842 		if (slicnt != 0 && sliptr != NULL) {
843 			if (lgrp_plat_process_sli(domain, sliptr, slicnt,
844 			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
845 			    &lgrp_plat_lat_stats) != 0) {
846 				cmn_err(CE_WARN, "!lgrp: failed to update "
847 				    "latency information for domain (%u).",
848 				    domain);
849 			}
850 		}
851 
852 		/* Update CPU to node mapping. */
853 		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain;
854 		lgrp_plat_cpu_node[cp->cpu_id].node = node;
855 		lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid;
856 		lgrp_plat_cpu_node[cp->cpu_id].exists = 1;
857 		lgrp_plat_apic_ncpus++;
858 
859 		acpidev_dr_free_cpu_numa_info(hdl);
860 		break;
861 
862 	case LGRP_CONFIG_CPU_DEL:
863 		cp = (cpu_t *)arg;
864 		ASSERT(cp != NULL);
865 		ASSERT(MUTEX_HELD(&cpu_lock));
866 
867 		/* Check whether CPU exists. */
868 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists);
869 		if (!lgrp_plat_cpu_node[cp->cpu_id].exists) {
870 			cmn_err(CE_WARN,
871 			    "!lgrp: CPU(%d) doesn't exist in cpu_node map.",
872 			    cp->cpu_id);
873 			break;
874 		}
875 
876 		/* Query CPU lgrp information. */
877 		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
878 		    NULL, NULL);
879 		ASSERT(rc == 0);
880 		if (rc != 0) {
881 			cmn_err(CE_WARN,
882 			    "!lgrp: failed to query lgrp info for CPU(%d).",
883 			    cp->cpu_id);
884 			break;
885 		}
886 
887 		/* Update map. */
888 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid);
889 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain);
890 		lgrp_plat_cpu_node[cp->cpu_id].exists = 0;
891 		lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX;
892 		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX;
893 		lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX;
894 		lgrp_plat_apic_ncpus--;
895 
896 		acpidev_dr_free_cpu_numa_info(hdl);
897 		break;
898 
899 	case LGRP_CONFIG_MEM_ADD:
900 		mp = (update_membounds_t *)arg;
901 		ASSERT(mp != NULL);
902 
903 		/* Update latency information among lgrps. */
904 		if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) {
905 			if (lgrp_plat_process_sli(mp->u_domain,
906 			    mp->u_sli_ptr, mp->u_sli_cnt,
907 			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
908 			    &lgrp_plat_lat_stats) != 0) {
909 				cmn_err(CE_WARN, "!lgrp: failed to update "
910 				    "latency information for domain (%u).",
911 				    domain);
912 			}
913 		}
914 
915 		if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain,
916 		    lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes,
917 		    mp->u_base, mp->u_base + mp->u_length,
918 		    mp->u_domain, mp->u_device_id) < 0) {
919 			cmn_err(CE_WARN,
920 			    "!lgrp: failed to update latency  information for "
921 			    "memory (0x%" PRIx64 " - 0x%" PRIx64 ").",
922 			    mp->u_base, mp->u_base + mp->u_length);
923 		}
924 		break;
925 
926 	default:
927 		break;
928 	}
929 #endif	/* __xpv */
930 }
931 
932 
933 /*
934  * Return the platform handle for the lgroup containing the given CPU
935  */
936 lgrp_handle_t
937 lgrp_plat_cpu_to_hand(processorid_t id)
938 {
939 	lgrp_handle_t	hand;
940 
941 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
942 
943 	if (lgrp_plat_node_cnt == 1)
944 		return (LGRP_DEFAULT_HANDLE);
945 
946 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
947 	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
948 
949 	ASSERT(hand != (lgrp_handle_t)-1);
950 	if (hand == (lgrp_handle_t)-1)
951 		return (LGRP_NULL_HANDLE);
952 
953 	return (hand);
954 }
955 
956 
957 /*
958  * Platform-specific initialization of lgroups
959  */
960 void
961 lgrp_plat_init(lgrp_init_stages_t stage)
962 {
963 #if defined(__xpv)
964 #else	/* __xpv */
965 	u_longlong_t	value;
966 #endif	/* __xpv */
967 
968 	switch (stage) {
969 	case LGRP_INIT_STAGE1:
970 #if defined(__xpv)
971 		/*
972 		 * XXPV	For now, the hypervisor treats all memory equally.
973 		 */
974 		lgrp_plat_node_cnt = max_mem_nodes = 1;
975 #else	/* __xpv */
976 
977 		/*
978 		 * Get boot property for lgroup topology height limit
979 		 */
980 		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
981 			(void) lgrp_topo_ht_limit_set((int)value);
982 
983 		/*
984 		 * Get boot property for enabling/disabling SRAT
985 		 */
986 		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
987 			lgrp_plat_srat_enable = (int)value;
988 
989 		/*
990 		 * Get boot property for enabling/disabling SLIT
991 		 */
992 		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
993 			lgrp_plat_slit_enable = (int)value;
994 
995 		/*
996 		 * Get boot property for enabling/disabling MSCT
997 		 */
998 		if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0)
999 			lgrp_plat_msct_enable = (int)value;
1000 
1001 		/*
1002 		 * Initialize as a UMA machine
1003 		 */
1004 		if (lgrp_topo_ht_limit() == 1) {
1005 			lgrp_plat_node_cnt = max_mem_nodes = 1;
1006 			lgrp_plat_max_mem_node = 1;
1007 			return;
1008 		}
1009 
1010 		lgrp_plat_get_numa_config();
1011 
1012 		/*
1013 		 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes
1014 		 * to support memory DR operations if memory DR is enabled.
1015 		 */
1016 		lgrp_plat_max_mem_node = lgrp_plat_node_cnt;
1017 		if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) {
1018 			max_mem_nodes = MAX_MEM_NODES_PER_LGROUP *
1019 			    lgrp_plat_node_cnt;
1020 			ASSERT(max_mem_nodes <= MAX_MEM_NODES);
1021 		}
1022 #endif	/* __xpv */
1023 		break;
1024 
1025 	case LGRP_INIT_STAGE3:
1026 		lgrp_plat_probe();
1027 		lgrp_plat_release_bootstrap();
1028 		break;
1029 
1030 	case LGRP_INIT_STAGE4:
1031 		lgrp_plat_main_init();
1032 		break;
1033 
1034 	default:
1035 		break;
1036 	}
1037 }
1038 
1039 
1040 /*
1041  * Return latency between "from" and "to" lgroups
1042  *
1043  * This latency number can only be used for relative comparison
1044  * between lgroups on the running system, cannot be used across platforms,
1045  * and may not reflect the actual latency.  It is platform and implementation
1046  * specific, so platform gets to decide its value.  It would be nice if the
1047  * number was at least proportional to make comparisons more meaningful though.
1048  */
1049 int
1050 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
1051 {
1052 	lgrp_handle_t	src, dest;
1053 	int		node;
1054 
1055 	if (max_mem_nodes == 1)
1056 		return (0);
1057 
1058 	/*
1059 	 * Return max latency for root lgroup
1060 	 */
1061 	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
1062 		return (lgrp_plat_lat_stats.latency_max);
1063 
1064 	src = from;
1065 	dest = to;
1066 
1067 	/*
1068 	 * Return 0 for nodes (lgroup platform handles) out of range
1069 	 */
1070 	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
1071 		return (0);
1072 
1073 	/*
1074 	 * Probe from current CPU if its lgroup latencies haven't been set yet
1075 	 * and we are trying to get latency from current CPU to some node.
1076 	 * Avoid probing if CPU/memory DR is enabled.
1077 	 */
1078 	if (lgrp_plat_lat_stats.latencies[src][src] == 0) {
1079 		/*
1080 		 * Latency information should be updated by lgrp_plat_config()
1081 		 * for DR operations. Something is wrong if reaches here.
1082 		 * For safety, flatten lgrp topology to two levels.
1083 		 */
1084 		if (plat_dr_support_cpu() || plat_dr_support_memory()) {
1085 			ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
1086 			cmn_err(CE_WARN,
1087 			    "lgrp: failed to get latency information, "
1088 			    "fall back to two-level topology.");
1089 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1090 		} else {
1091 			node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1092 			    lgrp_plat_cpu_node_nentries);
1093 			ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
1094 			if (node == src)
1095 				lgrp_plat_probe();
1096 		}
1097 	}
1098 
1099 	return (lgrp_plat_lat_stats.latencies[src][dest]);
1100 }
1101 
1102 
1103 /*
1104  * Return the maximum number of lgrps supported by the platform.
1105  * Before lgrp topology is known it returns an estimate based on the number of
1106  * nodes. Once topology is known it returns:
1107  * 1) the actual maximim number of lgrps created if CPU/memory DR operations
1108  *    are not suppported.
1109  * 2) the maximum possible number of lgrps if CPU/memory DR operations are
1110  *    supported.
1111  */
1112 int
1113 lgrp_plat_max_lgrps(void)
1114 {
1115 	if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
1116 	    plat_dr_support_memory()) {
1117 		return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
1118 	} else {
1119 		return (lgrp_alloc_max + 1);
1120 	}
1121 }
1122 
1123 
/*
 * Add to the page count (_t) the number of memory pages of memory node (_n)
 * that are selected by the query type (_q).  Nothing is added when the memory
 * node doesn't exist or the query type isn't one of the three handled below.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely as the body of an if/else.
 */
#define	_LGRP_PLAT_MEM_SIZE(_n, _q, _t)					\
	do {								\
		if (mem_node_config[(_n)].exists) {			\
			switch (_q) {					\
			case LGRP_MEM_SIZE_FREE:			\
				(_t) += MNODE_PGCNT(_n);		\
				break;					\
			case LGRP_MEM_SIZE_AVAIL:			\
				(_t) += mem_node_memlist_pages((_n),	\
				    phys_avail);			\
				break;					\
			case LGRP_MEM_SIZE_INSTALL:			\
				(_t) += mem_node_memlist_pages((_n),	\
				    phys_install);			\
				break;					\
			default:					\
				break;					\
			}						\
		}							\
	} while (0)
1143 
1144 /*
1145  * Return the number of free pages in an lgroup.
1146  *
1147  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
1148  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
1149  * number of allocatable base pagesize pages corresponding to the
1150  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
1151  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
1152  * memory installed, regardless of whether or not it's usable.
1153  */
1154 pgcnt_t
1155 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
1156 {
1157 	int	mnode;
1158 	pgcnt_t npgs = (pgcnt_t)0;
1159 	extern struct memlist *phys_avail;
1160 	extern struct memlist *phys_install;
1161 
1162 
1163 	if (plathand == LGRP_DEFAULT_HANDLE)
1164 		return (lgrp_plat_mem_size_default(plathand, query));
1165 
1166 	if (plathand != LGRP_NULL_HANDLE) {
1167 		/* Count memory node present at boot. */
1168 		mnode = (int)plathand;
1169 		ASSERT(mnode < lgrp_plat_node_cnt);
1170 		_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1171 
1172 		/* Count possible hot-added memory nodes. */
1173 		for (mnode = lgrp_plat_node_cnt;
1174 		    mnode < lgrp_plat_max_mem_node; mnode++) {
1175 			if (lgrp_plat_memnode_info[mnode].lgrphand == plathand)
1176 				_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1177 		}
1178 	}
1179 
1180 	return (npgs);
1181 }
1182 
1183 
1184 /*
1185  * Return the platform handle of the lgroup that contains the physical memory
1186  * corresponding to the given page frame number
1187  */
1188 lgrp_handle_t
1189 lgrp_plat_pfn_to_hand(pfn_t pfn)
1190 {
1191 	int	mnode;
1192 
1193 	if (max_mem_nodes == 1)
1194 		return (LGRP_DEFAULT_HANDLE);
1195 
1196 	if (pfn > physmax)
1197 		return (LGRP_NULL_HANDLE);
1198 
1199 	mnode = plat_pfn_to_mem_node(pfn);
1200 	if (mnode < 0)
1201 		return (LGRP_NULL_HANDLE);
1202 
1203 	return (MEM_NODE_2_LGRPHAND(mnode));
1204 }
1205 
1206 
1207 /*
1208  * Probe memory in each node from current CPU to determine latency topology
1209  *
1210  * The probing code will probe the vendor ID register on the Northbridge of
1211  * Opteron processors and probe memory for other processors by default.
1212  *
1213  * Since probing is inherently error prone, the code takes laps across all the
1214  * nodes probing from each node to each of the other nodes some number of
1215  * times.  Furthermore, each node is probed some number of times before moving
1216  * onto the next one during each lap.  The minimum latency gotten between nodes
1217  * is kept as the latency between the nodes.
1218  *
1219  * After all that,  the probe times are adjusted by normalizing values that are
1220  * close to each other and local latencies are made the same.  Lastly, the
1221  * latencies are verified to make sure that certain conditions are met (eg.
1222  * local < remote, latency(a, b) == latency(b, a), etc.).
1223  *
1224  * If any of the conditions aren't met, the code will export a NUMA
1225  * configuration with the local CPUs and memory given by the SRAT or PCI config
1226  * space registers and one remote memory latency since it can't tell exactly
1227  * how far each node is from each other.
1228  */
1229 void
1230 lgrp_plat_probe(void)
1231 {
1232 	int				from;
1233 	int				i;
1234 	lgrp_plat_latency_stats_t	*lat_stats;
1235 	boolean_t			probed;
1236 	hrtime_t			probe_time;
1237 	int				to;
1238 
1239 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
1240 	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
1241 		return;
1242 
1243 	/* SRAT and SLIT should be enabled if DR operations are enabled. */
1244 	if (plat_dr_support_cpu() || plat_dr_support_memory())
1245 		return;
1246 
1247 	/*
1248 	 * Determine ID of node containing current CPU
1249 	 */
1250 	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1251 	    lgrp_plat_cpu_node_nentries);
1252 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
1253 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
1254 		ASSERT(lgrp_plat_node_domain[from].exists);
1255 
1256 	/*
1257 	 * Don't need to probe if got times already
1258 	 */
1259 	lat_stats = &lgrp_plat_lat_stats;
1260 	if (lat_stats->latencies[from][from] != 0)
1261 		return;
1262 
1263 	/*
1264 	 * Read vendor ID in Northbridge or read and write page(s)
1265 	 * in each node from current CPU and remember how long it takes,
1266 	 * so we can build latency topology of machine later.
1267 	 * This should approximate the memory latency between each node.
1268 	 */
1269 	probed = B_FALSE;
1270 	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
1271 		for (to = 0; to < lgrp_plat_node_cnt; to++) {
1272 			/*
1273 			 * Get probe time and skip over any nodes that can't be
1274 			 * probed yet or don't have memory
1275 			 */
1276 			probe_time = lgrp_plat_probe_time(to,
1277 			    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
1278 			    &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
1279 			    &lgrp_plat_probe_stats);
1280 			if (probe_time == 0)
1281 				continue;
1282 
1283 			probed = B_TRUE;
1284 
1285 			/*
1286 			 * Keep lowest probe time as latency between nodes
1287 			 */
1288 			if (lat_stats->latencies[from][to] == 0 ||
1289 			    probe_time < lat_stats->latencies[from][to])
1290 				lat_stats->latencies[from][to] = probe_time;
1291 
1292 			/*
1293 			 * Update overall minimum and maximum probe times
1294 			 * across all nodes
1295 			 */
1296 			if (probe_time < lat_stats->latency_min ||
1297 			    lat_stats->latency_min == -1)
1298 				lat_stats->latency_min = probe_time;
1299 			if (probe_time > lat_stats->latency_max)
1300 				lat_stats->latency_max = probe_time;
1301 		}
1302 	}
1303 
1304 	/*
1305 	 * Bail out if weren't able to probe any nodes from current CPU
1306 	 */
1307 	if (probed == B_FALSE)
1308 		return;
1309 
1310 	/*
1311 	 * - Fix up latencies such that local latencies are same,
1312 	 *   latency(i, j) == latency(j, i), etc. (if possible)
1313 	 *
1314 	 * - Verify that latencies look ok
1315 	 *
1316 	 * - Fallback to just optimizing for local and remote if
1317 	 *   latencies didn't look right
1318 	 */
1319 	lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats,
1320 	    &lgrp_plat_probe_stats);
1321 	lgrp_plat_probe_stats.probe_error_code =
1322 	    lgrp_plat_latency_verify(lgrp_plat_memnode_info,
1323 	    &lgrp_plat_lat_stats);
1324 	if (lgrp_plat_probe_stats.probe_error_code)
1325 		lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1326 }
1327 
1328 
1329 /*
1330  * Return platform handle for root lgroup
1331  */
1332 lgrp_handle_t
1333 lgrp_plat_root_hand(void)
1334 {
1335 	return (LGRP_DEFAULT_HANDLE);
1336 }
1337 
1338 
1339 /*
1340  * INTERNAL ROUTINES
1341  */
1342 
1343 
1344 /*
1345  * Update CPU to node mapping for given CPU and proximity domain.
1346  * Return values:
1347  * 	- zero for success
1348  *	- positive numbers for warnings
1349  *	- negative numbers for errors
1350  */
1351 static int
1352 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
1353     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
1354 {
1355 	uint_t	i;
1356 	int	node;
1357 
1358 	/*
1359 	 * Get node number for proximity domain
1360 	 */
1361 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
1362 	if (node == -1) {
1363 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
1364 		    domain);
1365 		if (node == -1)
1366 			return (-1);
1367 	}
1368 
1369 	/*
1370 	 * Search for entry with given APIC ID and fill in its node and
1371 	 * proximity domain IDs (if they haven't been set already)
1372 	 */
1373 	for (i = 0; i < nentries; i++) {
1374 		/*
1375 		 * Skip nonexistent entries and ones without matching APIC ID
1376 		 */
1377 		if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
1378 			continue;
1379 
1380 		/*
1381 		 * Just return if entry completely and correctly filled in
1382 		 * already
1383 		 */
1384 		if (cpu_node[i].prox_domain == domain &&
1385 		    cpu_node[i].node == node)
1386 			return (1);
1387 
1388 		/*
1389 		 * It's invalid to have more than one entry with the same
1390 		 * local APIC ID in SRAT table.
1391 		 */
1392 		if (cpu_node[i].node != UINT_MAX)
1393 			return (-2);
1394 
1395 		/*
1396 		 * Fill in node and proximity domain IDs
1397 		 */
1398 		cpu_node[i].prox_domain = domain;
1399 		cpu_node[i].node = node;
1400 
1401 		return (0);
1402 	}
1403 
1404 	/*
1405 	 * It's possible that an apicid doesn't exist in the cpu_node map due
1406 	 * to user limits number of CPUs powered on at boot by specifying the
1407 	 * boot_ncpus kernel option.
1408 	 */
1409 	return (2);
1410 }
1411 
1412 
1413 /*
1414  * Get node ID for given CPU
1415  */
1416 static int
1417 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
1418     int cpu_node_nentries)
1419 {
1420 	processorid_t	cpuid;
1421 
1422 	if (cp == NULL)
1423 		return (-1);
1424 
1425 	cpuid = cp->cpu_id;
1426 	if (cpuid < 0 || cpuid >= max_ncpus)
1427 		return (-1);
1428 
1429 	/*
1430 	 * SRAT doesn't exist, isn't enabled, or there was an error processing
1431 	 * it, so return node ID for Opteron and -1 otherwise.
1432 	 */
1433 	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
1434 	    lgrp_plat_srat_error) {
1435 		if (is_opteron())
1436 			return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
1437 		return (-1);
1438 	}
1439 
1440 	/*
1441 	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
1442 	 * CPU
1443 	 */
1444 	if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
1445 		return (-1);
1446 
1447 	return (cpu_node[cpuid].node);
1448 }
1449 
1450 
1451 /*
1452  * Return node number for given proximity domain/system locality
1453  */
1454 static int
1455 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
1456     uint32_t domain)
1457 {
1458 	uint_t	node;
1459 	uint_t	start;
1460 
1461 	/*
1462 	 * Hash proximity domain ID into node to domain mapping table (array),
1463 	 * search for entry with matching proximity domain ID, and return index
1464 	 * of matching entry as node ID.
1465 	 */
1466 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
1467 	do {
1468 		if (node_domain[node].exists) {
1469 			membar_consumer();
1470 			if (node_domain[node].prox_domain == domain)
1471 				return (node);
1472 		}
1473 		node = (node + 1) % node_cnt;
1474 	} while (node != start);
1475 	return (-1);
1476 }
1477 
1478 
1479 /*
1480  * Get NUMA configuration of machine
1481  */
1482 static void
1483 lgrp_plat_get_numa_config(void)
1484 {
1485 	uint_t		probe_op;
1486 
1487 	/*
1488 	 * Read boot property with CPU to APIC ID mapping table/array to
1489 	 * determine number of CPUs
1490 	 */
1491 	lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
1492 
1493 	/*
1494 	 * Determine which CPUs and memory are local to each other and number
1495 	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
1496 	 */
1497 	if (lgrp_plat_apic_ncpus > 0) {
1498 		int	retval;
1499 
1500 		/* Reserve enough resources if CPU DR is enabled. */
1501 		if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus)
1502 			lgrp_plat_cpu_node_nentries = max_ncpus;
1503 		else
1504 			lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
1505 
1506 		/*
1507 		 * Temporarily allocate boot memory to use for CPU to node
1508 		 * mapping since kernel memory allocator isn't alive yet
1509 		 */
1510 		lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
1511 		    NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t),
1512 		    sizeof (int));
1513 
1514 		ASSERT(lgrp_plat_cpu_node != NULL);
1515 		if (lgrp_plat_cpu_node) {
1516 			bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
1517 			    sizeof (cpu_node_map_t));
1518 		} else {
1519 			lgrp_plat_cpu_node_nentries = 0;
1520 		}
1521 
1522 		/*
1523 		 * Fill in CPU to node ID mapping table with APIC ID for each
1524 		 * CPU
1525 		 */
1526 		(void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
1527 
1528 		retval = lgrp_plat_process_srat(srat_ptr, msct_ptr,
1529 		    &lgrp_plat_prox_domain_min,
1530 		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
1531 		    lgrp_plat_apic_ncpus, lgrp_plat_memnode_info);
1532 		if (retval <= 0) {
1533 			lgrp_plat_srat_error = retval;
1534 			lgrp_plat_node_cnt = 1;
1535 		} else {
1536 			lgrp_plat_srat_error = 0;
1537 			lgrp_plat_node_cnt = retval;
1538 		}
1539 	}
1540 
1541 	/*
1542 	 * Try to use PCI config space registers on Opteron if there's an error
1543 	 * processing CPU to APIC ID mapping or SRAT
1544 	 */
1545 	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
1546 	    is_opteron())
1547 		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
1548 		    lgrp_plat_memnode_info);
1549 
1550 	/*
1551 	 * Don't bother to setup system for multiple lgroups and only use one
1552 	 * memory node when memory is interleaved between any nodes or there is
1553 	 * only one NUMA node
1554 	 */
1555 	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
1556 		lgrp_plat_node_cnt = max_mem_nodes = 1;
1557 		(void) lgrp_topo_ht_limit_set(1);
1558 		return;
1559 	}
1560 
1561 	/*
1562 	 * Leaf lgroups on x86/x64 architectures contain one physical
1563 	 * processor chip. Tune lgrp_expand_proc_thresh and
1564 	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
1565 	 * things out aggressively.
1566 	 */
1567 	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
1568 	lgrp_expand_proc_diff = 0;
1569 
1570 	/*
1571 	 * There should be one memnode (physical page free list(s)) for
1572 	 * each node if memory DR is disabled.
1573 	 */
1574 	max_mem_nodes = lgrp_plat_node_cnt;
1575 
1576 	/*
1577 	 * Initialize min and max latency before reading SLIT or probing
1578 	 */
1579 	lgrp_plat_lat_stats.latency_min = -1;
1580 	lgrp_plat_lat_stats.latency_max = 0;
1581 
1582 	/*
1583 	 * Determine how far each NUMA node is from each other by
1584 	 * reading ACPI System Locality Information Table (SLIT) if it
1585 	 * exists
1586 	 */
1587 	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
1588 	    lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info,
1589 	    &lgrp_plat_lat_stats);
1590 
1591 	/*
1592 	 * Disable support of CPU/memory DR operations if multiple locality
1593 	 * domains exist in system and either of following is true.
1594 	 * 1) Failed to process SLIT table.
1595 	 * 2) Latency probing is enabled by user.
1596 	 */
1597 	if (lgrp_plat_node_cnt > 1 &&
1598 	    (plat_dr_support_cpu() || plat_dr_support_memory())) {
1599 		if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 ||
1600 		    !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 ||
1601 		    lgrp_plat_apic_ncpus <= 0) {
1602 			cmn_err(CE_CONT,
1603 			    "?lgrp: failed to process ACPI SRAT/SLIT table, "
1604 			    "disable support of CPU/memory DR operations.");
1605 			plat_dr_disable_cpu();
1606 			plat_dr_disable_memory();
1607 		} else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) {
1608 			cmn_err(CE_CONT,
1609 			    "?lgrp: latency probing enabled by user, "
1610 			    "disable support of CPU/memory DR operations.");
1611 			plat_dr_disable_cpu();
1612 			plat_dr_disable_memory();
1613 		}
1614 	}
1615 
1616 	/* Done if succeeded to process SLIT table. */
1617 	if (lgrp_plat_slit_error == 0)
1618 		return;
1619 
1620 	/*
1621 	 * Probe to determine latency between NUMA nodes when SLIT
1622 	 * doesn't exist or make sense
1623 	 */
1624 	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
1625 
1626 	/*
1627 	 * Specify whether to probe using vendor ID register or page copy
1628 	 * if hasn't been specified already or is overspecified
1629 	 */
1630 	probe_op = lgrp_plat_probe_flags &
1631 	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1632 
1633 	if (probe_op == 0 ||
1634 	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
1635 		lgrp_plat_probe_flags &=
1636 		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1637 		if (is_opteron())
1638 			lgrp_plat_probe_flags |=
1639 			    LGRP_PLAT_PROBE_VENDOR;
1640 		else
1641 			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
1642 	}
1643 
1644 	/*
1645 	 * Probing errors can mess up the lgroup topology and
1646 	 * force us fall back to a 2 level lgroup topology.
1647 	 * Here we bound how tall the lgroup topology can grow
1648 	 * in hopes of avoiding any anamolies in probing from
1649 	 * messing up the lgroup topology by limiting the
1650 	 * accuracy of the latency topology.
1651 	 *
1652 	 * Assume that nodes will at least be configured in a
1653 	 * ring, so limit height of lgroup topology to be less
1654 	 * than number of nodes on a system with 4 or more
1655 	 * nodes
1656 	 */
1657 	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
1658 	    lgrp_topo_ht_limit_default())
1659 		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
1660 }
1661 
1662 
/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

/*
 * Shift count that the latency adjustment code actually uses: the difference
 * between two latencies is within tolerance when it is no more than the
 * reference latency shifted right by this many bits.  Starts out as
 * LGRP_LAT_TOLERANCE_SHIFT.
 */
int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
1670 
1671 
1672 /*
1673  * Adjust latencies between nodes to be symmetric, normalize latencies between
1674  * any nodes that are within some tolerance to be same, and make local
1675  * latencies be same
1676  */
1677 static void
1678 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
1679     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1680 {
1681 	int				i;
1682 	int				j;
1683 	int				k;
1684 	int				l;
1685 	u_longlong_t			max;
1686 	u_longlong_t			min;
1687 	u_longlong_t			t;
1688 	u_longlong_t			t1;
1689 	u_longlong_t			t2;
1690 	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
1691 	int				lat_corrected[MAX_NODES][MAX_NODES];
1692 
1693 	/*
1694 	 * Nothing to do when this is an UMA machine or don't have args needed
1695 	 */
1696 	if (max_mem_nodes == 1)
1697 		return;
1698 
1699 	ASSERT(memnode_info != NULL && lat_stats != NULL &&
1700 	    probe_stats != NULL);
1701 
1702 	/*
1703 	 * Make sure that latencies are symmetric between any two nodes
1704 	 * (ie. latency(node0, node1) == latency(node1, node0))
1705 	 */
1706 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1707 		if (!memnode_info[i].exists)
1708 			continue;
1709 
1710 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1711 			if (!memnode_info[j].exists)
1712 				continue;
1713 
1714 			t1 = lat_stats->latencies[i][j];
1715 			t2 = lat_stats->latencies[j][i];
1716 
1717 			if (t1 == 0 || t2 == 0 || t1 == t2)
1718 				continue;
1719 
1720 			/*
1721 			 * Latencies should be same
1722 			 * - Use minimum of two latencies which should be same
1723 			 * - Track suspect probe times not within tolerance of
1724 			 *   min value
1725 			 * - Remember how much values are corrected by
1726 			 */
1727 			if (t1 > t2) {
1728 				t = t2;
1729 				probe_stats->probe_errors[i][j] += t1 - t2;
1730 				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
1731 					probe_stats->probe_suspect[i][j]++;
1732 					probe_stats->probe_suspect[j][i]++;
1733 				}
1734 			} else if (t2 > t1) {
1735 				t = t1;
1736 				probe_stats->probe_errors[j][i] += t2 - t1;
1737 				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
1738 					probe_stats->probe_suspect[i][j]++;
1739 					probe_stats->probe_suspect[j][i]++;
1740 				}
1741 			}
1742 
1743 			lat_stats->latencies[i][j] =
1744 			    lat_stats->latencies[j][i] = t;
1745 			lgrp_config(cflag, t1, t);
1746 			lgrp_config(cflag, t2, t);
1747 		}
1748 	}
1749 
1750 	/*
1751 	 * Keep track of which latencies get corrected
1752 	 */
1753 	for (i = 0; i < MAX_NODES; i++)
1754 		for (j = 0; j < MAX_NODES; j++)
1755 			lat_corrected[i][j] = 0;
1756 
1757 	/*
1758 	 * For every two nodes, see whether there is another pair of nodes which
1759 	 * are about the same distance apart and make the latencies be the same
1760 	 * if they are close enough together
1761 	 */
1762 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1763 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1764 			if (!memnode_info[j].exists)
1765 				continue;
1766 			/*
1767 			 * Pick one pair of nodes (i, j)
1768 			 * and get latency between them
1769 			 */
1770 			t1 = lat_stats->latencies[i][j];
1771 
1772 			/*
1773 			 * Skip this pair of nodes if there isn't a latency
1774 			 * for it yet
1775 			 */
1776 			if (t1 == 0)
1777 				continue;
1778 
1779 			for (k = 0; k < lgrp_plat_node_cnt; k++) {
1780 				for (l = 0; l < lgrp_plat_node_cnt; l++) {
1781 					if (!memnode_info[l].exists)
1782 						continue;
1783 					/*
1784 					 * Pick another pair of nodes (k, l)
1785 					 * not same as (i, j) and get latency
1786 					 * between them
1787 					 */
1788 					if (k == i && l == j)
1789 						continue;
1790 
1791 					t2 = lat_stats->latencies[k][l];
1792 
1793 					/*
1794 					 * Skip this pair of nodes if there
1795 					 * isn't a latency for it yet
1796 					 */
1797 
1798 					if (t2 == 0)
1799 						continue;
1800 
1801 					/*
1802 					 * Skip nodes (k, l) if they already
1803 					 * have same latency as (i, j) or
1804 					 * their latency isn't close enough to
1805 					 * be considered/made the same
1806 					 */
1807 					if (t1 == t2 || (t1 > t2 && t1 - t2 >
1808 					    t1 >> lgrp_plat_probe_lt_shift) ||
1809 					    (t2 > t1 && t2 - t1 >
1810 					    t2 >> lgrp_plat_probe_lt_shift))
1811 						continue;
1812 
1813 					/*
1814 					 * Make latency(i, j) same as
1815 					 * latency(k, l), try to use latency
1816 					 * that has been adjusted already to get
1817 					 * more consistency (if possible), and
1818 					 * remember which latencies were
1819 					 * adjusted for next time
1820 					 */
1821 					if (lat_corrected[i][j]) {
1822 						t = t1;
1823 						lgrp_config(cflag, t2, t);
1824 						t2 = t;
1825 					} else if (lat_corrected[k][l]) {
1826 						t = t2;
1827 						lgrp_config(cflag, t1, t);
1828 						t1 = t;
1829 					} else {
1830 						if (t1 > t2)
1831 							t = t2;
1832 						else
1833 							t = t1;
1834 						lgrp_config(cflag, t1, t);
1835 						lgrp_config(cflag, t2, t);
1836 						t1 = t2 = t;
1837 					}
1838 
1839 					lat_stats->latencies[i][j] =
1840 					    lat_stats->latencies[k][l] = t;
1841 
1842 					lat_corrected[i][j] =
1843 					    lat_corrected[k][l] = 1;
1844 				}
1845 			}
1846 		}
1847 	}
1848 
1849 	/*
1850 	 * Local latencies should be same
1851 	 * - Find min and max local latencies
1852 	 * - Make all local latencies be minimum
1853 	 */
1854 	min = -1;
1855 	max = 0;
1856 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1857 		if (!memnode_info[i].exists)
1858 			continue;
1859 		t = lat_stats->latencies[i][i];
1860 		if (t == 0)
1861 			continue;
1862 		if (min == -1 || t < min)
1863 			min = t;
1864 		if (t > max)
1865 			max = t;
1866 	}
1867 	if (min != max) {
1868 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1869 			int	local;
1870 
1871 			if (!memnode_info[i].exists)
1872 				continue;
1873 
1874 			local = lat_stats->latencies[i][i];
1875 			if (local == 0)
1876 				continue;
1877 
1878 			/*
1879 			 * Track suspect probe times that aren't within
1880 			 * tolerance of minimum local latency and how much
1881 			 * probe times are corrected by
1882 			 */
1883 			if (local - min > min >> lgrp_plat_probe_lt_shift)
1884 				probe_stats->probe_suspect[i][i]++;
1885 
1886 			probe_stats->probe_errors[i][i] += local - min;
1887 
1888 			/*
1889 			 * Make local latencies be minimum
1890 			 */
1891 			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
1892 			lat_stats->latencies[i][i] = min;
1893 		}
1894 	}
1895 
1896 	/*
1897 	 * Determine max probe time again since just adjusted latencies
1898 	 */
1899 	lat_stats->latency_max = 0;
1900 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1901 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1902 			if (!memnode_info[j].exists)
1903 				continue;
1904 			t = lat_stats->latencies[i][j];
1905 			if (t > lat_stats->latency_max)
1906 				lat_stats->latency_max = t;
1907 		}
1908 	}
1909 }
1910 
1911 
1912 /*
1913  * Verify following about latencies between nodes:
1914  *
1915  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
1916  * - Local latencies same
1917  * - Local < remote
1918  * - Number of latencies seen is reasonable
1919  * - Number of occurrences of a given latency should be more than 1
1920  *
1921  * Returns:
1922  *	0	Success
1923  *	-1	Not symmetric
1924  *	-2	Local latencies not same
1925  *	-3	Local >= remote
1926  */
1927 static int
1928 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
1929     lgrp_plat_latency_stats_t *lat_stats)
1930 {
1931 	int				i;
1932 	int				j;
1933 	u_longlong_t			t1;
1934 	u_longlong_t			t2;
1935 
1936 	ASSERT(memnode_info != NULL && lat_stats != NULL);
1937 
1938 	/*
1939 	 * Nothing to do when this is an UMA machine, lgroup topology is
1940 	 * limited to 2 levels, or there aren't any probe times yet
1941 	 */
1942 	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
1943 	    lat_stats->latencies[0][0] == 0)
1944 		return (0);
1945 
1946 	/*
1947 	 * Make sure that latencies are symmetric between any two nodes
1948 	 * (ie. latency(node0, node1) == latency(node1, node0))
1949 	 */
1950 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1951 		if (!memnode_info[i].exists)
1952 			continue;
1953 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1954 			if (!memnode_info[j].exists)
1955 				continue;
1956 			t1 = lat_stats->latencies[i][j];
1957 			t2 = lat_stats->latencies[j][i];
1958 
1959 			if (t1 == 0 || t2 == 0 || t1 == t2)
1960 				continue;
1961 
1962 			return (-1);
1963 		}
1964 	}
1965 
1966 	/*
1967 	 * Local latencies should be same
1968 	 */
1969 	t1 = lat_stats->latencies[0][0];
1970 	for (i = 1; i < lgrp_plat_node_cnt; i++) {
1971 		if (!memnode_info[i].exists)
1972 			continue;
1973 
1974 		t2 = lat_stats->latencies[i][i];
1975 		if (t2 == 0)
1976 			continue;
1977 
1978 		if (t1 == 0) {
1979 			t1 = t2;
1980 			continue;
1981 		}
1982 
1983 		if (t1 != t2)
1984 			return (-2);
1985 	}
1986 
1987 	/*
1988 	 * Local latencies should be less than remote
1989 	 */
1990 	if (t1) {
1991 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1992 			for (j = 0; j < lgrp_plat_node_cnt; j++) {
1993 				if (!memnode_info[j].exists)
1994 					continue;
1995 				t2 = lat_stats->latencies[i][j];
1996 				if (i == j || t2 == 0)
1997 					continue;
1998 
1999 				if (t1 >= t2)
2000 					return (-3);
2001 			}
2002 		}
2003 	}
2004 
2005 	return (0);
2006 }
2007 
2008 
2009 /*
2010  * Platform-specific initialization
2011  */
2012 static void
2013 lgrp_plat_main_init(void)
2014 {
2015 	int	curnode;
2016 	int	ht_limit;
2017 	int	i;
2018 
2019 	/*
2020 	 * Print a notice that MPO is disabled when memory is interleaved
2021 	 * across nodes....Would do this when it is discovered, but can't
2022 	 * because it happens way too early during boot....
2023 	 */
2024 	if (lgrp_plat_mem_intrlv)
2025 		cmn_err(CE_NOTE,
2026 		    "MPO disabled because memory is interleaved\n");
2027 
2028 	/*
2029 	 * Don't bother to do any probing if it is disabled, there is only one
2030 	 * node, or the height of the lgroup topology less than or equal to 2
2031 	 */
2032 	ht_limit = lgrp_topo_ht_limit();
2033 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2034 	    max_mem_nodes == 1 || ht_limit <= 2) {
2035 		/*
2036 		 * Setup lgroup latencies for 2 level lgroup topology
2037 		 * (ie. local and remote only) if they haven't been set yet
2038 		 */
2039 		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
2040 		    lgrp_plat_lat_stats.latency_max == 0)
2041 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
2042 		return;
2043 	}
2044 
2045 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2046 		/*
2047 		 * Should have been able to probe from CPU 0 when it was added
2048 		 * to lgroup hierarchy, but may not have been able to then
2049 		 * because it happens so early in boot that gethrtime() hasn't
2050 		 * been initialized.  (:-(
2051 		 */
2052 		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
2053 		    lgrp_plat_cpu_node_nentries);
2054 		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
2055 		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
2056 			lgrp_plat_probe();
2057 
2058 		return;
2059 	}
2060 
2061 	/*
2062 	 * When probing memory, use one page for every sample to determine
2063 	 * lgroup topology and taking multiple samples
2064 	 */
2065 	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
2066 		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
2067 		    lgrp_plat_probe_nsamples;
2068 
2069 	/*
2070 	 * Map memory in each node needed for probing to determine latency
2071 	 * topology
2072 	 */
2073 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2074 		int	mnode;
2075 
2076 		/*
2077 		 * Skip this node and leave its probe page NULL
2078 		 * if it doesn't have any memory
2079 		 */
2080 		mnode = i;
2081 		if (!mem_node_config[mnode].exists) {
2082 			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
2083 			continue;
2084 		}
2085 
2086 		/*
2087 		 * Allocate one kernel virtual page
2088 		 */
2089 		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
2090 		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
2091 		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
2092 			cmn_err(CE_WARN,
2093 			    "lgrp_plat_main_init: couldn't allocate memory");
2094 			return;
2095 		}
2096 
2097 		/*
2098 		 * Get PFN for first page in each node
2099 		 */
2100 		lgrp_plat_probe_mem_config.probe_pfn[i] =
2101 		    mem_node_config[mnode].physbase;
2102 
2103 		/*
2104 		 * Map virtual page to first page in node
2105 		 */
2106 		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
2107 		    lgrp_plat_probe_mem_config.probe_memsize,
2108 		    lgrp_plat_probe_mem_config.probe_pfn[i],
2109 		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
2110 		    HAT_LOAD_NOCONSIST);
2111 	}
2112 
2113 	/*
2114 	 * Probe from current CPU
2115 	 */
2116 	lgrp_plat_probe();
2117 }
2118 
2119 
2120 /*
2121  * Return the number of free, allocatable, or installed
2122  * pages in an lgroup
2123  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
2124  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
2125  */
2126 static pgcnt_t
2127 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
2128 {
2129 	_NOTE(ARGUNUSED(lgrphand));
2130 
2131 	struct memlist *mlist;
2132 	pgcnt_t npgs = 0;
2133 	extern struct memlist *phys_avail;
2134 	extern struct memlist *phys_install;
2135 
2136 	switch (query) {
2137 	case LGRP_MEM_SIZE_FREE:
2138 		return ((pgcnt_t)freemem);
2139 	case LGRP_MEM_SIZE_AVAIL:
2140 		memlist_read_lock();
2141 		for (mlist = phys_avail; mlist; mlist = mlist->ml_next)
2142 			npgs += btop(mlist->ml_size);
2143 		memlist_read_unlock();
2144 		return (npgs);
2145 	case LGRP_MEM_SIZE_INSTALL:
2146 		memlist_read_lock();
2147 		for (mlist = phys_install; mlist; mlist = mlist->ml_next)
2148 			npgs += btop(mlist->ml_size);
2149 		memlist_read_unlock();
2150 		return (npgs);
2151 	default:
2152 		return ((pgcnt_t)0);
2153 	}
2154 }
2155 
2156 
2157 /*
2158  * Update node to proximity domain mappings for given domain and return node ID
2159  */
2160 static int
2161 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
2162     uint32_t domain)
2163 {
2164 	uint_t	node;
2165 	uint_t	start;
2166 
2167 	/*
2168 	 * Hash proximity domain ID into node to domain mapping table (array)
2169 	 * and add entry for it into first non-existent or matching entry found
2170 	 */
2171 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
2172 	do {
2173 		/*
2174 		 * Entry doesn't exist yet, so create one for this proximity
2175 		 * domain and return node ID which is index into mapping table.
2176 		 */
2177 		if (!node_domain[node].exists) {
2178 			node_domain[node].prox_domain = domain;
2179 			membar_producer();
2180 			node_domain[node].exists = 1;
2181 			return (node);
2182 		}
2183 
2184 		/*
2185 		 * Entry exists for this proximity domain already, so just
2186 		 * return node ID (index into table).
2187 		 */
2188 		if (node_domain[node].prox_domain == domain)
2189 			return (node);
2190 		node = NODE_DOMAIN_HASH(node + 1, node_cnt);
2191 	} while (node != start);
2192 
2193 	/*
2194 	 * Ran out of supported number of entries which shouldn't happen....
2195 	 */
2196 	ASSERT(node != start);
2197 	return (-1);
2198 }
2199 
2200 /*
2201  * Update node memory information for given proximity domain with specified
2202  * starting and ending physical address range (and return positive numbers for
2203  * success and negative ones for errors)
2204  */
2205 static int
2206 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt,
2207     memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start,
2208     uint64_t end, uint32_t domain, uint32_t device_id)
2209 {
2210 	int	node, mnode;
2211 
2212 	/*
2213 	 * Get node number for proximity domain
2214 	 */
2215 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
2216 	if (node == -1) {
2217 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
2218 		    domain);
2219 		if (node == -1)
2220 			return (-1);
2221 	}
2222 
2223 	/*
2224 	 * This function is called during boot if device_id is
2225 	 * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for
2226 	 * memory DR operations.
2227 	 */
2228 	if (device_id != ACPI_MEMNODE_DEVID_BOOT) {
2229 		ASSERT(lgrp_plat_max_mem_node <= memnode_cnt);
2230 
2231 		for (mnode = lgrp_plat_node_cnt;
2232 		    mnode < lgrp_plat_max_mem_node; mnode++) {
2233 			if (memnode_info[mnode].exists &&
2234 			    memnode_info[mnode].prox_domain == domain &&
2235 			    memnode_info[mnode].device_id == device_id) {
2236 				if (btop(start) < memnode_info[mnode].start)
2237 					memnode_info[mnode].start = btop(start);
2238 				if (btop(end) > memnode_info[mnode].end)
2239 					memnode_info[mnode].end = btop(end);
2240 				return (1);
2241 			}
2242 		}
2243 
2244 		if (lgrp_plat_max_mem_node >= memnode_cnt) {
2245 			return (-3);
2246 		} else {
2247 			lgrp_plat_max_mem_node++;
2248 			memnode_info[mnode].start = btop(start);
2249 			memnode_info[mnode].end = btop(end);
2250 			memnode_info[mnode].prox_domain = domain;
2251 			memnode_info[mnode].device_id = device_id;
2252 			memnode_info[mnode].lgrphand = node;
2253 			membar_producer();
2254 			memnode_info[mnode].exists = 1;
2255 			return (0);
2256 		}
2257 	}
2258 
2259 	/*
2260 	 * Create entry in table for node if it doesn't exist
2261 	 */
2262 	ASSERT(node < memnode_cnt);
2263 	if (!memnode_info[node].exists) {
2264 		memnode_info[node].start = btop(start);
2265 		memnode_info[node].end = btop(end);
2266 		memnode_info[node].prox_domain = domain;
2267 		memnode_info[node].device_id = device_id;
2268 		memnode_info[node].lgrphand = node;
2269 		membar_producer();
2270 		memnode_info[node].exists = 1;
2271 		return (0);
2272 	}
2273 
2274 	/*
2275 	 * Entry already exists for this proximity domain
2276 	 *
2277 	 * There may be more than one SRAT memory entry for a domain, so we may
2278 	 * need to update existing start or end address for the node.
2279 	 */
2280 	if (memnode_info[node].prox_domain == domain) {
2281 		if (btop(start) < memnode_info[node].start)
2282 			memnode_info[node].start = btop(start);
2283 		if (btop(end) > memnode_info[node].end)
2284 			memnode_info[node].end = btop(end);
2285 		return (1);
2286 	}
2287 	return (-2);
2288 }
2289 
2290 
2291 /*
2292  * Have to sort nodes by starting physical address because plat_mnode_xcheck()
2293  * assumes and expects memnodes to be sorted in ascending order by physical
2294  * address.
2295  */
2296 static void
2297 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
2298     cpu_node_map_t *cpu_node, int cpu_count,
2299     memnode_phys_addr_map_t *memnode_info)
2300 {
2301 	boolean_t	found;
2302 	int		i;
2303 	int		j;
2304 	int		n;
2305 	boolean_t	sorted;
2306 	boolean_t	swapped;
2307 
2308 	if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
2309 	    node_domain == NULL || memnode_info == NULL)
2310 		return;
2311 
2312 	/*
2313 	 * Sorted already?
2314 	 */
2315 	sorted = B_TRUE;
2316 	for (i = 0; i < node_cnt - 1; i++) {
2317 		/*
2318 		 * Skip entries that don't exist
2319 		 */
2320 		if (!memnode_info[i].exists)
2321 			continue;
2322 
2323 		/*
2324 		 * Try to find next existing entry to compare against
2325 		 */
2326 		found = B_FALSE;
2327 		for (j = i + 1; j < node_cnt; j++) {
2328 			if (memnode_info[j].exists) {
2329 				found = B_TRUE;
2330 				break;
2331 			}
2332 		}
2333 
2334 		/*
2335 		 * Done if no more existing entries to compare against
2336 		 */
2337 		if (found == B_FALSE)
2338 			break;
2339 
2340 		/*
2341 		 * Not sorted if starting address of current entry is bigger
2342 		 * than starting address of next existing entry
2343 		 */
2344 		if (memnode_info[i].start > memnode_info[j].start) {
2345 			sorted = B_FALSE;
2346 			break;
2347 		}
2348 	}
2349 
2350 	/*
2351 	 * Don't need to sort if sorted already
2352 	 */
2353 	if (sorted == B_TRUE)
2354 		return;
2355 
2356 	/*
2357 	 * Just use bubble sort since number of nodes is small
2358 	 */
2359 	n = node_cnt;
2360 	do {
2361 		swapped = B_FALSE;
2362 		n--;
2363 		for (i = 0; i < n; i++) {
2364 			/*
2365 			 * Skip entries that don't exist
2366 			 */
2367 			if (!memnode_info[i].exists)
2368 				continue;
2369 
2370 			/*
2371 			 * Try to find next existing entry to compare against
2372 			 */
2373 			found = B_FALSE;
2374 			for (j = i + 1; j <= n; j++) {
2375 				if (memnode_info[j].exists) {
2376 					found = B_TRUE;
2377 					break;
2378 				}
2379 			}
2380 
2381 			/*
2382 			 * Done if no more existing entries to compare against
2383 			 */
2384 			if (found == B_FALSE)
2385 				break;
2386 
2387 			if (memnode_info[i].start > memnode_info[j].start) {
2388 				memnode_phys_addr_map_t	save_addr;
2389 				node_domain_map_t	save_node;
2390 
2391 				/*
2392 				 * Swap node to proxmity domain ID assignments
2393 				 */
2394 				bcopy(&node_domain[i], &save_node,
2395 				    sizeof (node_domain_map_t));
2396 				bcopy(&node_domain[j], &node_domain[i],
2397 				    sizeof (node_domain_map_t));
2398 				bcopy(&save_node, &node_domain[j],
2399 				    sizeof (node_domain_map_t));
2400 
2401 				/*
2402 				 * Swap node to physical memory assignments
2403 				 */
2404 				bcopy(&memnode_info[i], &save_addr,
2405 				    sizeof (memnode_phys_addr_map_t));
2406 				bcopy(&memnode_info[j], &memnode_info[i],
2407 				    sizeof (memnode_phys_addr_map_t));
2408 				bcopy(&save_addr, &memnode_info[j],
2409 				    sizeof (memnode_phys_addr_map_t));
2410 				swapped = B_TRUE;
2411 			}
2412 		}
2413 	} while (swapped == B_TRUE);
2414 
2415 	/*
2416 	 * Check to make sure that CPUs assigned to correct node IDs now since
2417 	 * node to proximity domain ID assignments may have been changed above
2418 	 */
2419 	if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
2420 		return;
2421 	for (i = 0; i < cpu_count; i++) {
2422 		int		node;
2423 
2424 		node = lgrp_plat_domain_to_node(node_domain, node_cnt,
2425 		    cpu_node[i].prox_domain);
2426 		if (cpu_node[i].node != node)
2427 			cpu_node[i].node = node;
2428 	}
2429 
2430 }
2431 
2432 
2433 /*
2434  * Return time needed to probe from current CPU to memory in given node
2435  */
2436 static hrtime_t
2437 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
2438     lgrp_plat_probe_mem_config_t *probe_mem_config,
2439     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2440 {
2441 	caddr_t			buf;
2442 	hrtime_t		elapsed;
2443 	hrtime_t		end;
2444 	int			from;
2445 	int			i;
2446 	int			ipl;
2447 	hrtime_t		max;
2448 	hrtime_t		min;
2449 	hrtime_t		start;
2450 	extern int		use_sse_pagecopy;
2451 
2452 	/*
2453 	 * Determine ID of node containing current CPU
2454 	 */
2455 	from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
2456 	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
2457 
2458 	/*
2459 	 * Do common work for probing main memory
2460 	 */
2461 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
2462 		/*
2463 		 * Skip probing any nodes without memory and
2464 		 * set probe time to 0
2465 		 */
2466 		if (probe_mem_config->probe_va[to] == NULL) {
2467 			lat_stats->latencies[from][to] = 0;
2468 			return (0);
2469 		}
2470 
2471 		/*
2472 		 * Invalidate caches once instead of once every sample
2473 		 * which should cut cost of probing by a lot
2474 		 */
2475 		probe_stats->flush_cost = gethrtime();
2476 		invalidate_cache();
2477 		probe_stats->flush_cost = gethrtime() -
2478 		    probe_stats->flush_cost;
2479 		probe_stats->probe_cost_total += probe_stats->flush_cost;
2480 	}
2481 
2482 	/*
2483 	 * Probe from current CPU to given memory using specified operation
2484 	 * and take specified number of samples
2485 	 */
2486 	max = 0;
2487 	min = -1;
2488 	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
2489 		probe_stats->probe_cost = gethrtime();
2490 
2491 		/*
2492 		 * Can't measure probe time if gethrtime() isn't working yet
2493 		 */
2494 		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
2495 			return (0);
2496 
2497 		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2498 			/*
2499 			 * Measure how long it takes to read vendor ID from
2500 			 * Northbridge
2501 			 */
2502 			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
2503 		} else {
2504 			/*
2505 			 * Measure how long it takes to copy page
2506 			 * on top of itself
2507 			 */
2508 			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
2509 
2510 			kpreempt_disable();
2511 			ipl = splhigh();
2512 			start = gethrtime();
2513 			if (use_sse_pagecopy)
2514 				hwblkpagecopy(buf, buf);
2515 			else
2516 				bcopy(buf, buf, PAGESIZE);
2517 			end = gethrtime();
2518 			elapsed = end - start;
2519 			splx(ipl);
2520 			kpreempt_enable();
2521 		}
2522 
2523 		probe_stats->probe_cost = gethrtime() -
2524 		    probe_stats->probe_cost;
2525 		probe_stats->probe_cost_total += probe_stats->probe_cost;
2526 
2527 		if (min == -1 || elapsed < min)
2528 			min = elapsed;
2529 		if (elapsed > max)
2530 			max = elapsed;
2531 	}
2532 
2533 	/*
2534 	 * Update minimum and maximum probe times between
2535 	 * these two nodes
2536 	 */
2537 	if (min < probe_stats->probe_min[from][to] ||
2538 	    probe_stats->probe_min[from][to] == 0)
2539 		probe_stats->probe_min[from][to] = min;
2540 
2541 	if (max > probe_stats->probe_max[from][to])
2542 		probe_stats->probe_max[from][to] = max;
2543 
2544 	return (min);
2545 }
2546 
2547 
2548 /*
2549  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
2550  * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
2551  * and return number of CPU APIC IDs.
2552  *
2553  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
2554  *       in in cpu_apicid_array boot property which is based on and follows
2555  *	 same ordering as processor list in ACPI MADT.  If the code in
2556  *	 usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
2557  *	 CPU IDs ever changes, then this code will need to change too....
2558  */
2559 static int
2560 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
2561 {
2562 	int	boot_prop_len;
2563 	char	*boot_prop_name = BP_CPU_APICID_ARRAY;
2564 	uint8_t	cpu_apicid_array[UINT8_MAX + 1];
2565 	int	i;
2566 	int	n;
2567 
2568 	/*
2569 	 * Check length of property value
2570 	 */
2571 	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
2572 	if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array))
2573 		return (-1);
2574 
2575 	/*
2576 	 * Calculate number of entries in array and return when the system is
2577 	 * not very interesting for NUMA. It's not interesting for NUMA if
2578 	 * system has only one CPU and doesn't support CPU hotplug.
2579 	 */
2580 	n = boot_prop_len / sizeof (uint8_t);
2581 	if (n == 1 && !plat_dr_support_cpu())
2582 		return (-2);
2583 
2584 	/*
2585 	 * Get CPU to APIC ID property value
2586 	 */
2587 	if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
2588 		return (-3);
2589 
2590 	/*
2591 	 * Just return number of CPU APIC IDs if CPU to node mapping table is
2592 	 * NULL
2593 	 */
2594 	if (cpu_node == NULL) {
2595 		if (plat_dr_support_cpu() && n >= boot_ncpus) {
2596 			return (boot_ncpus);
2597 		} else {
2598 			return (n);
2599 		}
2600 	}
2601 
2602 	/*
2603 	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
2604 	 */
2605 	for (i = 0; i < n; i++) {
2606 		/* Only add boot CPUs into the map if CPU DR is enabled. */
2607 		if (plat_dr_support_cpu() && i >= boot_ncpus)
2608 			break;
2609 		cpu_node[i].exists = 1;
2610 		cpu_node[i].apicid = cpu_apicid_array[i];
2611 		cpu_node[i].prox_domain = UINT32_MAX;
2612 		cpu_node[i].node = UINT_MAX;
2613 	}
2614 
2615 	/*
2616 	 * Return number of CPUs based on number of APIC IDs
2617 	 */
2618 	return (i);
2619 }
2620 
2621 
2622 /*
2623  * Read ACPI System Locality Information Table (SLIT) to determine how far each
2624  * NUMA node is from each other
2625  */
2626 static int
2627 lgrp_plat_process_slit(struct slit *tp,
2628     node_domain_map_t *node_domain, uint_t node_cnt,
2629     memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats)
2630 {
2631 	int		i;
2632 	int		j;
2633 	int		src;
2634 	int		dst;
2635 	int		localities;
2636 	hrtime_t	max;
2637 	hrtime_t	min;
2638 	int		retval;
2639 	uint8_t		*slit_entries;
2640 
2641 	if (tp == NULL || !lgrp_plat_slit_enable)
2642 		return (1);
2643 
2644 	if (lat_stats == NULL)
2645 		return (2);
2646 
2647 	localities = tp->number;
2648 
2649 	min = lat_stats->latency_min;
2650 	max = lat_stats->latency_max;
2651 
2652 	/*
2653 	 * Fill in latency matrix based on SLIT entries
2654 	 */
2655 	slit_entries = tp->entry;
2656 	for (i = 0; i < localities; i++) {
2657 		src = lgrp_plat_domain_to_node(node_domain,
2658 		    node_cnt, i);
2659 		if (src == -1)
2660 			continue;
2661 
2662 		for (j = 0; j < localities; j++) {
2663 			uint8_t	latency;
2664 
2665 			dst = lgrp_plat_domain_to_node(node_domain,
2666 			    node_cnt, j);
2667 			if (dst == -1)
2668 				continue;
2669 
2670 			latency = slit_entries[(i * localities) + j];
2671 			lat_stats->latencies[src][dst] = latency;
2672 			if (latency < min || min == -1)
2673 				min = latency;
2674 			if (latency > max)
2675 				max = latency;
2676 		}
2677 	}
2678 
2679 	/*
2680 	 * Verify that latencies/distances given in SLIT look reasonable
2681 	 */
2682 	retval = lgrp_plat_latency_verify(memnode_info, lat_stats);
2683 
2684 	if (retval) {
2685 		/*
2686 		 * Reinitialize (zero) latency table since SLIT doesn't look
2687 		 * right
2688 		 */
2689 		for (i = 0; i < localities; i++) {
2690 			for (j = 0; j < localities; j++)
2691 				lat_stats->latencies[i][j] = 0;
2692 		}
2693 	} else {
2694 		/*
2695 		 * Update min and max latencies seen since SLIT looks valid
2696 		 */
2697 		lat_stats->latency_min = min;
2698 		lat_stats->latency_max = max;
2699 	}
2700 
2701 	return (retval);
2702 }
2703 
2704 
2705 /*
2706  * Update lgrp latencies according to information returned by ACPI _SLI method.
2707  */
2708 static int
2709 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
2710     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
2711     lgrp_plat_latency_stats_t *lat_stats)
2712 {
2713 	int		i;
2714 	int		src, dst;
2715 	uint8_t		latency;
2716 	hrtime_t	max, min;
2717 
2718 	if (lat_stats == NULL || sli_info == NULL ||
2719 	    sli_cnt == 0 || domain_id >= sli_cnt)
2720 		return (-1);
2721 
2722 	src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id);
2723 	if (src == -1) {
2724 		src = lgrp_plat_node_domain_update(node_domain, node_cnt,
2725 		    domain_id);
2726 		if (src == -1)
2727 			return (-1);
2728 	}
2729 
2730 	/*
2731 	 * Don't update latency info if topology has been flattened to 2 levels.
2732 	 */
2733 	if (lgrp_plat_topo_flatten != 0) {
2734 		return (0);
2735 	}
2736 
2737 	/*
2738 	 * Latency information for proximity domain is ready.
2739 	 * TODO: support adjusting latency information at runtime.
2740 	 */
2741 	if (lat_stats->latencies[src][src] != 0) {
2742 		return (0);
2743 	}
2744 
2745 	/* Validate latency information. */
2746 	for (i = 0; i < sli_cnt; i++) {
2747 		if (i == domain_id) {
2748 			if (sli_info[i] != ACPI_SLIT_SELF_LATENCY ||
2749 			    sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) {
2750 				return (-1);
2751 			}
2752 		} else {
2753 			if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY ||
2754 			    sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY ||
2755 			    sli_info[i] != sli_info[sli_cnt + i]) {
2756 				return (-1);
2757 			}
2758 		}
2759 	}
2760 
2761 	min = lat_stats->latency_min;
2762 	max = lat_stats->latency_max;
2763 	for (i = 0; i < sli_cnt; i++) {
2764 		dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i);
2765 		if (dst == -1)
2766 			continue;
2767 
2768 		ASSERT(sli_info[i] == sli_info[sli_cnt + i]);
2769 
2770 		/* Update row in latencies matrix. */
2771 		latency = sli_info[i];
2772 		lat_stats->latencies[src][dst] = latency;
2773 		if (latency < min || min == -1)
2774 			min = latency;
2775 		if (latency > max)
2776 			max = latency;
2777 
2778 		/* Update column in latencies matrix. */
2779 		latency = sli_info[sli_cnt + i];
2780 		lat_stats->latencies[dst][src] = latency;
2781 		if (latency < min || min == -1)
2782 			min = latency;
2783 		if (latency > max)
2784 			max = latency;
2785 	}
2786 	lat_stats->latency_min = min;
2787 	lat_stats->latency_max = max;
2788 
2789 	return (0);
2790 }
2791 
2792 
2793 /*
2794  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
2795  * and memory are local to each other in the same NUMA node and return number
2796  * of nodes
2797  */
2798 static int
2799 lgrp_plat_process_srat(struct srat *tp, struct msct *mp,
2800     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
2801     cpu_node_map_t *cpu_node, int cpu_count,
2802     memnode_phys_addr_map_t *memnode_info)
2803 {
2804 	struct srat_item	*srat_end;
2805 	int			i;
2806 	struct srat_item	*item;
2807 	int			node_cnt;
2808 	int			proc_entry_count;
2809 	int			rc;
2810 
2811 	/*
2812 	 * Nothing to do when no SRAT or disabled
2813 	 */
2814 	if (tp == NULL || !lgrp_plat_srat_enable)
2815 		return (-1);
2816 
2817 	/*
2818 	 * Try to get domain information from MSCT table.
2819 	 * ACPI4.0: OSPM will use information provided by the MSCT only
2820 	 * when the System Resource Affinity Table (SRAT) exists.
2821 	 */
2822 	node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min);
2823 	if (node_cnt <= 0) {
2824 		/*
2825 		 * Determine number of nodes by counting number of proximity
2826 		 * domains in SRAT.
2827 		 */
2828 		node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
2829 	}
2830 	/*
2831 	 * Return if number of nodes is 1 or less since don't need to read SRAT.
2832 	 */
2833 	if (node_cnt == 1)
2834 		return (1);
2835 	else if (node_cnt <= 0)
2836 		return (-2);
2837 
2838 	/*
2839 	 * Walk through SRAT, examining each CPU and memory entry to determine
2840 	 * which CPUs and memory belong to which node.
2841 	 */
2842 	item = tp->list;
2843 	srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2844 	proc_entry_count = 0;
2845 	while (item < srat_end) {
2846 		uint32_t	apic_id;
2847 		uint32_t	domain;
2848 		uint64_t	end;
2849 		uint64_t	length;
2850 		uint64_t	start;
2851 
2852 		switch (item->type) {
2853 		case SRAT_PROCESSOR:	/* CPU entry */
2854 			if (!(item->i.p.flags & SRAT_ENABLED) ||
2855 			    cpu_node == NULL)
2856 				break;
2857 
2858 			/*
2859 			 * Calculate domain (node) ID and fill in APIC ID to
2860 			 * domain/node mapping table
2861 			 */
2862 			domain = item->i.p.domain1;
2863 			for (i = 0; i < 3; i++) {
2864 				domain += item->i.p.domain2[i] <<
2865 				    ((i + 1) * 8);
2866 			}
2867 			apic_id = item->i.p.apic_id;
2868 
2869 			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2870 			    cpu_node, cpu_count, apic_id, domain);
2871 			if (rc < 0)
2872 				return (-3);
2873 			else if (rc == 0)
2874 				proc_entry_count++;
2875 			break;
2876 
2877 		case SRAT_MEMORY:	/* memory entry */
2878 			if (!(item->i.m.flags & SRAT_ENABLED) ||
2879 			    memnode_info == NULL)
2880 				break;
2881 
2882 			/*
2883 			 * Get domain (node) ID and fill in domain/node
2884 			 * to memory mapping table
2885 			 */
2886 			domain = item->i.m.domain;
2887 			start = item->i.m.base_addr;
2888 			length = item->i.m.len;
2889 			end = start + length - 1;
2890 
2891 			/*
2892 			 * According to ACPI 4.0, both ENABLE and HOTPLUG flags
2893 			 * may be set for memory address range entries in SRAT
2894 			 * table which are reserved for memory hot plug.
2895 			 * We intersect memory address ranges in SRAT table
2896 			 * with memory ranges in physinstalled to filter out
2897 			 * memory address ranges reserved for hot plug.
2898 			 */
2899 			if (item->i.m.flags & SRAT_HOT_PLUG) {
2900 				uint64_t	rstart = UINT64_MAX;
2901 				uint64_t	rend = 0;
2902 				struct memlist	*ml;
2903 				extern struct bootops	*bootops;
2904 
2905 				memlist_read_lock();
2906 				for (ml = bootops->boot_mem->physinstalled;
2907 				    ml; ml = ml->ml_next) {
2908 					uint64_t tstart = ml->ml_address;
2909 					uint64_t tend;
2910 
2911 					tend = ml->ml_address + ml->ml_size;
2912 					if (tstart > end || tend < start)
2913 						continue;
2914 					if (start > tstart)
2915 						tstart = start;
2916 					if (rstart > tstart)
2917 						rstart = tstart;
2918 					if (end < tend)
2919 						tend = end;
2920 					if (rend < tend)
2921 						rend = tend;
2922 				}
2923 				memlist_read_unlock();
2924 				start = rstart;
2925 				end = rend;
2926 				/* Skip this entry if no memory installed. */
2927 				if (start > end)
2928 					break;
2929 			}
2930 
2931 			if (lgrp_plat_memnode_info_update(node_domain,
2932 			    node_cnt, memnode_info, node_cnt,
2933 			    start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0)
2934 				return (-4);
2935 			break;
2936 
2937 		case SRAT_X2APIC:	/* x2apic CPU entry */
2938 			if (!(item->i.xp.flags & SRAT_ENABLED) ||
2939 			    cpu_node == NULL)
2940 				break;
2941 
2942 			/*
2943 			 * Calculate domain (node) ID and fill in APIC ID to
2944 			 * domain/node mapping table
2945 			 */
2946 			domain = item->i.xp.domain;
2947 			apic_id = item->i.xp.x2apic_id;
2948 
2949 			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2950 			    cpu_node, cpu_count, apic_id, domain);
2951 			if (rc < 0)
2952 				return (-3);
2953 			else if (rc == 0)
2954 				proc_entry_count++;
2955 			break;
2956 
2957 		default:
2958 			break;
2959 		}
2960 
2961 		item = (struct srat_item *)((uintptr_t)item + item->len);
2962 	}
2963 
2964 	/*
2965 	 * Should have seen at least as many SRAT processor entries as CPUs
2966 	 */
2967 	if (proc_entry_count < cpu_count)
2968 		return (-5);
2969 
2970 	/*
2971 	 * Need to sort nodes by starting physical address since VM system
2972 	 * assumes and expects memnodes to be sorted in ascending order by
2973 	 * physical address
2974 	 */
2975 	lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
2976 	    memnode_info);
2977 
2978 	return (node_cnt);
2979 }
2980 
2981 
2982 /*
2983  * Allocate permanent memory for any temporary memory that we needed to
2984  * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
2985  * initialized and copy everything from temporary to permanent memory since
2986  * temporary boot memory will eventually be released during boot
2987  */
2988 static void
2989 lgrp_plat_release_bootstrap(void)
2990 {
2991 	void	*buf;
2992 	size_t	size;
2993 
2994 	if (lgrp_plat_cpu_node_nentries > 0) {
2995 		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
2996 		buf = kmem_alloc(size, KM_SLEEP);
2997 		bcopy(lgrp_plat_cpu_node, buf, size);
2998 		lgrp_plat_cpu_node = buf;
2999 	}
3000 }
3001 
3002 
3003 /*
3004  * Return number of proximity domains given in ACPI SRAT
3005  */
3006 static int
3007 lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min)
3008 {
3009 	int			domain_cnt;
3010 	uint32_t		domain_min;
3011 	struct srat_item	*end;
3012 	int			i;
3013 	struct srat_item	*item;
3014 	node_domain_map_t	node_domain[MAX_NODES];
3015 
3016 
3017 	if (tp == NULL || !lgrp_plat_srat_enable)
3018 		return (1);
3019 
3020 	/*
3021 	 * Walk through SRAT to find minimum proximity domain ID
3022 	 */
3023 	domain_min = UINT32_MAX;
3024 	item = tp->list;
3025 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
3026 	while (item < end) {
3027 		uint32_t	domain;
3028 
3029 		switch (item->type) {
3030 		case SRAT_PROCESSOR:	/* CPU entry */
3031 			if (!(item->i.p.flags & SRAT_ENABLED)) {
3032 				item = (struct srat_item *)((uintptr_t)item +
3033 				    item->len);
3034 				continue;
3035 			}
3036 			domain = item->i.p.domain1;
3037 			for (i = 0; i < 3; i++) {
3038 				domain += item->i.p.domain2[i] <<
3039 				    ((i + 1) * 8);
3040 			}
3041 			break;
3042 
3043 		case SRAT_MEMORY:	/* memory entry */
3044 			if (!(item->i.m.flags & SRAT_ENABLED)) {
3045 				item = (struct srat_item *)((uintptr_t)item +
3046 				    item->len);
3047 				continue;
3048 			}
3049 			domain = item->i.m.domain;
3050 			break;
3051 
3052 		case SRAT_X2APIC:	/* x2apic CPU entry */
3053 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
3054 				item = (struct srat_item *)((uintptr_t)item +
3055 				    item->len);
3056 				continue;
3057 			}
3058 			domain = item->i.xp.domain;
3059 			break;
3060 
3061 		default:
3062 			item = (struct srat_item *)((uintptr_t)item +
3063 			    item->len);
3064 			continue;
3065 		}
3066 
3067 		/*
3068 		 * Keep track of minimum proximity domain ID
3069 		 */
3070 		if (domain < domain_min)
3071 			domain_min = domain;
3072 
3073 		item = (struct srat_item *)((uintptr_t)item + item->len);
3074 	}
3075 	if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
3076 		*prox_domain_min = domain_min;
3077 
3078 	/*
3079 	 * Walk through SRAT, examining each CPU and memory entry to determine
3080 	 * proximity domain ID for each.
3081 	 */
3082 	domain_cnt = 0;
3083 	item = tp->list;
3084 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
3085 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
3086 	while (item < end) {
3087 		uint32_t	domain;
3088 		boolean_t	overflow;
3089 		uint_t		start;
3090 
3091 		switch (item->type) {
3092 		case SRAT_PROCESSOR:	/* CPU entry */
3093 			if (!(item->i.p.flags & SRAT_ENABLED)) {
3094 				item = (struct srat_item *)((uintptr_t)item +
3095 				    item->len);
3096 				continue;
3097 			}
3098 			domain = item->i.p.domain1;
3099 			for (i = 0; i < 3; i++) {
3100 				domain += item->i.p.domain2[i] <<
3101 				    ((i + 1) * 8);
3102 			}
3103 			break;
3104 
3105 		case SRAT_MEMORY:	/* memory entry */
3106 			if (!(item->i.m.flags & SRAT_ENABLED)) {
3107 				item = (struct srat_item *)((uintptr_t)item +
3108 				    item->len);
3109 				continue;
3110 			}
3111 			domain = item->i.m.domain;
3112 			break;
3113 
3114 		case SRAT_X2APIC:	/* x2apic CPU entry */
3115 			if (!(item->i.xp.flags & SRAT_ENABLED)) {
3116 				item = (struct srat_item *)((uintptr_t)item +
3117 				    item->len);
3118 				continue;
3119 			}
3120 			domain = item->i.xp.domain;
3121 			break;
3122 
3123 		default:
3124 			item = (struct srat_item *)((uintptr_t)item +
3125 			    item->len);
3126 			continue;
3127 		}
3128 
3129 		/*
3130 		 * Count and keep track of which proximity domain IDs seen
3131 		 */
3132 		start = i = domain % MAX_NODES;
3133 		overflow = B_TRUE;
3134 		do {
3135 			/*
3136 			 * Create entry for proximity domain and increment
3137 			 * count when no entry exists where proximity domain
3138 			 * hashed
3139 			 */
3140 			if (!node_domain[i].exists) {
3141 				node_domain[i].exists = 1;
3142 				node_domain[i].prox_domain = domain;
3143 				domain_cnt++;
3144 				overflow = B_FALSE;
3145 				break;
3146 			}
3147 
3148 			/*
3149 			 * Nothing to do when proximity domain seen already
3150 			 * and its entry exists
3151 			 */
3152 			if (node_domain[i].prox_domain == domain) {
3153 				overflow = B_FALSE;
3154 				break;
3155 			}
3156 
3157 			/*
3158 			 * Entry exists where proximity domain hashed, but for
3159 			 * different proximity domain so keep search for empty
3160 			 * slot to put it or matching entry whichever comes
3161 			 * first.
3162 			 */
3163 			i = (i + 1) % MAX_NODES;
3164 		} while (i != start);
3165 
3166 		/*
3167 		 * Didn't find empty or matching entry which means have more
3168 		 * proximity domains than supported nodes (:-(
3169 		 */
3170 		ASSERT(overflow != B_TRUE);
3171 		if (overflow == B_TRUE)
3172 			return (-1);
3173 
3174 		item = (struct srat_item *)((uintptr_t)item + item->len);
3175 	}
3176 	return (domain_cnt);
3177 }
3178 
3179 
3180 /*
3181  * Parse domain information in ACPI Maximum System Capability Table (MSCT).
3182  * MSCT table has been verified in function process_msct() in fakebop.c.
3183  */
3184 static int
3185 lgrp_plat_msct_domains(struct msct *tp, uint32_t *prox_domain_min)
3186 {
3187 	int last_seen = 0;
3188 	uint32_t proxmin = UINT32_MAX;
3189 	struct msct_proximity_domain *item, *end;
3190 
3191 	if (tp == NULL || lgrp_plat_msct_enable == 0)
3192 		return (-1);
3193 
3194 	if (tp->maximum_proximity_domains >= MAX_NODES) {
3195 		cmn_err(CE_CONT,
3196 		    "?lgrp: too many proximity domains (%d), max %d supported, "
3197 		    "disable support of CPU/memory DR operations.",
3198 		    tp->maximum_proximity_domains + 1, MAX_NODES);
3199 		plat_dr_disable_cpu();
3200 		plat_dr_disable_memory();
3201 		return (-1);
3202 	}
3203 
3204 	if (prox_domain_min != NULL) {
3205 		end = (void *)(tp->hdr.len + (uintptr_t)tp);
3206 		for (item = (void *)((uintptr_t)tp +
3207 		    tp->proximity_domain_offset); item < end;
3208 		    item = (void *)(item->length + (uintptr_t)item)) {
3209 			if (item->domain_min < proxmin) {
3210 				proxmin = item->domain_min;
3211 			}
3212 
3213 			last_seen = item->domain_max - item->domain_min + 1;
3214 			/*
3215 			 * Break out if all proximity domains have been
3216 			 * processed. Some BIOSes may have unused items
3217 			 * at the end of MSCT table.
3218 			 */
3219 			if (last_seen > tp->maximum_proximity_domains) {
3220 				break;
3221 			}
3222 		}
3223 		*prox_domain_min = proxmin;
3224 	}
3225 
3226 	return (tp->maximum_proximity_domains + 1);
3227 }
3228 
3229 
3230 /*
3231  * Set lgroup latencies for 2 level lgroup topology
3232  */
3233 static void
3234 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
3235 {
3236 	int	i, j;
3237 
3238 	ASSERT(lat_stats != NULL);
3239 
3240 	if (lgrp_plat_node_cnt >= 4)
3241 		cmn_err(CE_NOTE,
3242 		    "MPO only optimizing for local and remote\n");
3243 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
3244 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
3245 			if (i == j)
3246 				lat_stats->latencies[i][j] = 2;
3247 			else
3248 				lat_stats->latencies[i][j] = 3;
3249 		}
3250 	}
3251 	lat_stats->latency_min = 2;
3252 	lat_stats->latency_max = 3;
3253 	/* TODO: check it. */
3254 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
3255 	lgrp_plat_topo_flatten = 1;
3256 }
3257 
3258 
/*
 * The following Opteron specific constants, macros, types, and routines define
 * PCI configuration space registers and how to read them to determine the NUMA
 * configuration of *supported* Opteron processors.  They provide the same
 * information that may be gotten from the ACPI System Resource Affinity Table
 * (SRAT) if it exists on the machine of interest.
 *
 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
 * of interest describes all of these registers and their contents.  The main
 * registers used by this code to determine the NUMA configuration of the
 * machine are the node ID register for the number of NUMA nodes and the DRAM
 * address map registers for the physical address range of each node.
 *
 * NOTE: The format and how to determine the NUMA configuration using PCI
 *	 config space registers may change or may not be supported in future
 *	 Opteron processor families.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 *
 * Each macro argument is parenthesized before it is cast and masked so that
 * an argument which is itself an expression (e.g. "a | b") is converted and
 * masked as a whole instead of only its first operand.
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

/*
 * Combine the high and low register values into one physical address
 */
#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
3315 
3316 
/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 *
 * OPT_NODE_CNT() yields the raw node count field.  The argument is
 * parenthesized so that an expression argument is masked as a whole.
 */
#define	OPT_NODE_CNT(reg) \
	(((reg) & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
3342 
/*
 * Macro to setup PCI Extended Configuration Space (ECS) address to give to
 * "in/out" instructions
 *
 * The bus, device, and function numbers are masked to 8, 5, and 3 bits and
 * placed at bits 16, 11, and 8.  The low register bits (masked with 0xfc)
 * stay in place and register bits 11-8 are moved up to bits 27-24.  Every
 * argument is parenthesized before masking so that expression arguments are
 * masked as a whole.
 *
 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
 *	 other uses should just do MMIO to access PCI ECS.
 *	 Must enable special bit in Northbridge Configuration Register on
 *	 Greyhound for extended CF8 space access to be able to access PCI ECS
 *	 using "in/out" instructions and restore special bit after done
 *	 accessing PCI ECS.
 */
#define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
	(PCI_CONE | (((bus) & 0xff) << 16) | (((device) & 0x1f) << 11)  | \
	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
	    ((((reg) >> 8) & 0xf) << 24))
3358 
/*
 * A PCI configuration space register is addressed by bus, device, function,
 * and register offset.  The constants below give the values needed to reach
 * the Opteron K8 configuration registers that describe its node topology.
 */

/*
 * Bus on which the Hypertransport configuration space lives
 */
#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Device number of node 0 (opt_probe_vendor() addresses node N as
 * OPT_PCS_DEV_NODE0 + N)
 */
#define	OPT_PCS_DEV_NODE0		24	/* device number for node 0 */

/*
 * Function numbers, one per group of Opteron configuration registers
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * Register offsets within a function, in ascending order
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
3388 
3389 
/*
 * Raw contents of the Opteron DRAM address map registers for one node: the
 * high and low halves of the base register and of the limit register, which
 * together give the range of physical memory in the node
 */
struct opt_dram_addr_map {
	uint32_t	base_hi;	/* DRAM base register, high half */
	uint32_t	base_lo;	/* DRAM base register, low half */
	uint32_t	limit_hi;	/* DRAM limit register, high half */
	uint32_t	limit_lo;	/* DRAM limit register, low half */
};

typedef	struct opt_dram_addr_map	opt_dram_addr_map_t;
3399 
3400 
3401 /*
3402  * Supported AMD processor families
3403  */
3404 #define	AMD_FAMILY_HAMMER	15
3405 #define	AMD_FAMILY_GREYHOUND	16
3406 
3407 /*
3408  * Whether to have is_opteron() return 1 even when processor isn't supported
3409  */
3410 uint_t	is_opteron_override = 0;
3411 
3412 /*
3413  * AMD processor family for current CPU
3414  */
3415 uint_t	opt_family = 0;
3416 
3417 
3418 /*
3419  * Determine whether we're running on a supported AMD Opteron since reading
3420  * node count and DRAM address map registers may have different format or
3421  * may not be supported across processor families
3422  */
3423 static int
3424 is_opteron(void)
3425 {
3426 
3427 	if (x86_vendor != X86_VENDOR_AMD)
3428 		return (0);
3429 
3430 	opt_family = cpuid_getfamily(CPU);
3431 	if (opt_family == AMD_FAMILY_HAMMER ||
3432 	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
3433 		return (1);
3434 	else
3435 		return (0);
3436 }
3437 
3438 
3439 /*
3440  * Determine NUMA configuration for Opteron from registers that live in PCI
3441  * configuration space
3442  */
3443 static void
3444 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
3445     memnode_phys_addr_map_t *memnode_info)
3446 {
3447 	uint_t				bus;
3448 	uint_t				dev;
3449 	struct opt_dram_addr_map	dram_map[MAX_NODES];
3450 	uint_t				node;
3451 	uint_t				node_info[MAX_NODES];
3452 	uint_t				off_hi;
3453 	uint_t				off_lo;
3454 	uint64_t			nb_cfg_reg;
3455 
3456 	/*
3457 	 * Read configuration registers from PCI configuration space to
3458 	 * determine node information, which memory is in each node, etc.
3459 	 *
3460 	 * Write to PCI configuration space address register to specify
3461 	 * which configuration register to read and read/write PCI
3462 	 * configuration space data register to get/set contents
3463 	 */
3464 	bus = OPT_PCS_BUS_CONFIG;
3465 	dev = OPT_PCS_DEV_NODE0;
3466 	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
3467 	off_lo = OPT_PCS_OFF_DRAMBASE_LO;
3468 
3469 	/*
3470 	 * Read node ID register for node 0 to get node count
3471 	 */
3472 	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
3473 	    OPT_PCS_OFF_NODEID);
3474 	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
3475 
3476 	/*
3477 	 * If number of nodes is more than maximum supported, then set node
3478 	 * count to 1 and treat system as UMA instead of NUMA.
3479 	 */
3480 	if (*node_cnt > MAX_NODES) {
3481 		*node_cnt = 1;
3482 		return;
3483 	}
3484 
3485 	/*
3486 	 * For Greyhound, PCI Extended Configuration Space must be enabled to
3487 	 * read high DRAM address map base and limit registers
3488 	 */
3489 	if (opt_family == AMD_FAMILY_GREYHOUND) {
3490 		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
3491 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
3492 			wrmsr(MSR_AMD_NB_CFG,
3493 			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
3494 	}
3495 
3496 	for (node = 0; node < *node_cnt; node++) {
3497 		uint32_t	base_hi;
3498 		uint32_t	base_lo;
3499 		uint32_t	limit_hi;
3500 		uint32_t	limit_lo;
3501 
3502 		/*
3503 		 * Read node ID register (except for node 0 which we just read)
3504 		 */
3505 		if (node > 0) {
3506 			node_info[node] = pci_getl_func(bus, dev,
3507 			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
3508 		}
3509 
3510 		/*
3511 		 * Read DRAM base and limit registers which specify
3512 		 * physical memory range of each node
3513 		 */
3514 		if (opt_family != AMD_FAMILY_GREYHOUND)
3515 			base_hi = 0;
3516 		else {
3517 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
3518 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
3519 			base_hi = dram_map[node].base_hi =
3520 			    inl(PCI_CONFDATA);
3521 		}
3522 		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
3523 		    OPT_PCS_FUNC_ADDRMAP, off_lo);
3524 
3525 		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
3526 		    mem_intrlv)
3527 			*mem_intrlv = *mem_intrlv + 1;
3528 
3529 		off_hi += 4;	/* high limit register offset */
3530 		if (opt_family != AMD_FAMILY_GREYHOUND)
3531 			limit_hi = 0;
3532 		else {
3533 			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
3534 			    OPT_PCS_FUNC_ADDRMAP, off_hi));
3535 			limit_hi = dram_map[node].limit_hi =
3536 			    inl(PCI_CONFDATA);
3537 		}
3538 
3539 		off_lo += 4;	/* low limit register offset */
3540 		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
3541 		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
3542 
3543 		/*
3544 		 * Increment device number to next node and register offsets
3545 		 * for DRAM base register of next node
3546 		 */
3547 		off_hi += 4;
3548 		off_lo += 4;
3549 		dev++;
3550 
3551 		/*
3552 		 * Both read and write enable bits must be enabled in DRAM
3553 		 * address map base register for physical memory to exist in
3554 		 * node
3555 		 */
3556 		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
3557 		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
3558 			/*
3559 			 * Mark node memory as non-existent and set start and
3560 			 * end addresses to be same in memnode_info[]
3561 			 */
3562 			memnode_info[node].exists = 0;
3563 			memnode_info[node].start = memnode_info[node].end =
3564 			    (pfn_t)-1;
3565 			continue;
3566 		}
3567 
3568 		/*
3569 		 * Mark node memory as existing and remember physical address
3570 		 * range of each node for use later
3571 		 */
3572 		memnode_info[node].exists = 1;
3573 
3574 		memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
3575 
3576 		memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
3577 		    OPT_DRAMADDR_LO_MASK_OFF);
3578 	}
3579 
3580 	/*
3581 	 * Restore PCI Extended Configuration Space enable bit
3582 	 */
3583 	if (opt_family == AMD_FAMILY_GREYHOUND) {
3584 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
3585 			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
3586 	}
3587 }
3588 
3589 
3590 /*
3591  * Return average amount of time to read vendor ID register on Northbridge
3592  * N times on specified destination node from current CPU
3593  */
3594 static hrtime_t
3595 opt_probe_vendor(int dest_node, int nreads)
3596 {
3597 	int		cnt;
3598 	uint_t		dev;
3599 	/* LINTED: set but not used in function */
3600 	volatile uint_t	dev_vendor;
3601 	hrtime_t	elapsed;
3602 	hrtime_t	end;
3603 	int		ipl;
3604 	hrtime_t	start;
3605 
3606 	dev = OPT_PCS_DEV_NODE0 + dest_node;
3607 	kpreempt_disable();
3608 	ipl = spl8();
3609 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
3610 	    OPT_PCS_OFF_VENDOR));
3611 	start = gethrtime();
3612 	for (cnt = 0; cnt < nreads; cnt++)
3613 		dev_vendor = inl(PCI_CONFDATA);
3614 	end = gethrtime();
3615 	elapsed = (end - start) / nreads;
3616 	splx(ipl);
3617 	kpreempt_enable();
3618 	return (elapsed);
3619 }
3620