xref: /illumos-gate/usr/src/uts/i86pc/os/lgrpplat.c (revision 5016ae894be01e501342a67035ea848043662a45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright (c) 2010, Intel Corporation.
27  * All rights reserved.
28  * Copyright 2024 Oxide Computer Company
29  */
30 
31 /*
32  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
33  * ================================================================
34  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
35  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
36  * one or more CPUs and some local memory.  The CPUs in each node can access
37  * the memory in the other nodes but at a higher latency than accessing their
38  * local memory.  Typically, a system with only one node has Uniform Memory
39  * Access (UMA), but it may be possible to have a one node system that has
40  * some global memory outside of the node which is higher latency.
41  *
42  * Module Description
43  * ------------------
44  * This module provides a platform interface for determining which CPUs and
45  * which memory (and how much) are in a NUMA node and how far each node is from
46  * each other.  The interface is used by the Virtual Memory (VM) system and the
47  * common lgroup framework.  The VM system uses the plat_*() routines to fill
48  * in its memory node (memnode) array with the physical address range spanned
49  * by each NUMA node to know which memory belongs to which node, so it can
50  * build and manage a physical page free list for each NUMA node and allocate
51  * local memory from each node as needed.  The common lgroup framework uses the
52  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
53  * to each node (leaf lgroup) and how far each node is from each other, so it
54  * can build the latency (lgroup) topology for the machine in order to optimize
55  * for locality.  Also, an lgroup platform handle instead of lgroups are used
56  * in the interface with this module, so this module shouldn't need to know
57  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
58  * etc. are in each NUMA node, how far each node is from each other, and to use
59  * a unique lgroup platform handle to refer to each node through the interface.
60  *
61  * Determining NUMA Configuration
62  * ------------------------------
63  * By default, this module will try to determine the NUMA configuration of the
64  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
65  * Locality Information Table (SLIT).  The SRAT contains info to tell which
66  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
67  * is a matrix that gives the distance between each system locality (which is
68  * a NUMA node and should correspond to proximity domains in the SRAT).  For
69  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
70  * specification.
71  *
72  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
73  * examine registers in PCI configuration space to determine how many nodes are
 * in the system and which CPUs and memory are in each node.  This could be
 * determined by probing all memory from each CPU, but that is too expensive to
 * do while booting the kernel.
76  *
77  * NOTE: Using these PCI configuration space registers to determine this
78  *       locality info is not guaranteed to work or be compatible across all
79  *	 Opteron processor families.
80  *
81  * If the SLIT does not exist or look right, the kernel will probe to determine
82  * the distance between nodes as long as the NUMA CPU and memory configuration
83  * has been determined (see lgrp_plat_probe() for details).
84  *
85  * Data Structures
86  * ---------------
87  * The main data structures used by this code are the following:
88  *
89  * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
90  *					CPU ID (only used for SRAT)
91  *
92  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
93  *					different nodes indexed by node ID
94  *
95  * - lgrp_plat_node_cnt			Number of NUMA nodes in system for
96  *					non-DR-capable systems,
97  *					maximum possible number of NUMA nodes
98  *					in system for DR capable systems.
99  *
100  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
101  *					table indexed by node ID (only used
102  *					for SRAT)
103  *
104  * - lgrp_plat_memnode_info[]		Table with physical address range for
105  *					each memory node indexed by memory node
106  *					ID
107  *
108  * The code is implemented to make the following always be true:
109  *
110  *	lgroup platform handle == node ID == memnode ID
111  *
112  * Moreover, it allows for the proximity domain ID to be equal to all of the
113  * above as long as the proximity domains IDs are numbered from 0 to <number of
114  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
115  * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
116  * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
117  * and be assigned node ID N.  If the proximity domain IDs aren't numbered
118  * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
119  * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
120  * to node IDs.  However, the proximity domain IDs may not map to the
121  * equivalent node ID since we want to keep the node IDs numbered from 0 to
122  * <number of nodes - 1> to minimize cost of searching and potentially space.
123  *
124  * With the introduction of support of memory DR operations on x86 platforms,
125  * things get a little complicated. The addresses of hot-added memory may not
126  * be continuous with other memory connected to the same lgrp node. In other
127  * words, memory addresses may get interleaved among lgrp nodes after memory
128  * DR operations. To work around this limitation, we have extended the
129  * relationship between lgrp node and memory node from 1:1 map to 1:N map,
130  * that means there may be multiple memory nodes associated with a lgrp node
131  * after memory DR operations.
132  *
133  * To minimize the code changes to support memory DR operations, the
134  * following policies have been adopted.
135  * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
136  *    node ID and memnode ID is still kept as:
137  *	lgroup platform handle == node ID == memnode ID
138  * 2) For memory present at boot time on DR capable platforms, the relationship
139  *    is still kept as is.
140  *	lgroup platform handle == node ID == memnode ID
141  * 3) For hot-added memory, the relationship between lgrp ID and memnode ID have
142  *    been changed from 1:1 map to 1:N map. Memnode IDs [0 - lgrp_plat_node_cnt)
143  *    are reserved for memory present at boot time, and memnode IDs
144  *    [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate
145  *    memnode ID for hot-added memory.
146  * 4) All boot code having the assumption "node ID == memnode ID" can live as
147  *    is, that's because node ID is always equal to memnode ID at boot time.
148  * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and
149  *    lgrp_plat_mem_size() related logics have been enhanced to deal with
150  *    the 1:N map relationship.
151  * 6) The latency probing related logics, which have the assumption
152  *    "node ID == memnode ID" and may be called at run time, is disabled if
153  *    memory DR operation is enabled.
154  */
155 
156 
157 #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
158 #include <sys/atomic.h>
159 #include <sys/bootconf.h>
160 #include <sys/cmn_err.h>
161 #include <sys/controlregs.h>
162 #include <sys/cpupart.h>
163 #include <sys/cpuvar.h>
164 #include <sys/lgrp.h>
165 #include <sys/machsystm.h>
166 #include <sys/memlist.h>
167 #include <sys/memnode.h>
168 #include <sys/mman.h>
169 #include <sys/note.h>
170 #include <sys/pci_cfgspace.h>
171 #include <sys/pci_impl.h>
172 #include <sys/param.h>
173 #include <sys/pghw.h>
174 #include <sys/promif.h>		/* for prom_printf() */
175 #include <sys/sysmacros.h>
176 #include <sys/systm.h>
177 #include <sys/thread.h>
178 #include <sys/types.h>
179 #include <sys/var.h>
180 #include <sys/x86_archext.h>
181 #include <vm/hat_i86.h>
182 #include <vm/seg_kmem.h>
183 #include <vm/vm_dep.h>
184 
185 #include <sys/acpidev.h>
186 #include <sys/acpi/acpi.h>		/* for SRAT, SLIT and MSCT */
187 
188 /* from fakebop.c */
189 extern ACPI_TABLE_SRAT *srat_ptr;
190 extern ACPI_TABLE_SLIT *slit_ptr;
191 extern ACPI_TABLE_MSCT *msct_ptr;
192 
/*
 * Maximum number of NUMA nodes handled by this file.  It sizes the per-node
 * tables declared below: the latency matrix, the probe tables and the node to
 * proximity domain map.
 */
#define	MAX_NODES		8

/*
 * Number of lgroup slots reserved statically (57 with MAX_NODES == 8).  It
 * sizes both lgrp_space[] and lgrp_stats[].
 */
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing.  These are the initial values of the
 * lgrp_plat_probe_nrounds, lgrp_plat_probe_nsamples and
 * lgrp_plat_probe_nreads variables respectively.
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing (bit values kept in lgrp_plat_probe_flags)
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
209 
/*
 * Hash proximity domain ID into node to domain mapping table "mod" number of
 * nodes to minimize span of entries used and try to have lowest numbered
 * proximity domain be node 0
 *
 * While lgrp_plat_prox_domain_min still holds UINT32_MAX (no minimum
 * recorded), the domain ID is hashed as is; otherwise the minimum is
 * subtracted first.
 *
 * Both macro arguments are parenthesized in the expansion so that expression
 * arguments (e.g. "a + b" for node_cnt) bind as the caller intended.
 * node_cnt must be non-zero.
 */
#define	NODE_DOMAIN_HASH(domain, node_cnt) \
	((lgrp_plat_prox_domain_min == UINT32_MAX) ? \
	    (domain) % (node_cnt) : \
	    ((domain) - lgrp_plat_prox_domain_min) % (node_cnt))
218 
/*
 * CPU to node ID mapping structure (only used with SRAT)
 *
 * One entry per CPU in lgrp_plat_cpu_node[], indexed by CPU ID.  Entries are
 * filled in and cleared by lgrp_plat_config() on CPU hot add and delete.
 */
typedef	struct cpu_node_map {
	int		exists;		/* non-zero when entry is in use */
	uint_t		node;		/* node ID the CPU belongs to */
	uint32_t	apicid;		/* APIC ID of the CPU */
	uint32_t	prox_domain;	/* proximity domain ID of the CPU */
} cpu_node_map_t;
228 
/*
 * Latency statistics
 *
 * latencies[][] holds the latency between same and different nodes and is
 * indexed by node ID in both dimensions.
 */
typedef struct lgrp_plat_latency_stats {
	hrtime_t	latencies[MAX_NODES][MAX_NODES];
	/*
	 * NOTE(review): presumably the largest and smallest latencies
	 * recorded in latencies[][]; they are not written in the part of the
	 * file reviewed here -- confirm against the SLIT/probe code.
	 */
	hrtime_t	latency_max;
	hrtime_t	latency_min;
} lgrp_plat_latency_stats_t;
237 
/*
 * Memory configuration for probing
 *
 * The per-node arrays have one slot for each of the MAX_NODES possible
 * nodes.
 */
typedef struct lgrp_plat_probe_mem_config {
	size_t	probe_memsize;		/* how much memory to probe per node */
	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
} lgrp_plat_probe_mem_config_t;
246 
/*
 * Statistics kept for probing
 *
 * The two-dimensional members have one row and one column for each of the
 * MAX_NODES possible nodes.
 *
 * NOTE(review): none of these members is referenced in the part of the file
 * reviewed here, so the exact meaning of each one (and whether the matrices
 * are indexed [from][to]) should be confirmed against the probing code.
 */
typedef struct lgrp_plat_probe_stats {
	hrtime_t	flush_cost;
	hrtime_t	probe_cost;
	hrtime_t	probe_cost_total;
	hrtime_t	probe_error_code;
	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
	int		probe_suspect[MAX_NODES][MAX_NODES];
	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
} lgrp_plat_probe_stats_t;
260 
/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 *
 * One entry per node in lgrp_plat_node_domain[], indexed by node ID.
 */
typedef	struct node_domain_map {
	int		exists;		/* non-zero when entry is in use */
	uint32_t	prox_domain;	/* proximity domain ID of the node */
} node_domain_map_t;
268 
/*
 * Node ID and starting and ending page for physical memory in memory node
 *
 * One entry per memnode in lgrp_plat_memnode_info[], indexed by memnode ID.
 * Readers test "exists" before looking at the other members.
 */
typedef	struct memnode_phys_addr_map {
	pfn_t		start;		/* first PFN in memnode */
	pfn_t		end;		/* last PFN in memnode (inclusive) */
	int		exists;		/* non-zero when entry is in use */
	uint32_t	prox_domain;	/* proximity domain ID of the memory */
	uint32_t	device_id;	/* NOTE(review): presumably the ID */
					/* of the memory device; confirm */
	uint_t		lgrphand;	/* lgroup platform handle returned */
					/* by plat_mem_node_to_lgrphand() */
} memnode_phys_addr_map_t;
280 
/*
 * Number of CPUs for which we got APIC IDs.  lgrp_plat_config() also
 * increments and decrements it when a CPU is hot-added or deleted.
 */
static int				lgrp_plat_apic_ncpus = 0;

/*
 * CPU to node ID mapping table (only used for SRAT) and its max number of
 * entries.  The table is indexed by CPU ID.
 */
static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
static uint_t				lgrp_plat_cpu_node_nentries = 0;

/*
 * Latency statistics (latencies between nodes, indexed by node ID)
 */
lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
static int				lgrp_plat_mem_intrlv = 0;

/*
 * Node ID to proximity domain ID mapping table (only used for SRAT),
 * indexed by node ID
 */
static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];

/*
 * Physical address range for memory in each node, indexed by memnode ID.
 * Only entries below lgrp_plat_max_mem_node are examined by the lookup
 * routines in this file.
 */
static memnode_phys_addr_map_t		lgrp_plat_memnode_info[MAX_MEM_NODES];

/*
 * Statistics gotten from probing
 */
static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;

/*
 * Memory configuration for probing
 */
static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;

/*
 * Lowest proximity domain ID seen in ACPI SRAT.  UINT32_MAX means that no
 * minimum has been recorded, in which case NODE_DOMAIN_HASH() hashes the
 * domain ID unmodified.
 */
static uint32_t				lgrp_plat_prox_domain_min = UINT32_MAX;

/*
 * Error code from processing ACPI SRAT
 */
static int				lgrp_plat_srat_error = 0;

/*
 * Error code from processing ACPI SLIT
 */
static int				lgrp_plat_slit_error = 0;

/*
 * Whether lgrp topology has been flattened to 2 levels.
 */
static int				lgrp_plat_topo_flatten = 0;


/*
 * Upper bound on memory node IDs in use.  The routines in this file treat it
 * as an exclusive limit: they walk lgrp_plat_memnode_info[] with
 * "node < lgrp_plat_max_mem_node".
 */
static uint_t				lgrp_plat_max_mem_node;

/*
 * Allocate lgroup array statically.  nlgrps_alloc counts how many entries of
 * lgrp_space[] have been handed out by lgrp_plat_alloc().
 */
static lgrp_t				lgrp_space[NLGRP];
static int				nlgrps_alloc;
354 
355 
/*
 * Enable finding and using minimum proximity domain ID when hashing
 */
int			lgrp_plat_domain_min_enable = 1;

/*
 * Maximum possible number of nodes in system.  While this is 1,
 * lgrp_plat_config() does nothing and lgrp_plat_cpu_to_hand() always returns
 * LGRP_DEFAULT_HANDLE.
 */
uint_t			lgrp_plat_node_cnt = 1;

/*
 * Enable sorting nodes in ascending order by starting physical address
 */
int			lgrp_plat_node_sort_enable = 1;

/*
 * Configuration Parameters for Probing
 * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
 *				operation, etc. (LGRP_PLAT_PROBE_* bits)
 * - lgrp_plat_probe_nrounds	How many rounds of probing to do
 * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
 *				node
 * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
 *				Northbridge for each probe
 */
uint_t			lgrp_plat_probe_flags = 0;
int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * Enable use of ACPI System Resource Affinity Table (SRAT), System
 * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT)
 * (one switch per table, all enabled by default)
 */
int			lgrp_plat_srat_enable = 1;
int			lgrp_plat_slit_enable = 1;
int			lgrp_plat_msct_enable = 1;
393 
/*
 * mnode_xwa: set to non-zero value to initiate workaround if large pages are
 * found to be crossing memory node boundaries. The workaround will eliminate
 * a base size page at the end of each memory node boundary to ensure that
 * a large page with constituent pages that span more than 1 memory node
 * can never be formed.
 *
 * Values as used in this file:
 *	0	workaround disabled; plat_mnode_xcheck() returns 1 when it
 *		finds a crossing
 *	1	workaround enabled, no crossing found so far
 *	> 1	plat_mnode_xcheck() found at least one crossing (it increments
 *		the variable for each one); plat_build_mem_nodes() then gives
 *		up the last page of a memnode whenever it has to split an
 *		installed memory range at that memnode's end
 */
int	mnode_xwa = 1;
403 
/*
 * Static array to hold lgroup statistics, with one entry for each of the
 * NLGRP lgroups that can be allocated from lgrp_space[]
 */
struct lgrp_stats	lgrp_stats[NLGRP];
408 
409 
410 /*
411  * Forward declarations of platform interface routines
412  */
413 void		plat_build_mem_nodes(struct memlist *list);
414 
415 int		plat_mnode_xcheck(pfn_t pfncnt);
416 
417 lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);
418 
419 int		plat_pfn_to_mem_node(pfn_t pfn);
420 
421 /*
422  * Forward declarations of lgroup platform interface routines
423  */
424 lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);
425 
426 void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
427 
428 lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
429 
430 void		lgrp_plat_init(lgrp_init_stages_t stage);
431 
432 int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
433 
434 int		lgrp_plat_max_lgrps(void);
435 
436 pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
437     lgrp_mem_query_t query);
438 
439 lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);
440 
441 void		lgrp_plat_probe(void);
442 
443 lgrp_handle_t	lgrp_plat_root_hand(void);
444 
445 
446 /*
447  * Forward declarations of local routines
448  */
449 static int	is_opteron(void);
450 
451 static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
452     int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
453     uint32_t domain);
454 
455 static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
456     int cpu_node_nentries);
457 
458 static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
459     int node_cnt, uint32_t domain);
460 
461 static void	lgrp_plat_get_numa_config(void);
462 
463 static void	lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
464     lgrp_plat_latency_stats_t *lat_stats,
465     lgrp_plat_probe_stats_t *probe_stats);
466 
467 static int	lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
468     lgrp_plat_latency_stats_t *lat_stats);
469 
470 static void	lgrp_plat_main_init(void);
471 
472 static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
473 
474 static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
475     int node_cnt, uint32_t domain);
476 
477 static int	lgrp_plat_memnode_info_update(node_domain_map_t *node_domain,
478     int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt,
479     uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id);
480 
481 static void	lgrp_plat_node_sort(node_domain_map_t *node_domain,
482     int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
483     memnode_phys_addr_map_t *memnode_info);
484 
485 static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
486     int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
487     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
488 
489 static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
490 
491 static int	lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
492     node_domain_map_t *node_domain, uint_t node_cnt,
493     memnode_phys_addr_map_t *memnode_info,
494     lgrp_plat_latency_stats_t *lat_stats);
495 
496 static int	lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info,
497     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
498     lgrp_plat_latency_stats_t *lat_stats);
499 
500 static int	lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
501     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
502     cpu_node_map_t *cpu_node, int cpu_count,
503     memnode_phys_addr_map_t *memnode_info);
504 
505 static void	lgrp_plat_release_bootstrap(void);
506 
507 static int	lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp,
508     uint32_t *prox_domain_min);
509 
510 static int	lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp,
511     uint32_t *prox_domain_min);
512 
513 static void	lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats);
514 
515 static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
516     memnode_phys_addr_map_t *memnode_info);
517 
518 static hrtime_t	opt_probe_vendor(int dest_node, int nreads);
519 
520 
521 /*
522  * PLATFORM INTERFACE ROUTINES
523  */
524 
525 /*
526  * Configure memory nodes for machines with more than one node (ie NUMA)
527  */
528 void
529 plat_build_mem_nodes(struct memlist *list)
530 {
531 	pfn_t		cur_start;	/* start addr of subrange */
532 	pfn_t		cur_end;	/* end addr of subrange */
533 	pfn_t		start;		/* start addr of whole range */
534 	pfn_t		end;		/* end addr of whole range */
535 	pgcnt_t		endcnt;		/* pages to sacrifice */
536 
537 	/*
538 	 * Boot install lists are arranged <addr, len>, ...
539 	 */
540 	while (list) {
541 		int	node;
542 
543 		start = list->ml_address >> PAGESHIFT;
544 		end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;
545 
546 		if (start > physmax) {
547 			list = list->ml_next;
548 			continue;
549 		}
550 		if (end > physmax)
551 			end = physmax;
552 
553 		/*
554 		 * When there is only one memnode, just add memory to memnode
555 		 */
556 		if (max_mem_nodes == 1) {
557 			mem_node_add_slice(start, end);
558 			list = list->ml_next;
559 			continue;
560 		}
561 
562 		/*
563 		 * mem_node_add_slice() expects to get a memory range that
564 		 * is within one memnode, so need to split any memory range
565 		 * that spans multiple memnodes into subranges that are each
566 		 * contained within one memnode when feeding them to
567 		 * mem_node_add_slice()
568 		 */
569 		cur_start = start;
570 		do {
571 			node = plat_pfn_to_mem_node(cur_start);
572 
573 			/*
574 			 * Panic if DRAM address map registers or SRAT say
575 			 * memory in node doesn't exist or address from
576 			 * boot installed memory list entry isn't in this node.
577 			 * This shouldn't happen and rest of code can't deal
578 			 * with this if it does.
579 			 */
580 			if (node < 0 || node >= lgrp_plat_max_mem_node ||
581 			    !lgrp_plat_memnode_info[node].exists ||
582 			    cur_start < lgrp_plat_memnode_info[node].start ||
583 			    cur_start > lgrp_plat_memnode_info[node].end) {
584 				cmn_err(CE_PANIC, "Don't know which memnode "
585 				    "to add installed memory address 0x%lx\n",
586 				    cur_start);
587 			}
588 
589 			/*
590 			 * End of current subrange should not span memnodes
591 			 */
592 			cur_end = end;
593 			endcnt = 0;
594 			if (lgrp_plat_memnode_info[node].exists &&
595 			    cur_end > lgrp_plat_memnode_info[node].end) {
596 				cur_end = lgrp_plat_memnode_info[node].end;
597 				if (mnode_xwa > 1) {
598 					/*
599 					 * sacrifice the last page in each
600 					 * node to eliminate large pages
601 					 * that span more than 1 memory node.
602 					 */
603 					endcnt = 1;
604 					physinstalled--;
605 				}
606 			}
607 
608 			mem_node_add_slice(cur_start, cur_end - endcnt);
609 
610 			/*
611 			 * Next subrange starts after end of current one
612 			 */
613 			cur_start = cur_end + 1;
614 		} while (cur_end < end);
615 
616 		list = list->ml_next;
617 	}
618 	mem_node_physalign = 0;
619 	mem_node_pfn_shift = 0;
620 }
621 
622 
623 /*
624  * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
625  * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
626  * a crossing is found and returns 0 otherwise.
627  */
628 int
629 plat_mnode_xcheck(pfn_t pfncnt)
630 {
631 	int	node, prevnode = -1, basenode;
632 	pfn_t	ea, sa;
633 
634 	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
635 
636 		if (lgrp_plat_memnode_info[node].exists == 0)
637 			continue;
638 
639 		if (prevnode == -1) {
640 			prevnode = node;
641 			basenode = node;
642 			continue;
643 		}
644 
645 		/* assume x86 node pfn ranges are in increasing order */
646 		ASSERT(lgrp_plat_memnode_info[node].start >
647 		    lgrp_plat_memnode_info[prevnode].end);
648 
649 		/*
650 		 * continue if the starting address of node is not contiguous
651 		 * with the previous node.
652 		 */
653 
654 		if (lgrp_plat_memnode_info[node].start !=
655 		    (lgrp_plat_memnode_info[prevnode].end + 1)) {
656 			basenode = node;
657 			prevnode = node;
658 			continue;
659 		}
660 
661 		/* check if the starting address of node is pfncnt aligned */
662 		if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) {
663 
664 			/*
665 			 * at this point, node starts at an unaligned boundary
666 			 * and is contiguous with the previous node(s) to
667 			 * basenode. Check if there is an aligned contiguous
668 			 * range of length pfncnt that crosses this boundary.
669 			 */
670 
671 			sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end,
672 			    pfncnt);
673 			ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start),
674 			    pfncnt);
675 
676 			ASSERT((ea - sa) == pfncnt);
677 			if (sa >= lgrp_plat_memnode_info[basenode].start &&
678 			    ea <= (lgrp_plat_memnode_info[node].end + 1)) {
679 				/*
680 				 * large page found to cross mnode boundary.
681 				 * Return Failure if workaround not enabled.
682 				 */
683 				if (mnode_xwa == 0)
684 					return (1);
685 				mnode_xwa++;
686 			}
687 		}
688 		prevnode = node;
689 	}
690 	return (0);
691 }
692 
693 
694 lgrp_handle_t
695 plat_mem_node_to_lgrphand(int mnode)
696 {
697 	if (max_mem_nodes == 1)
698 		return (LGRP_DEFAULT_HANDLE);
699 
700 	ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node);
701 
702 	return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand));
703 }
704 
705 int
706 plat_pfn_to_mem_node(pfn_t pfn)
707 {
708 	int	node;
709 
710 	if (max_mem_nodes == 1)
711 		return (0);
712 
713 	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
714 		/*
715 		 * Skip nodes with no memory
716 		 */
717 		if (!lgrp_plat_memnode_info[node].exists)
718 			continue;
719 
720 		membar_consumer();
721 		if (pfn >= lgrp_plat_memnode_info[node].start &&
722 		    pfn <= lgrp_plat_memnode_info[node].end)
723 			return (node);
724 	}
725 
726 	/*
727 	 * Didn't find memnode where this PFN lives which should never happen
728 	 */
729 	ASSERT(node < lgrp_plat_max_mem_node);
730 	return (-1);
731 }
732 
733 
734 /*
735  * LGROUP PLATFORM INTERFACE ROUTINES
736  */
737 
738 /*
739  * Allocate additional space for an lgroup.
740  */
741 lgrp_t *
742 lgrp_plat_alloc(lgrp_id_t lgrpid)
743 {
744 	lgrp_t *lgrp;
745 
746 	lgrp = &lgrp_space[nlgrps_alloc++];
747 	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
748 		return (NULL);
749 	return (lgrp);
750 }
751 
752 
753 /*
754  * Platform handling for (re)configuration changes
755  *
756  * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug:
757  * 1) Use cpu_lock to synchronize between lgrp_plat_config() and
758  *    lgrp_plat_cpu_to_hand().
759  * 2) Disable latency probing logic by making sure that the flag
760  *    LGRP_PLAT_PROBE_ENABLE is cleared.
761  *
762  * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug:
763  * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal.
764  * 2) Only expansion to existing entries, no shrinking.
765  * 3) On writing side, DR framework ensures that lgrp_plat_config() is called
766  *    in single-threaded context. And membar_producer() is used to ensure that
767  *    all changes are visible to other CPUs before setting the "exists" flag.
768  * 4) On reading side, membar_consumer() after checking the "exists" flag
769  *    ensures that right values are retrieved.
770  *
771  * Mechanism to protect lgrp_plat_node_domain[] at hotplug:
772  * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal.
773  * 2) On writing side, it's single-threaded and membar_producer() is used to
774  *    ensure all changes are visible to other CPUs before setting the "exists"
775  *    flag.
776  * 3) On reading side, membar_consumer() after checking the "exists" flag
777  *    ensures that right values are retrieved.
778  */
779 void
780 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
781 {
782 #ifdef	__xpv
783 	_NOTE(ARGUNUSED(flag, arg));
784 #else
785 	int	rc, node;
786 	cpu_t	*cp;
787 	void	*hdl = NULL;
788 	uchar_t	*sliptr = NULL;
789 	uint32_t domain, apicid, slicnt = 0;
790 	update_membounds_t *mp;
791 
792 	extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *,
793 	    uint32_t *, uint32_t *, uchar_t **);
794 	extern void acpidev_dr_free_cpu_numa_info(void *);
795 
796 	/*
797 	 * This interface is used to support CPU/memory DR operations.
798 	 * Don't bother here if it's still during boot or only one lgrp node
799 	 * is supported.
800 	 */
801 	if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1)
802 		return;
803 
804 	switch (flag) {
805 	case LGRP_CONFIG_CPU_ADD:
806 		cp = (cpu_t *)arg;
807 		ASSERT(cp != NULL);
808 		ASSERT(MUTEX_HELD(&cpu_lock));
809 
810 		/* Check whether CPU already exists. */
811 		ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists);
812 		if (lgrp_plat_cpu_node[cp->cpu_id].exists) {
813 			cmn_err(CE_WARN,
814 			    "!lgrp: CPU(%d) already exists in cpu_node map.",
815 			    cp->cpu_id);
816 			break;
817 		}
818 
819 		/* Query CPU lgrp information. */
820 		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
821 		    &slicnt, &sliptr);
822 		ASSERT(rc == 0);
823 		if (rc != 0) {
824 			cmn_err(CE_WARN,
825 			    "!lgrp: failed to query lgrp info for CPU(%d).",
826 			    cp->cpu_id);
827 			break;
828 		}
829 
830 		/* Update node to proximity domain mapping */
831 		node = lgrp_plat_domain_to_node(lgrp_plat_node_domain,
832 		    lgrp_plat_node_cnt, domain);
833 		if (node == -1) {
834 			node = lgrp_plat_node_domain_update(
835 			    lgrp_plat_node_domain, lgrp_plat_node_cnt, domain);
836 			ASSERT(node != -1);
837 			if (node == -1) {
838 				acpidev_dr_free_cpu_numa_info(hdl);
839 				cmn_err(CE_WARN, "!lgrp: failed to update "
840 				    "node_domain map for domain(%u).", domain);
841 				break;
842 			}
843 		}
844 
845 		/* Update latency information among lgrps. */
846 		if (slicnt != 0 && sliptr != NULL) {
847 			if (lgrp_plat_process_sli(domain, sliptr, slicnt,
848 			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
849 			    &lgrp_plat_lat_stats) != 0) {
850 				cmn_err(CE_WARN, "!lgrp: failed to update "
851 				    "latency information for domain (%u).",
852 				    domain);
853 			}
854 		}
855 
856 		/* Update CPU to node mapping. */
857 		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain;
858 		lgrp_plat_cpu_node[cp->cpu_id].node = node;
859 		lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid;
860 		lgrp_plat_cpu_node[cp->cpu_id].exists = 1;
861 		lgrp_plat_apic_ncpus++;
862 
863 		acpidev_dr_free_cpu_numa_info(hdl);
864 		break;
865 
866 	case LGRP_CONFIG_CPU_DEL:
867 		cp = (cpu_t *)arg;
868 		ASSERT(cp != NULL);
869 		ASSERT(MUTEX_HELD(&cpu_lock));
870 
871 		/* Check whether CPU exists. */
872 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists);
873 		if (!lgrp_plat_cpu_node[cp->cpu_id].exists) {
874 			cmn_err(CE_WARN,
875 			    "!lgrp: CPU(%d) doesn't exist in cpu_node map.",
876 			    cp->cpu_id);
877 			break;
878 		}
879 
880 		/* Query CPU lgrp information. */
881 		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
882 		    NULL, NULL);
883 		ASSERT(rc == 0);
884 		if (rc != 0) {
885 			cmn_err(CE_WARN,
886 			    "!lgrp: failed to query lgrp info for CPU(%d).",
887 			    cp->cpu_id);
888 			break;
889 		}
890 
891 		/* Update map. */
892 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid);
893 		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain);
894 		lgrp_plat_cpu_node[cp->cpu_id].exists = 0;
895 		lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX;
896 		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX;
897 		lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX;
898 		lgrp_plat_apic_ncpus--;
899 
900 		acpidev_dr_free_cpu_numa_info(hdl);
901 		break;
902 
903 	case LGRP_CONFIG_MEM_ADD:
904 		mp = (update_membounds_t *)arg;
905 		ASSERT(mp != NULL);
906 
907 		/* Update latency information among lgrps. */
908 		if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) {
909 			if (lgrp_plat_process_sli(mp->u_domain,
910 			    mp->u_sli_ptr, mp->u_sli_cnt,
911 			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
912 			    &lgrp_plat_lat_stats) != 0) {
913 				cmn_err(CE_WARN, "!lgrp: failed to update "
914 				    "latency information for domain (%u).",
915 				    domain);
916 			}
917 		}
918 
919 		if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain,
920 		    lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes,
921 		    mp->u_base, mp->u_base + mp->u_length,
922 		    mp->u_domain, mp->u_device_id) < 0) {
923 			cmn_err(CE_WARN,
924 			    "!lgrp: failed to update latency  information for "
925 			    "memory (0x%" PRIx64 " - 0x%" PRIx64 ").",
926 			    mp->u_base, mp->u_base + mp->u_length);
927 		}
928 		break;
929 
930 	default:
931 		break;
932 	}
933 #endif	/* __xpv */
934 }
935 
936 
937 /*
938  * Return the platform handle for the lgroup containing the given CPU
939  */
940 lgrp_handle_t
941 lgrp_plat_cpu_to_hand(processorid_t id)
942 {
943 	lgrp_handle_t	hand;
944 
945 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
946 
947 	if (lgrp_plat_node_cnt == 1)
948 		return (LGRP_DEFAULT_HANDLE);
949 
950 	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
951 	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
952 
953 	if (hand == (lgrp_handle_t)-1)
954 		return (LGRP_NULL_HANDLE);
955 
956 	return (hand);
957 }
958 
959 
960 /*
961  * Platform-specific initialization of lgroups
962  */
963 void
964 lgrp_plat_init(lgrp_init_stages_t stage)
965 {
966 #if defined(__xpv)
967 #else	/* __xpv */
968 	u_longlong_t	value;
969 #endif	/* __xpv */
970 
971 	switch (stage) {
972 	case LGRP_INIT_STAGE1:
973 #if defined(__xpv)
974 		/*
975 		 * XXPV	For now, the hypervisor treats all memory equally.
976 		 */
977 		lgrp_plat_node_cnt = max_mem_nodes = 1;
978 #else	/* __xpv */
979 
980 		/*
981 		 * Get boot property for lgroup topology height limit
982 		 */
983 		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
984 			(void) lgrp_topo_ht_limit_set((int)value);
985 
986 		/*
987 		 * Get boot property for enabling/disabling SRAT
988 		 */
989 		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
990 			lgrp_plat_srat_enable = (int)value;
991 
992 		/*
993 		 * Get boot property for enabling/disabling SLIT
994 		 */
995 		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
996 			lgrp_plat_slit_enable = (int)value;
997 
998 		/*
999 		 * Get boot property for enabling/disabling MSCT
1000 		 */
1001 		if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0)
1002 			lgrp_plat_msct_enable = (int)value;
1003 
1004 		/*
1005 		 * Initialize as a UMA machine
1006 		 */
1007 		if (lgrp_topo_ht_limit() == 1) {
1008 			lgrp_plat_node_cnt = max_mem_nodes = 1;
1009 			lgrp_plat_max_mem_node = 1;
1010 			return;
1011 		}
1012 
1013 		lgrp_plat_get_numa_config();
1014 
1015 		/*
1016 		 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes
1017 		 * to support memory DR operations if memory DR is enabled.
1018 		 */
1019 		lgrp_plat_max_mem_node = lgrp_plat_node_cnt;
1020 		if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) {
1021 			max_mem_nodes = MAX_MEM_NODES_PER_LGROUP *
1022 			    lgrp_plat_node_cnt;
1023 			ASSERT(max_mem_nodes <= MAX_MEM_NODES);
1024 		}
1025 #endif	/* __xpv */
1026 		break;
1027 
1028 	case LGRP_INIT_STAGE3:
1029 		lgrp_plat_probe();
1030 		lgrp_plat_release_bootstrap();
1031 		break;
1032 
1033 	case LGRP_INIT_STAGE4:
1034 		lgrp_plat_main_init();
1035 		break;
1036 
1037 	default:
1038 		break;
1039 	}
1040 }
1041 
1042 
1043 /*
1044  * Return latency between "from" and "to" lgroups
1045  *
1046  * This latency number can only be used for relative comparison
1047  * between lgroups on the running system, cannot be used across platforms,
1048  * and may not reflect the actual latency.  It is platform and implementation
1049  * specific, so platform gets to decide its value.  It would be nice if the
1050  * number was at least proportional to make comparisons more meaningful though.
1051  */
1052 int
1053 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
1054 {
1055 	lgrp_handle_t	src, dest;
1056 	int		node;
1057 
1058 	if (max_mem_nodes == 1)
1059 		return (0);
1060 
1061 	/*
1062 	 * Return max latency for root lgroup
1063 	 */
1064 	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
1065 		return (lgrp_plat_lat_stats.latency_max);
1066 
1067 	src = from;
1068 	dest = to;
1069 
1070 	/*
1071 	 * Return 0 for nodes (lgroup platform handles) out of range
1072 	 */
1073 	if (src >= MAX_NODES || dest >= MAX_NODES)
1074 		return (0);
1075 
1076 	/*
1077 	 * Probe from current CPU if its lgroup latencies haven't been set yet
1078 	 * and we are trying to get latency from current CPU to some node.
1079 	 * Avoid probing if CPU/memory DR is enabled.
1080 	 */
1081 	if (lgrp_plat_lat_stats.latencies[src][src] == 0) {
1082 		/*
1083 		 * Latency information should be updated by lgrp_plat_config()
1084 		 * for DR operations. Something is wrong if reaches here.
1085 		 * For safety, flatten lgrp topology to two levels.
1086 		 */
1087 		if (plat_dr_support_cpu() || plat_dr_support_memory()) {
1088 			ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
1089 			cmn_err(CE_WARN,
1090 			    "lgrp: failed to get latency information, "
1091 			    "fall back to two-level topology.");
1092 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1093 		} else {
1094 			node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1095 			    lgrp_plat_cpu_node_nentries);
1096 			ASSERT3U(node, <, lgrp_plat_node_cnt);
1097 			if (node == (lgrp_handle_t)-1)
1098 				return (0);
1099 			if (node == src)
1100 				lgrp_plat_probe();
1101 		}
1102 	}
1103 
1104 	return (lgrp_plat_lat_stats.latencies[src][dest]);
1105 }
1106 
1107 
1108 /*
1109  * Return the maximum number of lgrps supported by the platform.
1110  * Before lgrp topology is known it returns an estimate based on the number of
1111  * nodes. Once topology is known it returns:
1112  * 1) the actual maximim number of lgrps created if CPU/memory DR operations
1113  *    are not suppported.
1114  * 2) the maximum possible number of lgrps if CPU/memory DR operations are
1115  *    supported.
1116  */
1117 int
1118 lgrp_plat_max_lgrps(void)
1119 {
1120 	if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
1121 	    plat_dr_support_memory()) {
1122 		return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
1123 	} else {
1124 		return (lgrp_alloc_max + 1);
1125 	}
1126 }
1127 
1128 
/*
 * Add to the running page count (_t) the pages of memory node (_n) selected
 * by query type (_q).  Nothing is added when the memnode doesn't exist or the
 * query type isn't one of the LGRP_MEM_SIZE_* values handled below.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can't capture an "else" following the call site.
 * Note that (_n) is evaluated more than once, so it must not have side
 * effects.  phys_avail and phys_install must be in scope where the macro is
 * used.
 */
#define	_LGRP_PLAT_MEM_SIZE(_n, _q, _t)					\
	do {								\
		if (mem_node_config[(_n)].exists) {			\
			switch (_q) {					\
			case LGRP_MEM_SIZE_FREE:			\
				(_t) += MNODE_PGCNT(_n);		\
				break;					\
			case LGRP_MEM_SIZE_AVAIL:			\
				(_t) += mem_node_memlist_pages((_n),	\
				    phys_avail);			\
				break;					\
			case LGRP_MEM_SIZE_INSTALL:			\
				(_t) += mem_node_memlist_pages((_n),	\
				    phys_install);			\
				break;					\
			default:					\
				break;					\
			}						\
		}							\
	} while (0)
1148 
1149 /*
1150  * Return the number of free pages in an lgroup.
1151  *
1152  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
1153  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
1154  * number of allocatable base pagesize pages corresponding to the
1155  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
1156  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
1157  * memory installed, regardless of whether or not it's usable.
1158  */
1159 pgcnt_t
1160 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
1161 {
1162 	int	mnode;
1163 	pgcnt_t npgs = (pgcnt_t)0;
1164 	extern struct memlist *phys_avail;
1165 	extern struct memlist *phys_install;
1166 
1167 
1168 	if (plathand == LGRP_DEFAULT_HANDLE)
1169 		return (lgrp_plat_mem_size_default(plathand, query));
1170 
1171 	if (plathand != LGRP_NULL_HANDLE) {
1172 		/* Count memory node present at boot. */
1173 		mnode = (int)plathand;
1174 		ASSERT(mnode < lgrp_plat_node_cnt);
1175 		_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1176 
1177 		/* Count possible hot-added memory nodes. */
1178 		for (mnode = lgrp_plat_node_cnt;
1179 		    mnode < lgrp_plat_max_mem_node; mnode++) {
1180 			if (lgrp_plat_memnode_info[mnode].lgrphand == plathand)
1181 				_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1182 		}
1183 	}
1184 
1185 	return (npgs);
1186 }
1187 
1188 
1189 /*
1190  * Return the platform handle of the lgroup that contains the physical memory
1191  * corresponding to the given page frame number
1192  */
1193 lgrp_handle_t
1194 lgrp_plat_pfn_to_hand(pfn_t pfn)
1195 {
1196 	int	mnode;
1197 
1198 	if (max_mem_nodes == 1)
1199 		return (LGRP_DEFAULT_HANDLE);
1200 
1201 	if (pfn > physmax)
1202 		return (LGRP_NULL_HANDLE);
1203 
1204 	mnode = plat_pfn_to_mem_node(pfn);
1205 	if (mnode < 0)
1206 		return (LGRP_NULL_HANDLE);
1207 
1208 	return (MEM_NODE_2_LGRPHAND(mnode));
1209 }
1210 
1211 
1212 /*
1213  * Probe memory in each node from current CPU to determine latency topology
1214  *
1215  * The probing code will probe the vendor ID register on the Northbridge of
1216  * Opteron processors and probe memory for other processors by default.
1217  *
1218  * Since probing is inherently error prone, the code takes laps across all the
1219  * nodes probing from each node to each of the other nodes some number of
1220  * times.  Furthermore, each node is probed some number of times before moving
1221  * onto the next one during each lap.  The minimum latency gotten between nodes
1222  * is kept as the latency between the nodes.
1223  *
1224  * After all that,  the probe times are adjusted by normalizing values that are
1225  * close to each other and local latencies are made the same.  Lastly, the
1226  * latencies are verified to make sure that certain conditions are met (eg.
1227  * local < remote, latency(a, b) == latency(b, a), etc.).
1228  *
1229  * If any of the conditions aren't met, the code will export a NUMA
1230  * configuration with the local CPUs and memory given by the SRAT or PCI config
1231  * space registers and one remote memory latency since it can't tell exactly
1232  * how far each node is from each other.
1233  */
1234 void
1235 lgrp_plat_probe(void)
1236 {
1237 	int				from;
1238 	int				i;
1239 	lgrp_plat_latency_stats_t	*lat_stats;
1240 	boolean_t			probed;
1241 	hrtime_t			probe_time;
1242 	int				to;
1243 
1244 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
1245 	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
1246 		return;
1247 
1248 	/* SRAT and SLIT should be enabled if DR operations are enabled. */
1249 	if (plat_dr_support_cpu() || plat_dr_support_memory())
1250 		return;
1251 
1252 	/*
1253 	 * Determine ID of node containing current CPU
1254 	 */
1255 	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1256 	    lgrp_plat_cpu_node_nentries);
1257 	ASSERT3U(from, <, lgrp_plat_node_cnt);
1258 	if (from == (lgrp_handle_t)-1)
1259 		return;
1260 	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
1261 		ASSERT(lgrp_plat_node_domain[from].exists);
1262 
1263 	/*
1264 	 * Don't need to probe if got times already
1265 	 */
1266 	lat_stats = &lgrp_plat_lat_stats;
1267 	if (lat_stats->latencies[from][from] != 0)
1268 		return;
1269 
1270 	/*
1271 	 * Read vendor ID in Northbridge or read and write page(s)
1272 	 * in each node from current CPU and remember how long it takes,
1273 	 * so we can build latency topology of machine later.
1274 	 * This should approximate the memory latency between each node.
1275 	 */
1276 	probed = B_FALSE;
1277 	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
1278 		for (to = 0; to < lgrp_plat_node_cnt; to++) {
1279 			/*
1280 			 * Get probe time and skip over any nodes that can't be
1281 			 * probed yet or don't have memory
1282 			 */
1283 			probe_time = lgrp_plat_probe_time(to,
1284 			    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
1285 			    &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
1286 			    &lgrp_plat_probe_stats);
1287 			if (probe_time == 0)
1288 				continue;
1289 
1290 			probed = B_TRUE;
1291 
1292 			/*
1293 			 * Keep lowest probe time as latency between nodes
1294 			 */
1295 			if (lat_stats->latencies[from][to] == 0 ||
1296 			    probe_time < lat_stats->latencies[from][to])
1297 				lat_stats->latencies[from][to] = probe_time;
1298 
1299 			/*
1300 			 * Update overall minimum and maximum probe times
1301 			 * across all nodes
1302 			 */
1303 			if (probe_time < lat_stats->latency_min ||
1304 			    lat_stats->latency_min == -1)
1305 				lat_stats->latency_min = probe_time;
1306 			if (probe_time > lat_stats->latency_max)
1307 				lat_stats->latency_max = probe_time;
1308 		}
1309 	}
1310 
1311 	/*
1312 	 * Bail out if weren't able to probe any nodes from current CPU
1313 	 */
1314 	if (probed == B_FALSE)
1315 		return;
1316 
1317 	/*
1318 	 * - Fix up latencies such that local latencies are same,
1319 	 *   latency(i, j) == latency(j, i), etc. (if possible)
1320 	 *
1321 	 * - Verify that latencies look ok
1322 	 *
1323 	 * - Fallback to just optimizing for local and remote if
1324 	 *   latencies didn't look right
1325 	 */
1326 	lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats,
1327 	    &lgrp_plat_probe_stats);
1328 	lgrp_plat_probe_stats.probe_error_code =
1329 	    lgrp_plat_latency_verify(lgrp_plat_memnode_info,
1330 	    &lgrp_plat_lat_stats);
1331 	if (lgrp_plat_probe_stats.probe_error_code)
1332 		lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1333 }
1334 
1335 
1336 /*
1337  * Return platform handle for root lgroup
1338  */
1339 lgrp_handle_t
1340 lgrp_plat_root_hand(void)
1341 {
1342 	return (LGRP_DEFAULT_HANDLE);
1343 }
1344 
1345 
1346 /*
1347  * INTERNAL ROUTINES
1348  */
1349 
1350 
1351 /*
1352  * Update CPU to node mapping for given CPU and proximity domain.
1353  * Return values:
1354  *	- zero for success
1355  *	- positive numbers for warnings
1356  *	- negative numbers for errors
1357  */
1358 static int
1359 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
1360     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
1361 {
1362 	uint_t	i;
1363 	int	node;
1364 
1365 	/*
1366 	 * Get node number for proximity domain
1367 	 */
1368 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
1369 	if (node == -1) {
1370 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
1371 		    domain);
1372 		if (node == -1)
1373 			return (-1);
1374 	}
1375 
1376 	/*
1377 	 * Search for entry with given APIC ID and fill in its node and
1378 	 * proximity domain IDs (if they haven't been set already)
1379 	 */
1380 	for (i = 0; i < nentries; i++) {
1381 		/*
1382 		 * Skip nonexistent entries and ones without matching APIC ID
1383 		 */
1384 		if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
1385 			continue;
1386 
1387 		/*
1388 		 * Just return if entry completely and correctly filled in
1389 		 * already
1390 		 */
1391 		if (cpu_node[i].prox_domain == domain &&
1392 		    cpu_node[i].node == node)
1393 			return (1);
1394 
1395 		/*
1396 		 * It's invalid to have more than one entry with the same
1397 		 * local APIC ID in SRAT table.
1398 		 */
1399 		if (cpu_node[i].node != UINT_MAX)
1400 			return (-2);
1401 
1402 		/*
1403 		 * Fill in node and proximity domain IDs
1404 		 */
1405 		cpu_node[i].prox_domain = domain;
1406 		cpu_node[i].node = node;
1407 
1408 		return (0);
1409 	}
1410 
1411 	/*
1412 	 * It's possible that an apicid doesn't exist in the cpu_node map due
1413 	 * to user limits number of CPUs powered on at boot by specifying the
1414 	 * boot_ncpus kernel option.
1415 	 */
1416 	return (2);
1417 }
1418 
1419 
1420 /*
1421  * Get node ID for given CPU
1422  */
1423 static int
1424 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
1425     int cpu_node_nentries)
1426 {
1427 	processorid_t	cpuid;
1428 
1429 	if (cp == NULL)
1430 		return (-1);
1431 
1432 	cpuid = cp->cpu_id;
1433 	if (cpuid < 0 || cpuid >= max_ncpus)
1434 		return (-1);
1435 
1436 	/*
1437 	 * SRAT doesn't exist, isn't enabled, or there was an error processing
1438 	 * it, so return node ID for Opteron and -1 otherwise.
1439 	 */
1440 	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
1441 	    lgrp_plat_srat_error) {
1442 		if (is_opteron())
1443 			return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
1444 		return (-1);
1445 	}
1446 
1447 	/*
1448 	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
1449 	 * CPU
1450 	 */
1451 	if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
1452 		return (-1);
1453 
1454 	return (cpu_node[cpuid].node);
1455 }
1456 
1457 
1458 /*
1459  * Return node number for given proximity domain/system locality
1460  */
1461 static int
1462 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
1463     uint32_t domain)
1464 {
1465 	uint_t	node;
1466 	uint_t	start;
1467 
1468 	/*
1469 	 * Hash proximity domain ID into node to domain mapping table (array),
1470 	 * search for entry with matching proximity domain ID, and return index
1471 	 * of matching entry as node ID.
1472 	 */
1473 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
1474 	do {
1475 		if (node_domain[node].exists) {
1476 			membar_consumer();
1477 			if (node_domain[node].prox_domain == domain)
1478 				return (node);
1479 		}
1480 		node = (node + 1) % node_cnt;
1481 	} while (node != start);
1482 	return (-1);
1483 }
1484 
1485 
1486 /*
1487  * Get NUMA configuration of machine
1488  */
1489 static void
1490 lgrp_plat_get_numa_config(void)
1491 {
1492 	uint_t		probe_op;
1493 
1494 	/*
1495 	 * Read boot property with CPU to APIC ID mapping table/array to
1496 	 * determine number of CPUs
1497 	 */
1498 	lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
1499 
1500 	/*
1501 	 * Determine which CPUs and memory are local to each other and number
1502 	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
1503 	 */
1504 	if (lgrp_plat_apic_ncpus > 0) {
1505 		int	retval;
1506 
1507 		/* Reserve enough resources if CPU DR is enabled. */
1508 		if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus)
1509 			lgrp_plat_cpu_node_nentries = max_ncpus;
1510 		else
1511 			lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
1512 
1513 		/*
1514 		 * Temporarily allocate boot memory to use for CPU to node
1515 		 * mapping since kernel memory allocator isn't alive yet
1516 		 */
1517 		lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
1518 		    NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t),
1519 		    sizeof (int));
1520 
1521 		ASSERT(lgrp_plat_cpu_node != NULL);
1522 		if (lgrp_plat_cpu_node) {
1523 			bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
1524 			    sizeof (cpu_node_map_t));
1525 		} else {
1526 			lgrp_plat_cpu_node_nentries = 0;
1527 		}
1528 
1529 		/*
1530 		 * Fill in CPU to node ID mapping table with APIC ID for each
1531 		 * CPU
1532 		 */
1533 		(void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
1534 
1535 		retval = lgrp_plat_process_srat(srat_ptr, msct_ptr,
1536 		    &lgrp_plat_prox_domain_min,
1537 		    lgrp_plat_node_domain, lgrp_plat_cpu_node,
1538 		    lgrp_plat_apic_ncpus, lgrp_plat_memnode_info);
1539 		if (retval <= 0) {
1540 			lgrp_plat_srat_error = retval;
1541 			lgrp_plat_node_cnt = 1;
1542 		} else {
1543 			lgrp_plat_srat_error = 0;
1544 			lgrp_plat_node_cnt = retval;
1545 		}
1546 	}
1547 
1548 	/*
1549 	 * Try to use PCI config space registers on Opteron if there's an error
1550 	 * processing CPU to APIC ID mapping or SRAT
1551 	 */
1552 	if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
1553 	    is_opteron())
1554 		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
1555 		    lgrp_plat_memnode_info);
1556 
1557 	/*
1558 	 * Don't bother to setup system for multiple lgroups and only use one
1559 	 * memory node when memory is interleaved between any nodes or there is
1560 	 * only one NUMA node
1561 	 */
1562 	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
1563 		lgrp_plat_node_cnt = max_mem_nodes = 1;
1564 		(void) lgrp_topo_ht_limit_set(1);
1565 		return;
1566 	}
1567 
1568 	/*
1569 	 * Leaf lgroups on x86/x64 architectures contain one physical
1570 	 * processor chip. Tune lgrp_expand_proc_thresh and
1571 	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
1572 	 * things out aggressively.
1573 	 */
1574 	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
1575 	lgrp_expand_proc_diff = 0;
1576 
1577 	/*
1578 	 * There should be one memnode (physical page free list(s)) for
1579 	 * each node if memory DR is disabled.
1580 	 */
1581 	max_mem_nodes = lgrp_plat_node_cnt;
1582 
1583 	/*
1584 	 * Initialize min and max latency before reading SLIT or probing
1585 	 */
1586 	lgrp_plat_lat_stats.latency_min = -1;
1587 	lgrp_plat_lat_stats.latency_max = 0;
1588 
1589 	/*
1590 	 * Determine how far each NUMA node is from each other by
1591 	 * reading ACPI System Locality Information Table (SLIT) if it
1592 	 * exists
1593 	 */
1594 	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
1595 	    lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info,
1596 	    &lgrp_plat_lat_stats);
1597 
1598 	/*
1599 	 * Disable support of CPU/memory DR operations if multiple locality
1600 	 * domains exist in system and either of following is true.
1601 	 * 1) Failed to process SLIT table.
1602 	 * 2) Latency probing is enabled by user.
1603 	 */
1604 	if (lgrp_plat_node_cnt > 1 &&
1605 	    (plat_dr_support_cpu() || plat_dr_support_memory())) {
1606 		if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 ||
1607 		    !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 ||
1608 		    lgrp_plat_apic_ncpus <= 0) {
1609 			cmn_err(CE_CONT,
1610 			    "?lgrp: failed to process ACPI SRAT/SLIT table, "
1611 			    "disable support of CPU/memory DR operations.");
1612 			plat_dr_disable_cpu();
1613 			plat_dr_disable_memory();
1614 		} else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) {
1615 			cmn_err(CE_CONT,
1616 			    "?lgrp: latency probing enabled by user, "
1617 			    "disable support of CPU/memory DR operations.");
1618 			plat_dr_disable_cpu();
1619 			plat_dr_disable_memory();
1620 		}
1621 	}
1622 
1623 	/* Done if succeeded to process SLIT table. */
1624 	if (lgrp_plat_slit_error == 0)
1625 		return;
1626 
1627 	/*
1628 	 * Probe to determine latency between NUMA nodes when SLIT
1629 	 * doesn't exist or make sense
1630 	 */
1631 	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
1632 
1633 	/*
1634 	 * Specify whether to probe using vendor ID register or page copy
1635 	 * if hasn't been specified already or is overspecified
1636 	 */
1637 	probe_op = lgrp_plat_probe_flags &
1638 	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1639 
1640 	if (probe_op == 0 ||
1641 	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
1642 		lgrp_plat_probe_flags &=
1643 		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1644 		if (is_opteron())
1645 			lgrp_plat_probe_flags |=
1646 			    LGRP_PLAT_PROBE_VENDOR;
1647 		else
1648 			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
1649 	}
1650 
1651 	/*
1652 	 * Probing errors can mess up the lgroup topology and
1653 	 * force us fall back to a 2 level lgroup topology.
1654 	 * Here we bound how tall the lgroup topology can grow
1655 	 * in hopes of avoiding any anamolies in probing from
1656 	 * messing up the lgroup topology by limiting the
1657 	 * accuracy of the latency topology.
1658 	 *
1659 	 * Assume that nodes will at least be configured in a
1660 	 * ring, so limit height of lgroup topology to be less
1661 	 * than number of nodes on a system with 4 or more
1662 	 * nodes
1663 	 */
1664 	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
1665 	    lgrp_topo_ht_limit_default())
1666 		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
1667 }
1668 
1669 
1670 /*
1671  * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
1672  * be considered same
1673  */
1674 #define	LGRP_LAT_TOLERANCE_SHIFT	4
1675 
1676 int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
1677 
1678 
1679 /*
1680  * Adjust latencies between nodes to be symmetric, normalize latencies between
1681  * any nodes that are within some tolerance to be same, and make local
1682  * latencies be same
1683  */
1684 static void
1685 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
1686     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1687 {
1688 	int				i;
1689 	int				j;
1690 	int				k;
1691 	int				l;
1692 	u_longlong_t			max;
1693 	u_longlong_t			min;
1694 	u_longlong_t			t;
1695 	u_longlong_t			t1;
1696 	u_longlong_t			t2;
1697 	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
1698 	int				lat_corrected[MAX_NODES][MAX_NODES];
1699 
1700 	t = 0;
1701 	/*
1702 	 * Nothing to do when this is an UMA machine or don't have args needed
1703 	 */
1704 	if (max_mem_nodes == 1)
1705 		return;
1706 
1707 	ASSERT(memnode_info != NULL && lat_stats != NULL &&
1708 	    probe_stats != NULL);
1709 
1710 	/*
1711 	 * Make sure that latencies are symmetric between any two nodes
1712 	 * (ie. latency(node0, node1) == latency(node1, node0))
1713 	 */
1714 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1715 		if (!memnode_info[i].exists)
1716 			continue;
1717 
1718 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1719 			if (!memnode_info[j].exists)
1720 				continue;
1721 
1722 			t1 = lat_stats->latencies[i][j];
1723 			t2 = lat_stats->latencies[j][i];
1724 
1725 			if (t1 == 0 || t2 == 0 || t1 == t2)
1726 				continue;
1727 
1728 			/*
1729 			 * Latencies should be same
1730 			 * - Use minimum of two latencies which should be same
1731 			 * - Track suspect probe times not within tolerance of
1732 			 *   min value
1733 			 * - Remember how much values are corrected by
1734 			 */
1735 			if (t1 > t2) {
1736 				t = t2;
1737 				probe_stats->probe_errors[i][j] += t1 - t2;
1738 				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
1739 					probe_stats->probe_suspect[i][j]++;
1740 					probe_stats->probe_suspect[j][i]++;
1741 				}
1742 			} else if (t2 > t1) {
1743 				t = t1;
1744 				probe_stats->probe_errors[j][i] += t2 - t1;
1745 				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
1746 					probe_stats->probe_suspect[i][j]++;
1747 					probe_stats->probe_suspect[j][i]++;
1748 				}
1749 			}
1750 
1751 			lat_stats->latencies[i][j] =
1752 			    lat_stats->latencies[j][i] = t;
1753 			lgrp_config(cflag, t1, t);
1754 			lgrp_config(cflag, t2, t);
1755 		}
1756 	}
1757 
1758 	/*
1759 	 * Keep track of which latencies get corrected
1760 	 */
1761 	for (i = 0; i < MAX_NODES; i++)
1762 		for (j = 0; j < MAX_NODES; j++)
1763 			lat_corrected[i][j] = 0;
1764 
1765 	/*
1766 	 * For every two nodes, see whether there is another pair of nodes which
1767 	 * are about the same distance apart and make the latencies be the same
1768 	 * if they are close enough together
1769 	 */
1770 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1771 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1772 			if (!memnode_info[j].exists)
1773 				continue;
1774 			/*
1775 			 * Pick one pair of nodes (i, j)
1776 			 * and get latency between them
1777 			 */
1778 			t1 = lat_stats->latencies[i][j];
1779 
1780 			/*
1781 			 * Skip this pair of nodes if there isn't a latency
1782 			 * for it yet
1783 			 */
1784 			if (t1 == 0)
1785 				continue;
1786 
1787 			for (k = 0; k < lgrp_plat_node_cnt; k++) {
1788 				for (l = 0; l < lgrp_plat_node_cnt; l++) {
1789 					if (!memnode_info[l].exists)
1790 						continue;
1791 					/*
1792 					 * Pick another pair of nodes (k, l)
1793 					 * not same as (i, j) and get latency
1794 					 * between them
1795 					 */
1796 					if (k == i && l == j)
1797 						continue;
1798 
1799 					t2 = lat_stats->latencies[k][l];
1800 
1801 					/*
1802 					 * Skip this pair of nodes if there
1803 					 * isn't a latency for it yet
1804 					 */
1805 
1806 					if (t2 == 0)
1807 						continue;
1808 
1809 					/*
1810 					 * Skip nodes (k, l) if they already
1811 					 * have same latency as (i, j) or
1812 					 * their latency isn't close enough to
1813 					 * be considered/made the same
1814 					 */
1815 					if (t1 == t2 || (t1 > t2 && t1 - t2 >
1816 					    t1 >> lgrp_plat_probe_lt_shift) ||
1817 					    (t2 > t1 && t2 - t1 >
1818 					    t2 >> lgrp_plat_probe_lt_shift))
1819 						continue;
1820 
1821 					/*
1822 					 * Make latency(i, j) same as
1823 					 * latency(k, l), try to use latency
1824 					 * that has been adjusted already to get
1825 					 * more consistency (if possible), and
1826 					 * remember which latencies were
1827 					 * adjusted for next time
1828 					 */
1829 					if (lat_corrected[i][j]) {
1830 						t = t1;
1831 						lgrp_config(cflag, t2, t);
1832 						t2 = t;
1833 					} else if (lat_corrected[k][l]) {
1834 						t = t2;
1835 						lgrp_config(cflag, t1, t);
1836 						t1 = t;
1837 					} else {
1838 						if (t1 > t2)
1839 							t = t2;
1840 						else
1841 							t = t1;
1842 						lgrp_config(cflag, t1, t);
1843 						lgrp_config(cflag, t2, t);
1844 						t1 = t2 = t;
1845 					}
1846 
1847 					lat_stats->latencies[i][j] =
1848 					    lat_stats->latencies[k][l] = t;
1849 
1850 					lat_corrected[i][j] =
1851 					    lat_corrected[k][l] = 1;
1852 				}
1853 			}
1854 		}
1855 	}
1856 
1857 	/*
1858 	 * Local latencies should be same
1859 	 * - Find min and max local latencies
1860 	 * - Make all local latencies be minimum
1861 	 */
1862 	min = -1;
1863 	max = 0;
1864 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1865 		if (!memnode_info[i].exists)
1866 			continue;
1867 		t = lat_stats->latencies[i][i];
1868 		if (t == 0)
1869 			continue;
1870 		if (min == -1 || t < min)
1871 			min = t;
1872 		if (t > max)
1873 			max = t;
1874 	}
1875 	if (min != max) {
1876 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
1877 			int	local;
1878 
1879 			if (!memnode_info[i].exists)
1880 				continue;
1881 
1882 			local = lat_stats->latencies[i][i];
1883 			if (local == 0)
1884 				continue;
1885 
1886 			/*
1887 			 * Track suspect probe times that aren't within
1888 			 * tolerance of minimum local latency and how much
1889 			 * probe times are corrected by
1890 			 */
1891 			if (local - min > min >> lgrp_plat_probe_lt_shift)
1892 				probe_stats->probe_suspect[i][i]++;
1893 
1894 			probe_stats->probe_errors[i][i] += local - min;
1895 
1896 			/*
1897 			 * Make local latencies be minimum
1898 			 */
1899 			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
1900 			lat_stats->latencies[i][i] = min;
1901 		}
1902 	}
1903 
1904 	/*
1905 	 * Determine max probe time again since just adjusted latencies
1906 	 */
1907 	lat_stats->latency_max = 0;
1908 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1909 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1910 			if (!memnode_info[j].exists)
1911 				continue;
1912 			t = lat_stats->latencies[i][j];
1913 			if (t > lat_stats->latency_max)
1914 				lat_stats->latency_max = t;
1915 		}
1916 	}
1917 }
1918 
1919 
1920 /*
1921  * Verify following about latencies between nodes:
1922  *
1923  * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
1924  * - Local latencies same
1925  * - Local < remote
1926  * - Number of latencies seen is reasonable
1927  * - Number of occurrences of a given latency should be more than 1
1928  *
1929  * Returns:
1930  *	0	Success
1931  *	-1	Not symmetric
1932  *	-2	Local latencies not same
1933  *	-3	Local >= remote
1934  */
1935 static int
1936 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
1937     lgrp_plat_latency_stats_t *lat_stats)
1938 {
1939 	int				i;
1940 	int				j;
1941 	u_longlong_t			t1;
1942 	u_longlong_t			t2;
1943 
1944 	ASSERT(memnode_info != NULL && lat_stats != NULL);
1945 
1946 	/*
1947 	 * Nothing to do when this is an UMA machine, lgroup topology is
1948 	 * limited to 2 levels, or there aren't any probe times yet
1949 	 */
1950 	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
1951 	    lat_stats->latencies[0][0] == 0)
1952 		return (0);
1953 
1954 	/*
1955 	 * Make sure that latencies are symmetric between any two nodes
1956 	 * (ie. latency(node0, node1) == latency(node1, node0))
1957 	 */
1958 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
1959 		if (!memnode_info[i].exists)
1960 			continue;
1961 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
1962 			if (!memnode_info[j].exists)
1963 				continue;
1964 			t1 = lat_stats->latencies[i][j];
1965 			t2 = lat_stats->latencies[j][i];
1966 
1967 			if (t1 == 0 || t2 == 0 || t1 == t2)
1968 				continue;
1969 
1970 			return (-1);
1971 		}
1972 	}
1973 
1974 	/*
1975 	 * Local latencies should be same
1976 	 */
1977 	t1 = lat_stats->latencies[0][0];
1978 	for (i = 1; i < lgrp_plat_node_cnt; i++) {
1979 		if (!memnode_info[i].exists)
1980 			continue;
1981 
1982 		t2 = lat_stats->latencies[i][i];
1983 		if (t2 == 0)
1984 			continue;
1985 
1986 		if (t1 == 0) {
1987 			t1 = t2;
1988 			continue;
1989 		}
1990 
1991 		if (t1 != t2)
1992 			return (-2);
1993 	}
1994 
1995 	/*
1996 	 * Local latencies should be less than remote
1997 	 */
1998 	if (t1) {
1999 		for (i = 0; i < lgrp_plat_node_cnt; i++) {
2000 			for (j = 0; j < lgrp_plat_node_cnt; j++) {
2001 				if (!memnode_info[j].exists)
2002 					continue;
2003 				t2 = lat_stats->latencies[i][j];
2004 				if (i == j || t2 == 0)
2005 					continue;
2006 
2007 				if (t1 >= t2)
2008 					return (-3);
2009 			}
2010 		}
2011 	}
2012 
2013 	return (0);
2014 }
2015 
2016 
2017 /*
2018  * Platform-specific initialization
2019  */
2020 static void
2021 lgrp_plat_main_init(void)
2022 {
2023 	int	curnode;
2024 	int	ht_limit;
2025 	int	i;
2026 
2027 	/*
2028 	 * Print a notice that MPO is disabled when memory is interleaved
2029 	 * across nodes....Would do this when it is discovered, but can't
2030 	 * because it happens way too early during boot....
2031 	 */
2032 	if (lgrp_plat_mem_intrlv)
2033 		cmn_err(CE_NOTE,
2034 		    "MPO disabled because memory is interleaved\n");
2035 
2036 	/*
2037 	 * Don't bother to do any probing if it is disabled, there is only one
2038 	 * node, or the height of the lgroup topology less than or equal to 2
2039 	 */
2040 	ht_limit = lgrp_topo_ht_limit();
2041 	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2042 	    max_mem_nodes == 1 || ht_limit <= 2) {
2043 		/*
2044 		 * Setup lgroup latencies for 2 level lgroup topology
2045 		 * (ie. local and remote only) if they haven't been set yet
2046 		 */
2047 		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
2048 		    lgrp_plat_lat_stats.latency_max == 0)
2049 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
2050 		return;
2051 	}
2052 
2053 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2054 		/*
2055 		 * Should have been able to probe from CPU 0 when it was added
2056 		 * to lgroup hierarchy, but may not have been able to then
2057 		 * because it happens so early in boot that gethrtime() hasn't
2058 		 * been initialized.  (:-(
2059 		 */
2060 		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
2061 		    lgrp_plat_cpu_node_nentries);
2062 		ASSERT3U(curnode, <, lgrp_plat_node_cnt);
2063 		if (curnode == (lgrp_handle_t)-1)
2064 			return;
2065 		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
2066 			lgrp_plat_probe();
2067 
2068 		return;
2069 	}
2070 
2071 	/*
2072 	 * When probing memory, use one page for every sample to determine
2073 	 * lgroup topology and taking multiple samples
2074 	 */
2075 	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
2076 		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
2077 		    lgrp_plat_probe_nsamples;
2078 
2079 	/*
2080 	 * Map memory in each node needed for probing to determine latency
2081 	 * topology
2082 	 */
2083 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2084 		int	mnode;
2085 
2086 		/*
2087 		 * Skip this node and leave its probe page NULL
2088 		 * if it doesn't have any memory
2089 		 */
2090 		mnode = i;
2091 		if (!mem_node_config[mnode].exists) {
2092 			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
2093 			continue;
2094 		}
2095 
2096 		/*
2097 		 * Allocate one kernel virtual page
2098 		 */
2099 		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
2100 		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
2101 		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
2102 			cmn_err(CE_WARN,
2103 			    "lgrp_plat_main_init: couldn't allocate memory");
2104 			return;
2105 		}
2106 
2107 		/*
2108 		 * Get PFN for first page in each node
2109 		 */
2110 		lgrp_plat_probe_mem_config.probe_pfn[i] =
2111 		    mem_node_config[mnode].physbase;
2112 
2113 		/*
2114 		 * Map virtual page to first page in node
2115 		 */
2116 		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
2117 		    lgrp_plat_probe_mem_config.probe_memsize,
2118 		    lgrp_plat_probe_mem_config.probe_pfn[i],
2119 		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
2120 		    HAT_LOAD_NOCONSIST);
2121 	}
2122 
2123 	/*
2124 	 * Probe from current CPU
2125 	 */
2126 	lgrp_plat_probe();
2127 }
2128 
2129 
2130 /*
2131  * Return the number of free, allocatable, or installed
2132  * pages in an lgroup
2133  * This is a copy of the MAX_MEM_NODES == 1 version of the routine
2134  * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
2135  */
2136 static pgcnt_t
2137 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
2138 {
2139 	_NOTE(ARGUNUSED(lgrphand));
2140 
2141 	struct memlist *mlist;
2142 	pgcnt_t npgs = 0;
2143 	extern struct memlist *phys_avail;
2144 	extern struct memlist *phys_install;
2145 
2146 	switch (query) {
2147 	case LGRP_MEM_SIZE_FREE:
2148 		return ((pgcnt_t)freemem);
2149 	case LGRP_MEM_SIZE_AVAIL:
2150 		memlist_read_lock();
2151 		for (mlist = phys_avail; mlist; mlist = mlist->ml_next)
2152 			npgs += btop(mlist->ml_size);
2153 		memlist_read_unlock();
2154 		return (npgs);
2155 	case LGRP_MEM_SIZE_INSTALL:
2156 		memlist_read_lock();
2157 		for (mlist = phys_install; mlist; mlist = mlist->ml_next)
2158 			npgs += btop(mlist->ml_size);
2159 		memlist_read_unlock();
2160 		return (npgs);
2161 	default:
2162 		return ((pgcnt_t)0);
2163 	}
2164 }
2165 
2166 
2167 /*
2168  * Update node to proximity domain mappings for given domain and return node ID
2169  */
2170 static int
2171 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
2172     uint32_t domain)
2173 {
2174 	uint_t	node;
2175 	uint_t	start;
2176 
2177 	/*
2178 	 * Hash proximity domain ID into node to domain mapping table (array)
2179 	 * and add entry for it into first non-existent or matching entry found
2180 	 */
2181 	node = start = NODE_DOMAIN_HASH(domain, node_cnt);
2182 	do {
2183 		/*
2184 		 * Entry doesn't exist yet, so create one for this proximity
2185 		 * domain and return node ID which is index into mapping table.
2186 		 */
2187 		if (!node_domain[node].exists) {
2188 			node_domain[node].prox_domain = domain;
2189 			membar_producer();
2190 			node_domain[node].exists = 1;
2191 			return (node);
2192 		}
2193 
2194 		/*
2195 		 * Entry exists for this proximity domain already, so just
2196 		 * return node ID (index into table).
2197 		 */
2198 		if (node_domain[node].prox_domain == domain)
2199 			return (node);
2200 		node = NODE_DOMAIN_HASH(node + 1, node_cnt);
2201 	} while (node != start);
2202 
2203 	/*
2204 	 * Ran out of supported number of entries which shouldn't happen....
2205 	 */
2206 	ASSERT(node != start);
2207 	return (-1);
2208 }
2209 
2210 /*
2211  * Update node memory information for given proximity domain with specified
2212  * starting and ending physical address range (and return positive numbers for
2213  * success and negative ones for errors)
2214  */
2215 static int
2216 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt,
2217     memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start,
2218     uint64_t end, uint32_t domain, uint32_t device_id)
2219 {
2220 	int	node, mnode;
2221 
2222 	/*
2223 	 * Get node number for proximity domain
2224 	 */
2225 	node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
2226 	if (node == -1) {
2227 		node = lgrp_plat_node_domain_update(node_domain, node_cnt,
2228 		    domain);
2229 		if (node == -1)
2230 			return (-1);
2231 	}
2232 
2233 	/*
2234 	 * This function is called during boot if device_id is
2235 	 * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for
2236 	 * memory DR operations.
2237 	 */
2238 	if (device_id != ACPI_MEMNODE_DEVID_BOOT) {
2239 		ASSERT(lgrp_plat_max_mem_node <= memnode_cnt);
2240 
2241 		for (mnode = lgrp_plat_node_cnt;
2242 		    mnode < lgrp_plat_max_mem_node; mnode++) {
2243 			if (memnode_info[mnode].exists &&
2244 			    memnode_info[mnode].prox_domain == domain &&
2245 			    memnode_info[mnode].device_id == device_id) {
2246 				if (btop(start) < memnode_info[mnode].start)
2247 					memnode_info[mnode].start = btop(start);
2248 				if (btop(end) > memnode_info[mnode].end)
2249 					memnode_info[mnode].end = btop(end);
2250 				return (1);
2251 			}
2252 		}
2253 
2254 		if (lgrp_plat_max_mem_node >= memnode_cnt) {
2255 			return (-3);
2256 		} else {
2257 			lgrp_plat_max_mem_node++;
2258 			memnode_info[mnode].start = btop(start);
2259 			memnode_info[mnode].end = btop(end);
2260 			memnode_info[mnode].prox_domain = domain;
2261 			memnode_info[mnode].device_id = device_id;
2262 			memnode_info[mnode].lgrphand = node;
2263 			membar_producer();
2264 			memnode_info[mnode].exists = 1;
2265 			return (0);
2266 		}
2267 	}
2268 
2269 	/*
2270 	 * Create entry in table for node if it doesn't exist
2271 	 */
2272 	ASSERT(node < memnode_cnt);
2273 	if (!memnode_info[node].exists) {
2274 		memnode_info[node].start = btop(start);
2275 		memnode_info[node].end = btop(end);
2276 		memnode_info[node].prox_domain = domain;
2277 		memnode_info[node].device_id = device_id;
2278 		memnode_info[node].lgrphand = node;
2279 		membar_producer();
2280 		memnode_info[node].exists = 1;
2281 		return (0);
2282 	}
2283 
2284 	/*
2285 	 * Entry already exists for this proximity domain
2286 	 *
2287 	 * There may be more than one SRAT memory entry for a domain, so we may
2288 	 * need to update existing start or end address for the node.
2289 	 */
2290 	if (memnode_info[node].prox_domain == domain) {
2291 		if (btop(start) < memnode_info[node].start)
2292 			memnode_info[node].start = btop(start);
2293 		if (btop(end) > memnode_info[node].end)
2294 			memnode_info[node].end = btop(end);
2295 		return (1);
2296 	}
2297 	return (-2);
2298 }
2299 
2300 
2301 /*
2302  * Have to sort nodes by starting physical address because plat_mnode_xcheck()
2303  * assumes and expects memnodes to be sorted in ascending order by physical
2304  * address.
2305  */
2306 static void
2307 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
2308     cpu_node_map_t *cpu_node, int cpu_count,
2309     memnode_phys_addr_map_t *memnode_info)
2310 {
2311 	boolean_t	found;
2312 	int		i;
2313 	int		j;
2314 	int		n;
2315 	boolean_t	sorted;
2316 	boolean_t	swapped;
2317 
2318 	if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
2319 	    node_domain == NULL || memnode_info == NULL)
2320 		return;
2321 
2322 	/*
2323 	 * Sorted already?
2324 	 */
2325 	sorted = B_TRUE;
2326 	for (i = 0; i < node_cnt - 1; i++) {
2327 		/*
2328 		 * Skip entries that don't exist
2329 		 */
2330 		if (!memnode_info[i].exists)
2331 			continue;
2332 
2333 		/*
2334 		 * Try to find next existing entry to compare against
2335 		 */
2336 		found = B_FALSE;
2337 		for (j = i + 1; j < node_cnt; j++) {
2338 			if (memnode_info[j].exists) {
2339 				found = B_TRUE;
2340 				break;
2341 			}
2342 		}
2343 
2344 		/*
2345 		 * Done if no more existing entries to compare against
2346 		 */
2347 		if (found == B_FALSE)
2348 			break;
2349 
2350 		/*
2351 		 * Not sorted if starting address of current entry is bigger
2352 		 * than starting address of next existing entry
2353 		 */
2354 		if (memnode_info[i].start > memnode_info[j].start) {
2355 			sorted = B_FALSE;
2356 			break;
2357 		}
2358 	}
2359 
2360 	/*
2361 	 * Don't need to sort if sorted already
2362 	 */
2363 	if (sorted == B_TRUE)
2364 		return;
2365 
2366 	/*
2367 	 * Just use bubble sort since number of nodes is small
2368 	 */
2369 	n = node_cnt;
2370 	do {
2371 		swapped = B_FALSE;
2372 		n--;
2373 		for (i = 0; i < n; i++) {
2374 			/*
2375 			 * Skip entries that don't exist
2376 			 */
2377 			if (!memnode_info[i].exists)
2378 				continue;
2379 
2380 			/*
2381 			 * Try to find next existing entry to compare against
2382 			 */
2383 			found = B_FALSE;
2384 			for (j = i + 1; j <= n; j++) {
2385 				if (memnode_info[j].exists) {
2386 					found = B_TRUE;
2387 					break;
2388 				}
2389 			}
2390 
2391 			/*
2392 			 * Done if no more existing entries to compare against
2393 			 */
2394 			if (found == B_FALSE)
2395 				break;
2396 
2397 			if (memnode_info[i].start > memnode_info[j].start) {
2398 				memnode_phys_addr_map_t	save_addr;
2399 				node_domain_map_t	save_node;
2400 
2401 				/*
2402 				 * Swap node to proxmity domain ID assignments
2403 				 */
2404 				bcopy(&node_domain[i], &save_node,
2405 				    sizeof (node_domain_map_t));
2406 				bcopy(&node_domain[j], &node_domain[i],
2407 				    sizeof (node_domain_map_t));
2408 				bcopy(&save_node, &node_domain[j],
2409 				    sizeof (node_domain_map_t));
2410 
2411 				/*
2412 				 * Swap node to physical memory assignments
2413 				 */
2414 				bcopy(&memnode_info[i], &save_addr,
2415 				    sizeof (memnode_phys_addr_map_t));
2416 				bcopy(&memnode_info[j], &memnode_info[i],
2417 				    sizeof (memnode_phys_addr_map_t));
2418 				bcopy(&save_addr, &memnode_info[j],
2419 				    sizeof (memnode_phys_addr_map_t));
2420 				swapped = B_TRUE;
2421 			}
2422 		}
2423 	} while (swapped == B_TRUE);
2424 
2425 	/*
2426 	 * Check to make sure that CPUs assigned to correct node IDs now since
2427 	 * node to proximity domain ID assignments may have been changed above
2428 	 */
2429 	if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
2430 		return;
2431 	for (i = 0; i < cpu_count; i++) {
2432 		int		node;
2433 
2434 		node = lgrp_plat_domain_to_node(node_domain, node_cnt,
2435 		    cpu_node[i].prox_domain);
2436 		if (cpu_node[i].node != node)
2437 			cpu_node[i].node = node;
2438 	}
2439 
2440 }
2441 
2442 
2443 /*
2444  * Return time needed to probe from current CPU to memory in given node
2445  */
2446 static hrtime_t
2447 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
2448     lgrp_plat_probe_mem_config_t *probe_mem_config,
2449     lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2450 {
2451 	caddr_t			buf;
2452 	hrtime_t		elapsed;
2453 	hrtime_t		end;
2454 	int			from;
2455 	int			i;
2456 	int			ipl;
2457 	hrtime_t		max;
2458 	hrtime_t		min;
2459 	hrtime_t		start;
2460 	extern int		use_sse_pagecopy;
2461 
2462 	/*
2463 	 * Determine ID of node containing current CPU
2464 	 */
2465 	from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
2466 	ASSERT3U(from, <, lgrp_plat_node_cnt);
2467 	if (from == (lgrp_handle_t)-1)
2468 		return (0);
2469 
2470 	/*
2471 	 * Do common work for probing main memory
2472 	 */
2473 	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
2474 		/*
2475 		 * Skip probing any nodes without memory and
2476 		 * set probe time to 0
2477 		 */
2478 		if (probe_mem_config->probe_va[to] == NULL) {
2479 			lat_stats->latencies[from][to] = 0;
2480 			return (0);
2481 		}
2482 
2483 		/*
2484 		 * Invalidate caches once instead of once every sample
2485 		 * which should cut cost of probing by a lot
2486 		 */
2487 		probe_stats->flush_cost = gethrtime();
2488 		invalidate_cache();
2489 		probe_stats->flush_cost = gethrtime() -
2490 		    probe_stats->flush_cost;
2491 		probe_stats->probe_cost_total += probe_stats->flush_cost;
2492 	}
2493 
2494 	/*
2495 	 * Probe from current CPU to given memory using specified operation
2496 	 * and take specified number of samples
2497 	 */
2498 	max = 0;
2499 	min = -1;
2500 	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
2501 		probe_stats->probe_cost = gethrtime();
2502 
2503 		/*
2504 		 * Can't measure probe time if gethrtime() isn't working yet
2505 		 */
2506 		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
2507 			return (0);
2508 
2509 		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2510 			/*
2511 			 * Measure how long it takes to read vendor ID from
2512 			 * Northbridge
2513 			 */
2514 			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
2515 		} else {
2516 			/*
2517 			 * Measure how long it takes to copy page
2518 			 * on top of itself
2519 			 */
2520 			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
2521 
2522 			kpreempt_disable();
2523 			ipl = splhigh();
2524 			start = gethrtime();
2525 			if (use_sse_pagecopy)
2526 				hwblkpagecopy(buf, buf);
2527 			else
2528 				bcopy(buf, buf, PAGESIZE);
2529 			end = gethrtime();
2530 			elapsed = end - start;
2531 			splx(ipl);
2532 			kpreempt_enable();
2533 		}
2534 
2535 		probe_stats->probe_cost = gethrtime() -
2536 		    probe_stats->probe_cost;
2537 		probe_stats->probe_cost_total += probe_stats->probe_cost;
2538 
2539 		if (min == -1 || elapsed < min)
2540 			min = elapsed;
2541 		if (elapsed > max)
2542 			max = elapsed;
2543 	}
2544 
2545 	/*
2546 	 * Update minimum and maximum probe times between
2547 	 * these two nodes
2548 	 */
2549 	if (min < probe_stats->probe_min[from][to] ||
2550 	    probe_stats->probe_min[from][to] == 0)
2551 		probe_stats->probe_min[from][to] = min;
2552 
2553 	if (max > probe_stats->probe_max[from][to])
2554 		probe_stats->probe_max[from][to] = max;
2555 
2556 	return (min);
2557 }
2558 
2559 
2560 /*
2561  * Read boot property with CPU to APIC ID array, fill in CPU to node ID
2562  * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
2563  * and return number of CPU APIC IDs.
2564  *
2565  * NOTE: This code assumes that CPU IDs are assigned in order that they appear
2566  *       in in cpu_apicid_array boot property which is based on and follows
2567  *	 same ordering as processor list in ACPI MADT.  If the code in
2568  *	 usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
2569  *	 CPU IDs ever changes, then this code will need to change too....
2570  */
2571 static int
2572 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
2573 {
2574 	int	boot_prop_len;
2575 	char	*boot_prop_name = BP_CPU_APICID_ARRAY;
2576 	uint32_t *cpu_apicid_array;
2577 	int	i;
2578 	int	n;
2579 
2580 	/*
2581 	 * Check length of property value
2582 	 */
2583 	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
2584 	if (boot_prop_len <= 0)
2585 		return (-1);
2586 
2587 	/*
2588 	 * Calculate number of entries in array and return when the system is
2589 	 * not very interesting for NUMA. It's not interesting for NUMA if
2590 	 * system has only one CPU and doesn't support CPU hotplug.
2591 	 */
2592 	n = boot_prop_len / sizeof (*cpu_apicid_array);
2593 	if (n == 1 && !plat_dr_support_cpu())
2594 		return (-2);
2595 
2596 	cpu_apicid_array = (uint32_t *)BOP_ALLOC(bootops, NULL, boot_prop_len,
2597 	    sizeof (*cpu_apicid_array));
2598 	/*
2599 	 * Get CPU to APIC ID property value
2600 	 */
2601 	if (cpu_apicid_array == NULL ||
2602 	    BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
2603 		return (-3);
2604 
2605 	/*
2606 	 * Just return number of CPU APIC IDs if CPU to node mapping table is
2607 	 * NULL
2608 	 */
2609 	if (cpu_node == NULL) {
2610 		if (plat_dr_support_cpu() && n >= boot_ncpus) {
2611 			return (boot_ncpus);
2612 		} else {
2613 			return (n);
2614 		}
2615 	}
2616 
2617 	/*
2618 	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
2619 	 */
2620 	for (i = 0; i < n; i++) {
2621 		/* Only add boot CPUs into the map if CPU DR is enabled. */
2622 		if (plat_dr_support_cpu() && i >= boot_ncpus)
2623 			break;
2624 		cpu_node[i].exists = 1;
2625 		cpu_node[i].apicid = cpu_apicid_array[i];
2626 		cpu_node[i].prox_domain = UINT32_MAX;
2627 		cpu_node[i].node = UINT_MAX;
2628 	}
2629 
2630 	/*
2631 	 * Return number of CPUs based on number of APIC IDs
2632 	 */
2633 	return (i);
2634 }
2635 
2636 
2637 /*
2638  * Read ACPI System Locality Information Table (SLIT) to determine how far each
2639  * NUMA node is from each other
2640  */
2641 static int
2642 lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
2643     node_domain_map_t *node_domain, uint_t node_cnt,
2644     memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats)
2645 {
2646 	int		i;
2647 	int		j;
2648 	int		src;
2649 	int		dst;
2650 	int		localities;
2651 	hrtime_t	max;
2652 	hrtime_t	min;
2653 	int		retval;
2654 	uint8_t		*slit_entries;
2655 
2656 	if (tp == NULL || !lgrp_plat_slit_enable)
2657 		return (1);
2658 
2659 	if (lat_stats == NULL)
2660 		return (2);
2661 
2662 	localities = tp->LocalityCount;
2663 
2664 	min = lat_stats->latency_min;
2665 	max = lat_stats->latency_max;
2666 
2667 	/*
2668 	 * Fill in latency matrix based on SLIT entries
2669 	 */
2670 	slit_entries = tp->Entry;
2671 	for (i = 0; i < localities; i++) {
2672 		src = lgrp_plat_domain_to_node(node_domain,
2673 		    node_cnt, i);
2674 		if (src == -1)
2675 			continue;
2676 
2677 		for (j = 0; j < localities; j++) {
2678 			uint8_t	latency;
2679 
2680 			dst = lgrp_plat_domain_to_node(node_domain,
2681 			    node_cnt, j);
2682 			if (dst == -1)
2683 				continue;
2684 
2685 			latency = slit_entries[(i * localities) + j];
2686 			lat_stats->latencies[src][dst] = latency;
2687 			if (latency < min || min == -1)
2688 				min = latency;
2689 			if (latency > max)
2690 				max = latency;
2691 		}
2692 	}
2693 
2694 	/*
2695 	 * Verify that latencies/distances given in SLIT look reasonable
2696 	 */
2697 	retval = lgrp_plat_latency_verify(memnode_info, lat_stats);
2698 
2699 	if (retval) {
2700 		/*
2701 		 * Reinitialize (zero) latency table since SLIT doesn't look
2702 		 * right
2703 		 */
2704 		for (i = 0; i < localities; i++) {
2705 			for (j = 0; j < localities; j++)
2706 				lat_stats->latencies[i][j] = 0;
2707 		}
2708 	} else {
2709 		/*
2710 		 * Update min and max latencies seen since SLIT looks valid
2711 		 */
2712 		lat_stats->latency_min = min;
2713 		lat_stats->latency_max = max;
2714 	}
2715 
2716 	return (retval);
2717 }
2718 
2719 
2720 /*
2721  * Update lgrp latencies according to information returned by ACPI _SLI method.
2722  */
2723 static int
2724 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
2725     uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
2726     lgrp_plat_latency_stats_t *lat_stats)
2727 {
2728 	int		i;
2729 	int		src, dst;
2730 	uint8_t		latency;
2731 	hrtime_t	max, min;
2732 
2733 	if (lat_stats == NULL || sli_info == NULL ||
2734 	    sli_cnt == 0 || domain_id >= sli_cnt)
2735 		return (-1);
2736 
2737 	src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id);
2738 	if (src == -1) {
2739 		src = lgrp_plat_node_domain_update(node_domain, node_cnt,
2740 		    domain_id);
2741 		if (src == -1)
2742 			return (-1);
2743 	}
2744 
2745 	/*
2746 	 * Don't update latency info if topology has been flattened to 2 levels.
2747 	 */
2748 	if (lgrp_plat_topo_flatten != 0) {
2749 		return (0);
2750 	}
2751 
2752 	/*
2753 	 * Latency information for proximity domain is ready.
2754 	 * TODO: support adjusting latency information at runtime.
2755 	 */
2756 	if (lat_stats->latencies[src][src] != 0) {
2757 		return (0);
2758 	}
2759 
2760 	/* Validate latency information. */
2761 	for (i = 0; i < sli_cnt; i++) {
2762 		if (i == domain_id) {
2763 			if (sli_info[i] != ACPI_SLIT_SELF_LATENCY ||
2764 			    sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) {
2765 				return (-1);
2766 			}
2767 		} else {
2768 			if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY ||
2769 			    sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY ||
2770 			    sli_info[i] != sli_info[sli_cnt + i]) {
2771 				return (-1);
2772 			}
2773 		}
2774 	}
2775 
2776 	min = lat_stats->latency_min;
2777 	max = lat_stats->latency_max;
2778 	for (i = 0; i < sli_cnt; i++) {
2779 		dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i);
2780 		if (dst == -1)
2781 			continue;
2782 
2783 		ASSERT(sli_info[i] == sli_info[sli_cnt + i]);
2784 
2785 		/* Update row in latencies matrix. */
2786 		latency = sli_info[i];
2787 		lat_stats->latencies[src][dst] = latency;
2788 		if (latency < min || min == -1)
2789 			min = latency;
2790 		if (latency > max)
2791 			max = latency;
2792 
2793 		/* Update column in latencies matrix. */
2794 		latency = sli_info[sli_cnt + i];
2795 		lat_stats->latencies[dst][src] = latency;
2796 		if (latency < min || min == -1)
2797 			min = latency;
2798 		if (latency > max)
2799 			max = latency;
2800 	}
2801 	lat_stats->latency_min = min;
2802 	lat_stats->latency_max = max;
2803 
2804 	return (0);
2805 }
2806 
2807 
2808 /*
2809  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
2810  * and memory are local to each other in the same NUMA node and return number
2811  * of nodes
2812  */
2813 static int
2814 lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
2815     uint32_t *prox_domain_min, node_domain_map_t *node_domain,
2816     cpu_node_map_t *cpu_node, int cpu_count,
2817     memnode_phys_addr_map_t *memnode_info)
2818 {
2819 	ACPI_SUBTABLE_HEADER	*item, *srat_end;
2820 	int			i;
2821 	int			node_cnt;
2822 	int			proc_entry_count;
2823 	int			rc;
2824 
2825 	/*
2826 	 * Nothing to do when no SRAT or disabled
2827 	 */
2828 	if (tp == NULL || !lgrp_plat_srat_enable)
2829 		return (-1);
2830 
2831 	/*
2832 	 * Try to get domain information from MSCT table.
2833 	 * ACPI4.0: OSPM will use information provided by the MSCT only
2834 	 * when the System Resource Affinity Table (SRAT) exists.
2835 	 */
2836 	node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min);
2837 	if (node_cnt <= 0) {
2838 		/*
2839 		 * Determine number of nodes by counting number of proximity
2840 		 * domains in SRAT.
2841 		 */
2842 		node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
2843 	}
2844 	/*
2845 	 * Return if number of nodes is 1 or less since don't need to read SRAT.
2846 	 */
2847 	if (node_cnt == 1)
2848 		return (1);
2849 	else if (node_cnt <= 0)
2850 		return (-2);
2851 
2852 	/*
2853 	 * Walk through SRAT, examining each CPU and memory entry to determine
2854 	 * which CPUs and memory belong to which node.
2855 	 */
2856 	item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
2857 	srat_end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
2858 	proc_entry_count = 0;
2859 	while (item < srat_end) {
2860 		uint32_t	apic_id;
2861 		uint32_t	domain;
2862 		uint64_t	end;
2863 		uint64_t	length;
2864 		uint64_t	start;
2865 
2866 		switch (item->Type) {
2867 		case ACPI_SRAT_TYPE_CPU_AFFINITY: {	/* CPU entry */
2868 			ACPI_SRAT_CPU_AFFINITY *cpu =
2869 			    (ACPI_SRAT_CPU_AFFINITY *) item;
2870 
2871 			if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
2872 			    cpu_node == NULL)
2873 				break;
2874 
2875 			/*
2876 			 * Calculate domain (node) ID and fill in APIC ID to
2877 			 * domain/node mapping table
2878 			 */
2879 			domain = cpu->ProximityDomainLo;
2880 			for (i = 0; i < 3; i++) {
2881 				domain += cpu->ProximityDomainHi[i] <<
2882 				    ((i + 1) * 8);
2883 			}
2884 			apic_id = cpu->ApicId;
2885 
2886 			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2887 			    cpu_node, cpu_count, apic_id, domain);
2888 			if (rc < 0)
2889 				return (-3);
2890 			else if (rc == 0)
2891 				proc_entry_count++;
2892 			break;
2893 		}
2894 		case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {	/* memory entry */
2895 			ACPI_SRAT_MEM_AFFINITY *mem =
2896 			    (ACPI_SRAT_MEM_AFFINITY *)item;
2897 
2898 			if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED) ||
2899 			    memnode_info == NULL)
2900 				break;
2901 
2902 			/*
2903 			 * Get domain (node) ID and fill in domain/node
2904 			 * to memory mapping table
2905 			 */
2906 			domain = mem->ProximityDomain;
2907 			start = mem->BaseAddress;
2908 			length = mem->Length;
2909 			end = start + length - 1;
2910 
2911 			/*
2912 			 * According to ACPI 4.0, both ENABLE and HOTPLUG flags
2913 			 * may be set for memory address range entries in SRAT
2914 			 * table which are reserved for memory hot plug.
2915 			 * We intersect memory address ranges in SRAT table
2916 			 * with memory ranges in physinstalled to filter out
2917 			 * memory address ranges reserved for hot plug.
2918 			 */
2919 			if (mem->Flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
2920 				uint64_t	rstart = UINT64_MAX;
2921 				uint64_t	rend = 0;
2922 				struct memlist	*ml;
2923 				extern struct bootops	*bootops;
2924 
2925 				memlist_read_lock();
2926 				for (ml = bootops->boot_mem->physinstalled;
2927 				    ml; ml = ml->ml_next) {
2928 					uint64_t tstart = ml->ml_address;
2929 					uint64_t tend;
2930 
2931 					tend = ml->ml_address + ml->ml_size;
2932 					if (tstart > end || tend < start)
2933 						continue;
2934 					if (start > tstart)
2935 						tstart = start;
2936 					if (rstart > tstart)
2937 						rstart = tstart;
2938 					if (end < tend)
2939 						tend = end;
2940 					if (rend < tend)
2941 						rend = tend;
2942 				}
2943 				memlist_read_unlock();
2944 				start = rstart;
2945 				end = rend;
2946 				/* Skip this entry if no memory installed. */
2947 				if (start > end)
2948 					break;
2949 			}
2950 
2951 			if (lgrp_plat_memnode_info_update(node_domain,
2952 			    node_cnt, memnode_info, node_cnt,
2953 			    start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0)
2954 				return (-4);
2955 			break;
2956 		}
2957 		case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {	/* x2apic CPU */
2958 			ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
2959 			    (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
2960 
2961 			if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
2962 			    cpu_node == NULL)
2963 				break;
2964 
2965 			/*
2966 			 * Calculate domain (node) ID and fill in APIC ID to
2967 			 * domain/node mapping table
2968 			 */
2969 			domain = x2cpu->ProximityDomain;
2970 			apic_id = x2cpu->ApicId;
2971 
2972 			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2973 			    cpu_node, cpu_count, apic_id, domain);
2974 			if (rc < 0)
2975 				return (-3);
2976 			else if (rc == 0)
2977 				proc_entry_count++;
2978 			break;
2979 		}
2980 		default:
2981 			break;
2982 		}
2983 
2984 		item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
2985 	}
2986 
2987 	/*
2988 	 * Should have seen at least as many SRAT processor entries as CPUs
2989 	 */
2990 	if (proc_entry_count < cpu_count)
2991 		return (-5);
2992 
2993 	/*
2994 	 * Need to sort nodes by starting physical address since VM system
2995 	 * assumes and expects memnodes to be sorted in ascending order by
2996 	 * physical address
2997 	 */
2998 	lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
2999 	    memnode_info);
3000 
3001 	return (node_cnt);
3002 }
3003 
3004 
3005 /*
3006  * Allocate permanent memory for any temporary memory that we needed to
3007  * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
3008  * initialized and copy everything from temporary to permanent memory since
3009  * temporary boot memory will eventually be released during boot
3010  */
3011 static void
3012 lgrp_plat_release_bootstrap(void)
3013 {
3014 	void	*buf;
3015 	size_t	size;
3016 
3017 	if (lgrp_plat_cpu_node_nentries > 0) {
3018 		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
3019 		buf = kmem_alloc(size, KM_SLEEP);
3020 		bcopy(lgrp_plat_cpu_node, buf, size);
3021 		lgrp_plat_cpu_node = buf;
3022 	}
3023 }
3024 
3025 
3026 /*
3027  * Return number of proximity domains given in ACPI SRAT
3028  */
3029 static int
3030 lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp, uint32_t *prox_domain_min)
3031 {
3032 	int			domain_cnt;
3033 	uint32_t		domain_min;
3034 	ACPI_SUBTABLE_HEADER	*item, *end;
3035 	int			i;
3036 	node_domain_map_t	node_domain[MAX_NODES];
3037 
3038 
3039 	if (tp == NULL || !lgrp_plat_srat_enable)
3040 		return (1);
3041 
3042 	/*
3043 	 * Walk through SRAT to find minimum proximity domain ID
3044 	 */
3045 	domain_min = UINT32_MAX;
3046 	item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
3047 	end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
3048 	while (item < end) {
3049 		uint32_t	domain;
3050 
3051 		switch (item->Type) {
3052 		case ACPI_SRAT_TYPE_CPU_AFFINITY: {	/* CPU entry */
3053 			ACPI_SRAT_CPU_AFFINITY *cpu =
3054 			    (ACPI_SRAT_CPU_AFFINITY *) item;
3055 
3056 			if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3057 				item = (ACPI_SUBTABLE_HEADER *)
3058 				    ((uintptr_t)item + item->Length);
3059 				continue;
3060 			}
3061 			domain = cpu->ProximityDomainLo;
3062 			for (i = 0; i < 3; i++) {
3063 				domain += cpu->ProximityDomainHi[i] <<
3064 				    ((i + 1) * 8);
3065 			}
3066 			break;
3067 		}
3068 		case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {	/* memory entry */
3069 			ACPI_SRAT_MEM_AFFINITY *mem =
3070 			    (ACPI_SRAT_MEM_AFFINITY *)item;
3071 
3072 			if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
3073 				item = (ACPI_SUBTABLE_HEADER *)
3074 				    ((uintptr_t)item + item->Length);
3075 				continue;
3076 			}
3077 			domain = mem->ProximityDomain;
3078 			break;
3079 		}
3080 		case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {	/* x2apic CPU */
3081 			ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
3082 			    (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
3083 
3084 			if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3085 				item = (ACPI_SUBTABLE_HEADER *)
3086 				    ((uintptr_t)item + item->Length);
3087 				continue;
3088 			}
3089 			domain = x2cpu->ProximityDomain;
3090 			break;
3091 		}
3092 		default:
3093 			item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
3094 			    item->Length);
3095 			continue;
3096 		}
3097 
3098 		/*
3099 		 * Keep track of minimum proximity domain ID
3100 		 */
3101 		if (domain < domain_min)
3102 			domain_min = domain;
3103 
3104 		item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
3105 	}
3106 	if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
3107 		*prox_domain_min = domain_min;
3108 
3109 	/*
3110 	 * Walk through SRAT, examining each CPU and memory entry to determine
3111 	 * proximity domain ID for each.
3112 	 */
3113 	domain_cnt = 0;
3114 	item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
3115 	end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
3116 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
3117 	while (item < end) {
3118 		uint32_t	domain;
3119 		boolean_t	overflow;
3120 		uint_t		start;
3121 
3122 		switch (item->Type) {
3123 		case ACPI_SRAT_TYPE_CPU_AFFINITY: {	/* CPU entry */
3124 			ACPI_SRAT_CPU_AFFINITY *cpu =
3125 			    (ACPI_SRAT_CPU_AFFINITY *) item;
3126 
3127 			if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3128 				item = (ACPI_SUBTABLE_HEADER *)
3129 				    ((uintptr_t)item + item->Length);
3130 				continue;
3131 			}
3132 			domain = cpu->ProximityDomainLo;
3133 			for (i = 0; i < 3; i++) {
3134 				domain += cpu->ProximityDomainHi[i] <<
3135 				    ((i + 1) * 8);
3136 			}
3137 			break;
3138 		}
3139 		case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {	/* memory entry */
3140 			ACPI_SRAT_MEM_AFFINITY *mem =
3141 			    (ACPI_SRAT_MEM_AFFINITY *)item;
3142 
3143 			if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
3144 				item = (ACPI_SUBTABLE_HEADER *)
3145 				    ((uintptr_t)item + item->Length);
3146 				continue;
3147 			}
3148 			domain = mem->ProximityDomain;
3149 			break;
3150 		}
3151 		case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {	/* x2apic CPU */
3152 			ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
3153 			    (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
3154 
3155 			if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3156 				item = (ACPI_SUBTABLE_HEADER *)
3157 				    ((uintptr_t)item + item->Length);
3158 				continue;
3159 			}
3160 			domain = x2cpu->ProximityDomain;
3161 			break;
3162 		}
3163 		default:
3164 			item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
3165 			    item->Length);
3166 			continue;
3167 		}
3168 
3169 		/*
3170 		 * Count and keep track of which proximity domain IDs seen
3171 		 */
3172 		start = i = domain % MAX_NODES;
3173 		overflow = B_TRUE;
3174 		do {
3175 			/*
3176 			 * Create entry for proximity domain and increment
3177 			 * count when no entry exists where proximity domain
3178 			 * hashed
3179 			 */
3180 			if (!node_domain[i].exists) {
3181 				node_domain[i].exists = 1;
3182 				node_domain[i].prox_domain = domain;
3183 				domain_cnt++;
3184 				overflow = B_FALSE;
3185 				break;
3186 			}
3187 
3188 			/*
3189 			 * Nothing to do when proximity domain seen already
3190 			 * and its entry exists
3191 			 */
3192 			if (node_domain[i].prox_domain == domain) {
3193 				overflow = B_FALSE;
3194 				break;
3195 			}
3196 
3197 			/*
3198 			 * Entry exists where proximity domain hashed, but for
3199 			 * different proximity domain so keep search for empty
3200 			 * slot to put it or matching entry whichever comes
3201 			 * first.
3202 			 */
3203 			i = (i + 1) % MAX_NODES;
3204 		} while (i != start);
3205 
3206 		/*
3207 		 * Didn't find empty or matching entry which means have more
3208 		 * proximity domains than supported nodes (:-(
3209 		 */
3210 		ASSERT(overflow != B_TRUE);
3211 		if (overflow == B_TRUE)
3212 			return (-1);
3213 
3214 		item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
3215 	}
3216 	return (domain_cnt);
3217 }
3218 
3219 
3220 /*
3221  * Parse domain information in ACPI Maximum System Capability Table (MSCT).
3222  * MSCT table has been verified in function process_msct() in fakebop.c.
3223  */
3224 static int
3225 lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp, uint32_t *prox_domain_min)
3226 {
3227 	int last_seen = 0;
3228 	uint32_t proxmin = UINT32_MAX;
3229 	ACPI_MSCT_PROXIMITY *item, *end;
3230 
3231 	if (tp == NULL || lgrp_plat_msct_enable == 0)
3232 		return (-1);
3233 
3234 	if (tp->MaxProximityDomains >= MAX_NODES) {
3235 		cmn_err(CE_CONT,
3236 		    "?lgrp: too many proximity domains (%d), max %d supported, "
3237 		    "disable support of CPU/memory DR operations.",
3238 		    tp->MaxProximityDomains + 1, MAX_NODES);
3239 		plat_dr_disable_cpu();
3240 		plat_dr_disable_memory();
3241 		return (-1);
3242 	}
3243 
3244 	if (prox_domain_min != NULL) {
3245 		end = (void *)(tp->Header.Length + (uintptr_t)tp);
3246 		for (item = (void *)((uintptr_t)tp +
3247 		    tp->ProximityOffset); item < end;
3248 		    item = (void *)(item->Length + (uintptr_t)item)) {
3249 			if (item->RangeStart < proxmin) {
3250 				proxmin = item->RangeStart;
3251 			}
3252 
3253 			last_seen = item->RangeEnd - item->RangeStart + 1;
3254 			/*
3255 			 * Break out if all proximity domains have been
3256 			 * processed. Some BIOSes may have unused items
3257 			 * at the end of MSCT table.
3258 			 */
3259 			if (last_seen > tp->MaxProximityDomains) {
3260 				break;
3261 			}
3262 		}
3263 		*prox_domain_min = proxmin;
3264 	}
3265 
3266 	return (tp->MaxProximityDomains + 1);
3267 }
3268 
3269 
3270 /*
3271  * Set lgroup latencies for 2 level lgroup topology
3272  */
3273 static void
3274 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
3275 {
3276 	int	i, j;
3277 
3278 	ASSERT(lat_stats != NULL);
3279 
3280 	if (lgrp_plat_node_cnt >= 4)
3281 		cmn_err(CE_NOTE,
3282 		    "MPO only optimizing for local and remote\n");
3283 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
3284 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
3285 			if (i == j)
3286 				lat_stats->latencies[i][j] = 2;
3287 			else
3288 				lat_stats->latencies[i][j] = 3;
3289 		}
3290 	}
3291 	lat_stats->latency_min = 2;
3292 	lat_stats->latency_max = 3;
3293 	/* TODO: check it. */
3294 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
3295 	lgrp_plat_topo_flatten = 1;
3296 }
3297 
3298 
3299 /*
 * The following Opteron-specific constants, macros, types, and routines define
 * PCI configuration space registers and how to read them to determine the NUMA
 * configuration of *supported* Opteron processors.  They provide the same
 * information that can be obtained from the ACPI System Resource Affinity
 * Table (SRAT) if it exists on the machine of interest.
3305  *
3306  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
3307  * of interest describes all of these registers and their contents.  The main
3308  * registers used by this code to determine the NUMA configuration of the
3309  * machine are the node ID register for the number of NUMA nodes and the DRAM
3310  * address map registers for the physical address range of each node.
3311  *
3312  * NOTE: The format and how to determine the NUMA configuration using PCI
3313  *	 config space registers may change or may not be supported in future
3314  *	 Opteron processor families.
3315  */
3316 
/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 *
 * OPT_DRAMADDR_HI(reg) and OPT_DRAMADDR_LO(reg) each widen a raw register
 * value, keep only its address bits, and shift them into place;
 * OPT_DRAMADDR(high, low) combines the two into one address.  The argument is
 * parenthesized so that an expression such as (a | b) is masked as a whole.
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
3343 /*
3344  * Bit masks defining what's in Opteron DRAM Address Map base register
3345  */
3346 #define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
3347 #define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
3348 #define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */
3349 
3350 /*
3351  * Bit masks defining what's in Opteron DRAM Address Map limit register
3352  */
3353 #define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
3354 #define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
3355 
3356 
/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 *
 * OPT_NODE_CNT(reg) extracts the node count field.  opt_get_numa_config()
 * adds one to it to get the number of nodes.  The argument is parenthesized
 * so that an expression such as (a | b) is masked as a whole.
 */
#define	OPT_NODE_CNT(reg) \
	(((reg) & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
3382 
/*
 * Macro to setup PCI Extended Configuration Space (ECS) address to give to
 * "in/out" instructions
 *
 * The address is built from PCI_CONE, the low 8 bits of the bus, the low 5
 * bits of the device, the low 3 bits of the function, bits 7-2 of the
 * register offset, and bits 11-8 of the register offset placed at bits 27-24.
 * Every argument is parenthesized so that an expression argument is masked as
 * a whole.
 *
 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
 *	 other uses should just do MMIO to access PCI ECS.
 *	 Must enable special bit in Northbridge Configuration Register on
 *	 Greyhound for extended CF8 space access to be able to access PCI ECS
 *	 using "in/out" instructions and restore special bit after done
 *	 accessing PCI ECS.
 */
#define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
	(PCI_CONE | (((bus) & 0xff) << 16) | (((device) & 0x1f) << 11) | \
	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
	    ((((reg) >> 8) & 0xf) << 24))
3398 
/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 *
 * The two DRAM base offsets are starting points: opt_get_numa_config() steps
 * each one forward by 4 to reach the matching limit register and by another 4
 * to reach the next base register.  The high offset is only read on
 * Greyhound, through PCI Extended Configuration Space.
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* high DRAM Base register (node 0) */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* low DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 *
 * Node N is addressed as device OPT_PCS_DEV_NODE0 + N.
 */
#define	OPT_PCS_DEV_NODE0		24	/* device number for node 0 */
3428 
3429 
/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 *
 * Each field holds a raw register value as read by opt_get_numa_config().
 * The two "hi" fields are only filled in on Greyhound.
 */
typedef	struct opt_dram_addr_map {
	uint32_t	base_hi;	/* high DRAM base register */
	uint32_t	base_lo;	/* low DRAM base register */
	uint32_t	limit_hi;	/* high DRAM limit register */
	uint32_t	limit_lo;	/* low DRAM limit register */
} opt_dram_addr_map_t;
3439 
3440 
/*
 * Supported AMD processor families, as compared against the value returned
 * by cpuid_getfamily() in is_opteron()
 */
#define	AMD_FAMILY_HAMMER	15
#define	AMD_FAMILY_GREYHOUND	16

/*
 * Whether to have is_opteron() return 1 even when processor isn't supported.
 * The vendor check in is_opteron() still applies: the override only has an
 * effect when x86_vendor is X86_VENDOR_AMD.
 */
uint_t	is_opteron_override = 0;

/*
 * AMD processor family for current CPU.  Set by is_opteron() (only when the
 * vendor is AMD) and consulted by opt_get_numa_config() to decide whether the
 * Greyhound-only steps are needed.
 */
uint_t	opt_family = 0;
3456 
3457 
3458 /*
3459  * Determine whether we're running on a supported AMD Opteron since reading
3460  * node count and DRAM address map registers may have different format or
3461  * may not be supported across processor families
3462  */
3463 static int
3464 is_opteron(void)
3465 {
3466 
3467 	if (x86_vendor != X86_VENDOR_AMD)
3468 		return (0);
3469 
3470 	opt_family = cpuid_getfamily(CPU);
3471 	if (opt_family == AMD_FAMILY_HAMMER ||
3472 	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
3473 		return (1);
3474 	else
3475 		return (0);
3476 }
3477 
3478 
/*
 * Determine NUMA configuration for Opteron from registers that live in PCI
 * configuration space
 *
 * node_cnt	out: node count field of node 0's Node ID register plus one,
 *		or 1 when that value is larger than MAX_NODES (in which case
 *		nothing else is read or filled in)
 * mem_intrlv	in/out, may be NULL: incremented once for every node whose low
 *		DRAM base register has any interleave enable bit set
 * memnode_info	out: one entry per node giving whether the node has memory
 *		and, if so, its first and last page frame numbers
 *
 * Relies on opt_family having been set beforehand (see is_opteron()).
 */
static void
opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    memnode_phys_addr_map_t *memnode_info)
{
	uint_t				bus;
	uint_t				dev;
	struct opt_dram_addr_map	dram_map[MAX_NODES];
	uint_t				node;
	uint_t				node_info[MAX_NODES];
	uint_t				off_hi;
	uint_t				off_lo;
	uint64_t nb_cfg_reg;

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
	off_lo = OPT_PCS_OFF_DRAMBASE_LO;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;

	/*
	 * If number of nodes is more than maximum supported, then set node
	 * count to 1 and treat system as UMA instead of NUMA.
	 */
	if (*node_cnt > MAX_NODES) {
		*node_cnt = 1;
		return;
	}

	/*
	 * For Greyhound, PCI Extended Configuration Space must be enabled to
	 * read high DRAM address map base and limit registers
	 *
	 * The original register value is kept in nb_cfg_reg so that the
	 * enable bit can be put back the way it was once the loop is done.
	 */
	nb_cfg_reg = 0;
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG,
			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
	}

	/*
	 * dev, off_hi, and off_lo are running cursors: each pass through the
	 * loop leaves them pointing at the next node's device and DRAM base
	 * registers.
	 */
	for (node = 0; node < *node_cnt; node++) {
		uint32_t	base_hi;
		uint32_t	base_lo;
		uint32_t	limit_hi;
		uint32_t	limit_lo;

		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 *
		 * The high halves are only read on Greyhound, through the
		 * address/data port pair; otherwise they are taken as 0.
		 */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			base_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			base_hi = dram_map[node].base_hi =
			    inl(PCI_CONFDATA);
		}
		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Count this node as interleaved when any interleave enable
		 * bit is set, provided the caller asked for the count
		 */
		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
		    mem_intrlv)
			*mem_intrlv = *mem_intrlv + 1;

		off_hi += 4;	/* high limit register offset */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			limit_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			limit_hi = dram_map[node].limit_hi =
			    inl(PCI_CONFDATA);
		}

		off_lo += 4;	/* low limit register offset */
		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Increment device number to next node and register offsets
		 * for DRAM base register of next node
		 */
		off_hi += 4;
		off_lo += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in memnode_info[]
			 */
			memnode_info[node].exists = 0;
			memnode_info[node].start = memnode_info[node].end =
			    (pfn_t)-1;
			continue;
		}

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		memnode_info[node].exists = 1;

		memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));

		/*
		 * The limit register supplies only the upper address bits, so
		 * fill in the low offset bits to get the last byte of the
		 * range before converting to a page number
		 */
		memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
		    OPT_DRAMADDR_LO_MASK_OFF);
	}

	/*
	 * Restore PCI Extended Configuration Space enable bit
	 *
	 * Only needed when the bit was clear on entry, i.e. when it was set
	 * above.
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
	}
}
3629 
3630 
3631 /*
3632  * Return average amount of time to read vendor ID register on Northbridge
3633  * N times on specified destination node from current CPU
3634  */
3635 static hrtime_t
3636 opt_probe_vendor(int dest_node, int nreads)
3637 {
3638 	int		cnt;
3639 	uint_t		dev;
3640 	/* LINTED: set but not used in function */
3641 	volatile uint_t	dev_vendor __unused;
3642 	hrtime_t	elapsed;
3643 	hrtime_t	end;
3644 	int		ipl;
3645 	hrtime_t	start;
3646 
3647 	dev = OPT_PCS_DEV_NODE0 + dest_node;
3648 	kpreempt_disable();
3649 	ipl = spl8();
3650 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
3651 	    OPT_PCS_OFF_VENDOR));
3652 	start = gethrtime();
3653 	for (cnt = 0; cnt < nreads; cnt++)
3654 		dev_vendor = inl(PCI_CONFDATA);
3655 	end = gethrtime();
3656 	elapsed = (end - start) / nreads;
3657 	splx(ipl);
3658 	kpreempt_enable();
3659 	return (elapsed);
3660 }
3661