i86pc/os/lgrpplat.c

7c478bd9Sstevel@tonic-gate/*
7c478bd9Sstevel@tonic-gate * CDDL HEADER START
7c478bd9Sstevel@tonic-gate *
7c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
c39996a7Sstevel * Common Development and Distribution License (the "License").
c39996a7Sstevel * You may not use this file except in compliance with the License.
7c478bd9Sstevel@tonic-gate *
7c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
7c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
7c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
7c478bd9Sstevel@tonic-gate * and limitations under the License.
7c478bd9Sstevel@tonic-gate *
7c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
7c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
7c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
7c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
7c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
7c478bd9Sstevel@tonic-gate *
7c478bd9Sstevel@tonic-gate * CDDL HEADER END
7c478bd9Sstevel@tonic-gate */
c39996a7Sstevel
7c478bd9Sstevel@tonic-gate/*
472714d6Skchow * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
7c478bd9Sstevel@tonic-gate * Use is subject to license terms.
7c478bd9Sstevel@tonic-gate */
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate#pragma ident	"%Z%%M%	%I%	%E% SMI"
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate
2e2c009bSjjc/*
2e2c009bSjjc * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
2e2c009bSjjc * ================================================================
2e2c009bSjjc * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
2e2c009bSjjc * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
2e2c009bSjjc * one or more CPUs and some local memory.  The CPUs in each node can access
2e2c009bSjjc * the memory in the other nodes but at a higher latency than accessing their
2e2c009bSjjc * local memory.  Typically, a system with only one node has Uniform Memory
2e2c009bSjjc * Access (UMA), but it may be possible to have a one node system that has
2e2c009bSjjc * some global memory outside of the node which is higher latency.
2e2c009bSjjc *
2e2c009bSjjc * Module Description
2e2c009bSjjc * ------------------
2e2c009bSjjc * This module provides a platform interface for determining which CPUs and
2e2c009bSjjc * which memory (and how much) are in a NUMA node and how far each node is from
2e2c009bSjjc * each other.  The interface is used by the Virtual Memory (VM) system and the
2e2c009bSjjc * common lgroup framework.  The VM system uses the plat_*() routines to fill
2e2c009bSjjc * in its memory node (memnode) array with the physical address range spanned
2e2c009bSjjc * by each NUMA node to know which memory belongs to which node, so it can
2e2c009bSjjc * build and manage a physical page free list for each NUMA node and allocate
2e2c009bSjjc * local memory from each node as needed.  The common lgroup framework uses the
2e2c009bSjjc * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
2e2c009bSjjc * to each node (leaf lgroup) and how far each node is from each other, so it
2e2c009bSjjc * can build the latency (lgroup) topology for the machine in order to optimize
2e2c009bSjjc * for locality.  Also, an lgroup platform handle instead of lgroups are used
2e2c009bSjjc * in the interface with this module, so this module shouldn't need to know
2e2c009bSjjc * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
2e2c009bSjjc * etc. are in each NUMA node, how far each node is from each other, and to use
2e2c009bSjjc * a unique lgroup platform handle to refer to each node through the interface.
2e2c009bSjjc *
2e2c009bSjjc * Determining NUMA Configuration
2e2c009bSjjc * ------------------------------
2e2c009bSjjc * By default, this module will try to determine the NUMA configuration of the
2e2c009bSjjc * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
2e2c009bSjjc * Locality Information Table (SLIT).  The SRAT contains info to tell which
2e2c009bSjjc * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
2e2c009bSjjc * is a matrix that gives the distance between each system locality (which is
2e2c009bSjjc * a NUMA node and should correspond to proximity domains in the SRAT).  For
2e2c009bSjjc * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
2e2c009bSjjc * specification.
2e2c009bSjjc *
 * If the SRAT doesn't exist on a system with AMD Opteron processors, we
 * examine registers in PCI configuration space to determine how many nodes are
 * in the system and which CPUs and memory are in each node.  This could be
 * determined by probing all memory from each CPU, but that is too expensive to
 * do while booting the kernel.
2e2c009bSjjc *
2e2c009bSjjc * NOTE: Using these PCI configuration space registers to determine this
2e2c009bSjjc *       locality info is not guaranteed to work or be compatible across all
2e2c009bSjjc *	 Opteron processor families.
2e2c009bSjjc *
2e2c009bSjjc * If the SLIT does not exist or look right, the kernel will probe to determine
2e2c009bSjjc * the distance between nodes as long as the NUMA CPU and memory configuration
2e2c009bSjjc * has been determined (see lgrp_plat_probe() for details).
2e2c009bSjjc *
2e2c009bSjjc * Data Structures
2e2c009bSjjc * ---------------
2e2c009bSjjc * The main data structures used by this code are the following:
2e2c009bSjjc *
2e2c009bSjjc * - lgrp_plat_cpu_node[]		APIC ID to node ID mapping table
2e2c009bSjjc *					indexed by hashed APIC ID (only used
2e2c009bSjjc *					for SRAT)
2e2c009bSjjc *
2e2c009bSjjc * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
2e2c009bSjjc *					different nodes indexed by node ID
2e2c009bSjjc *
2e2c009bSjjc * - lgrp_plat_node_cnt			Number of NUMA nodes in system
2e2c009bSjjc *
2e2c009bSjjc * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
2e2c009bSjjc *					table indexed by node ID (only used
2e2c009bSjjc *					for SRAT)
2e2c009bSjjc *
2e2c009bSjjc * - lgrp_plat_node_memory[]		Table with physical address range for
2e2c009bSjjc *					each node indexed by node ID
2e2c009bSjjc *
2e2c009bSjjc * The code is implemented to make the following always be true:
2e2c009bSjjc *
2e2c009bSjjc *	lgroup platform handle == node ID == memnode ID
2e2c009bSjjc *
2e2c009bSjjc * Moreover, it allows for the proximity domain ID to be equal to all of the
2e2c009bSjjc * above as long as the proximity domains IDs are numbered from 0 to <number of
2e2c009bSjjc * nodes - 1>.  This is done by hashing each proximity domain ID into the range
2e2c009bSjjc * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
2e2c009bSjjc * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
2e2c009bSjjc * and be assigned node ID N.  If the proximity domain IDs aren't numbered
2e2c009bSjjc * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
2e2c009bSjjc * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
2e2c009bSjjc * to node IDs.  However, the proximity domain IDs may not map to the
2e2c009bSjjc * equivalent node ID since we want to keep the node IDs numbered from 0 to
2e2c009bSjjc * <number of nodes - 1> to minimize cost of searching and potentially space.
2e2c009bSjjc */
2e2c009bSjjc
2e2c009bSjjc
7c478bd9Sstevel@tonic-gate#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
7c478bd9Sstevel@tonic-gate#include <sys/cmn_err.h>
f78a91cdSjjc#include <sys/controlregs.h>
7c478bd9Sstevel@tonic-gate#include <sys/cpupart.h>
7c478bd9Sstevel@tonic-gate#include <sys/cpuvar.h>
7c478bd9Sstevel@tonic-gate#include <sys/lgrp.h>
7c478bd9Sstevel@tonic-gate#include <sys/machsystm.h>
7c478bd9Sstevel@tonic-gate#include <sys/memlist.h>
7c478bd9Sstevel@tonic-gate#include <sys/memnode.h>
7c478bd9Sstevel@tonic-gate#include <sys/mman.h>
ef50d8c0Sesaxe#include <sys/pci_cfgspace.h>
ef50d8c0Sesaxe#include <sys/pci_impl.h>
7c478bd9Sstevel@tonic-gate#include <sys/param.h>
fb2f18f8Sesaxe#include <sys/pghw.h>
7c478bd9Sstevel@tonic-gate#include <sys/promif.h>		/* for prom_printf() */
2e2c009bSjjc#include <sys/sysmacros.h>
7c478bd9Sstevel@tonic-gate#include <sys/systm.h>
7c478bd9Sstevel@tonic-gate#include <sys/thread.h>
7c478bd9Sstevel@tonic-gate#include <sys/types.h>
7c478bd9Sstevel@tonic-gate#include <sys/var.h>
7c478bd9Sstevel@tonic-gate#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
7c478bd9Sstevel@tonic-gate#include <vm/hat_i86.h>
7c478bd9Sstevel@tonic-gate#include <vm/seg_kmem.h>
affbd3ccSkchow#include <vm/vm_dep.h>
7c478bd9Sstevel@tonic-gate
2e2c009bSjjc#include "acpi_fw.h"		/* for SRAT and SLIT */
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate
/*
 * MAX_NODES bounds the per-node tables in this file; NLGRP is the number of
 * lgroups that are statically reserved as a function of MAX_NODES.
 */
#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */

/*
 * Hash CPU APIC ID into CPU to node mapping table using max_ncpus
 * to minimize span of entries used
 */
#define	CPU_NODE_HASH(apicid)		((apicid) % max_ncpus)

/*
 * Hash proximity domain ID into node to domain mapping table using the
 * number of nodes (lgrp_plat_node_cnt) to minimize span of entries used
 */
#define	NODE_DOMAIN_HASH(domain)	((domain) % lgrp_plat_node_cnt)
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * CPU APIC ID to node ID mapping structure (only used with SRAT)
2e2c009bSjjc */
2e2c009bSjjctypedef	struct cpu_node_map {
2e2c009bSjjc	int		exists;
2e2c009bSjjc	uint_t		node;
2e2c009bSjjc	uint32_t	apicid;
2e2c009bSjjc	uint32_t	prox_domain;
2e2c009bSjjc} cpu_node_map_t;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Latency statistics
2e2c009bSjjc */
2e2c009bSjjctypedef struct lgrp_plat_latency_stats {
2e2c009bSjjc	hrtime_t	latencies[MAX_NODES][MAX_NODES];
2e2c009bSjjc	hrtime_t	latency_max;
2e2c009bSjjc	hrtime_t	latency_min;
2e2c009bSjjc} lgrp_plat_latency_stats_t;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Memory configuration for probing
2e2c009bSjjc */
2e2c009bSjjctypedef struct lgrp_plat_probe_mem_config {
2e2c009bSjjc	size_t	probe_memsize;		/* how much memory to probe per node */
2e2c009bSjjc	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
2e2c009bSjjc	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
2e2c009bSjjc} lgrp_plat_probe_mem_config_t;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Statistics kept for probing
2e2c009bSjjc */
2e2c009bSjjctypedef struct lgrp_plat_probe_stats {
2e2c009bSjjc	hrtime_t	flush_cost;
2e2c009bSjjc	hrtime_t	probe_cost;
2e2c009bSjjc	hrtime_t	probe_cost_total;
2e2c009bSjjc	hrtime_t	probe_error_code;
2e2c009bSjjc	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
2e2c009bSjjc	int		probe_suspect[MAX_NODES][MAX_NODES];
2e2c009bSjjc	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
2e2c009bSjjc	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
2e2c009bSjjc} lgrp_plat_probe_stats_t;
2e2c009bSjjc
/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 */
typedef	struct node_domain_map {
	int		exists;		/* nonzero if entry is populated */
					/* (NOTE: users not visible here) */
	uint32_t	prox_domain;	/* proximity domain ID for the node */
} node_domain_map_t;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Node ID and starting and ending page for physical memory in node
2e2c009bSjjc */
2e2c009bSjjctypedef	struct node_phys_addr_map {
2e2c009bSjjc	pfn_t		start;
2e2c009bSjjc	pfn_t		end;
2e2c009bSjjc	int		exists;
2e2c009bSjjc	uint32_t	prox_domain;
2e2c009bSjjc} node_phys_addr_map_t;
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * CPU APIC ID to node ID mapping table (only used for SRAT)
2e2c009bSjjc */
2e2c009bSjjcstatic cpu_node_map_t			lgrp_plat_cpu_node[NCPU];
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Latency statistics
2e2c009bSjjc */
2e2c009bSjjclgrp_plat_latency_stats_t		lgrp_plat_lat_stats;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Whether memory is interleaved across nodes causing MPO to be disabled
2e2c009bSjjc */
2e2c009bSjjcstatic int				lgrp_plat_mem_intrlv = 0;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Node ID to proximity domain ID mapping table (only used for SRAT)
2e2c009bSjjc */
2e2c009bSjjcstatic node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Physical address range for memory in each node
2e2c009bSjjc */
2e2c009bSjjcstatic node_phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Statistics gotten from probing
2e2c009bSjjc */
2e2c009bSjjcstatic lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Memory configuration for probing
2e2c009bSjjc */
2e2c009bSjjcstatic lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Error code from processing ACPI SRAT
2e2c009bSjjc */
2e2c009bSjjcstatic int				lgrp_plat_srat_error = 0;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Error code from processing ACPI SLIT
2e2c009bSjjc */
2e2c009bSjjcstatic int				lgrp_plat_slit_error = 0;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Allocate lgroup array statically
2e2c009bSjjc */
2e2c009bSjjcstatic lgrp_t				lgrp_space[NLGRP];
2e2c009bSjjcstatic int				nlgrps_alloc;
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Number of nodes in system
2e2c009bSjjc */
2e2c009bSjjcuint_t			lgrp_plat_node_cnt = 1;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Configuration Parameters for Probing
2e2c009bSjjc * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
2e2c009bSjjc *				operation, etc.
2e2c009bSjjc * - lgrp_plat_probe_nrounds	How many rounds of probing to do
2e2c009bSjjc * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
2e2c009bSjjc *				node
2e2c009bSjjc * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
2e2c009bSjjc *				Northbridge for each probe
2e2c009bSjjc */
2e2c009bSjjcuint_t			lgrp_plat_probe_flags = 0;
2e2c009bSjjcint			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
2e2c009bSjjcint			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
2e2c009bSjjcint			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Enable use of ACPI System Resource Affinity Table (SRAT) and System
2e2c009bSjjc * Locality Information Table (SLIT)
2e2c009bSjjc */
2e2c009bSjjcint			lgrp_plat_srat_enable = 1;
2e2c009bSjjcint			lgrp_plat_slit_enable = 1;
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Static array to hold lgroup statistics
2e2c009bSjjc */
2e2c009bSjjcstruct lgrp_stats	lgrp_stats[NLGRP];
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Forward declarations of platform interface routines
2e2c009bSjjc */
2e2c009bSjjcvoid		plat_build_mem_nodes(struct memlist *list);
2e2c009bSjjc
2e2c009bSjjcint		plat_lgrphand_to_mem_node(lgrp_handle_t hand);
2e2c009bSjjc
2e2c009bSjjclgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);
2e2c009bSjjc
2e2c009bSjjcint		plat_mnode_xcheck(pfn_t pfncnt);
2e2c009bSjjc
2e2c009bSjjcint		plat_pfn_to_mem_node(pfn_t pfn);
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Forward declarations of lgroup platform interface routines
2e2c009bSjjc */
2e2c009bSjjclgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);
2e2c009bSjjc
2e2c009bSjjcvoid		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
2e2c009bSjjc
2e2c009bSjjclgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);
2e2c009bSjjc
2e2c009bSjjcvoid		lgrp_plat_init(void);
2e2c009bSjjc
2e2c009bSjjcint		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
2e2c009bSjjc
2e2c009bSjjcvoid		lgrp_plat_main_init(void);
2e2c009bSjjc
2e2c009bSjjcint		lgrp_plat_max_lgrps(void);
2e2c009bSjjc
2e2c009bSjjcpgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
2e2c009bSjjc    lgrp_mem_query_t query);
2e2c009bSjjc
2e2c009bSjjclgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);
2e2c009bSjjc
2e2c009bSjjcvoid		lgrp_plat_probe(void);
2e2c009bSjjc
2e2c009bSjjclgrp_handle_t	lgrp_plat_root_hand(void);
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Forward declarations of local routines
2e2c009bSjjc */
2e2c009bSjjcstatic int	is_opteron(void);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
2e2c009bSjjc    uint32_t domain);
2e2c009bSjjc
2e2c009bSjjcstatic void	lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats,
2e2c009bSjjc    lgrp_plat_probe_stats_t *probe_stats);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats);
2e2c009bSjjc
2e2c009bSjjcstatic pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
2e2c009bSjjc    uint32_t domain);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
*e9dd3ea3Sjjc    node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
2e2c009bSjjc    uint32_t domain);
2e2c009bSjjc
2e2c009bSjjcstatic hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
2e2c009bSjjc    lgrp_plat_probe_mem_config_t *probe_mem_config,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats,
2e2c009bSjjc    lgrp_plat_probe_stats_t *probe_stats);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
2e2c009bSjjc    node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt,
2e2c009bSjjc    node_domain_map_t *node_domain, cpu_node_map_t *cpu_node,
2e2c009bSjjc    node_phys_addr_map_t *node_memory);
2e2c009bSjjc
2e2c009bSjjcstatic int	lgrp_plat_srat_domains(struct srat *tp);
2e2c009bSjjc
2e2c009bSjjcstatic void	lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats);
2e2c009bSjjc
2e2c009bSjjcstatic void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
2e2c009bSjjc    node_phys_addr_map_t *node_memory);
2e2c009bSjjc
2e2c009bSjjcstatic hrtime_t	opt_probe_vendor(int dest_node, int nreads);
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * PLATFORM INTERFACE ROUTINES
7c478bd9Sstevel@tonic-gate */
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate/*
2e2c009bSjjc * Configure memory nodes for machines with more than one node (ie NUMA)
2e2c009bSjjc */
2e2c009bSjjcvoid
2e2c009bSjjcplat_build_mem_nodes(struct memlist *list)
2e2c009bSjjc{
2e2c009bSjjc	pfn_t		cur_start;	/* start addr of subrange */
2e2c009bSjjc	pfn_t		cur_end;	/* end addr of subrange */
2e2c009bSjjc	pfn_t		start;		/* start addr of whole range */
2e2c009bSjjc	pfn_t		end;		/* end addr of whole range */
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Boot install lists are arranged <addr, len>, ...
2e2c009bSjjc	 */
2e2c009bSjjc	while (list) {
2e2c009bSjjc		int	node;
2e2c009bSjjc
2e2c009bSjjc		start = list->address >> PAGESHIFT;
2e2c009bSjjc		end = (list->address + list->size - 1) >> PAGESHIFT;
2e2c009bSjjc
2e2c009bSjjc		if (start > physmax) {
2e2c009bSjjc			list = list->next;
2e2c009bSjjc			continue;
2e2c009bSjjc		}
2e2c009bSjjc		if (end > physmax)
2e2c009bSjjc			end = physmax;
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * When there is only one memnode, just add memory to memnode
2e2c009bSjjc		 */
2e2c009bSjjc		if (max_mem_nodes == 1) {
2e2c009bSjjc			mem_node_add_slice(start, end);
2e2c009bSjjc			list = list->next;
2e2c009bSjjc			continue;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * mem_node_add_slice() expects to get a memory range that
2e2c009bSjjc		 * is within one memnode, so need to split any memory range
2e2c009bSjjc		 * that spans multiple memnodes into subranges that are each
2e2c009bSjjc		 * contained within one memnode when feeding them to
2e2c009bSjjc		 * mem_node_add_slice()
2e2c009bSjjc		 */
2e2c009bSjjc		cur_start = start;
2e2c009bSjjc		do {
2e2c009bSjjc			node = plat_pfn_to_mem_node(cur_start);
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Panic if DRAM address map registers or SRAT say
2e2c009bSjjc			 * memory in node doesn't exist or address from
2e2c009bSjjc			 * boot installed memory list entry isn't in this node.
2e2c009bSjjc			 * This shouldn't happen and rest of code can't deal
2e2c009bSjjc			 * with this if it does.
2e2c009bSjjc			 */
2e2c009bSjjc			if (node < 0 || node >= lgrp_plat_node_cnt ||
2e2c009bSjjc			    !lgrp_plat_node_memory[node].exists ||
2e2c009bSjjc			    cur_start < lgrp_plat_node_memory[node].start ||
2e2c009bSjjc			    cur_start > lgrp_plat_node_memory[node].end) {
2e2c009bSjjc				cmn_err(CE_PANIC, "Don't know which memnode "
2e2c009bSjjc				    "to add installed memory address 0x%lx\n",
2e2c009bSjjc				    cur_start);
2e2c009bSjjc			}
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * End of current subrange should not span memnodes
2e2c009bSjjc			 */
2e2c009bSjjc			cur_end = end;
2e2c009bSjjc			if (lgrp_plat_node_memory[node].exists &&
2e2c009bSjjc			    cur_end > lgrp_plat_node_memory[node].end)
2e2c009bSjjc				cur_end = lgrp_plat_node_memory[node].end;
2e2c009bSjjc
2e2c009bSjjc			mem_node_add_slice(cur_start, cur_end);
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Next subrange starts after end of current one
2e2c009bSjjc			 */
2e2c009bSjjc			cur_start = cur_end + 1;
2e2c009bSjjc		} while (cur_end < end);
2e2c009bSjjc
2e2c009bSjjc		list = list->next;
2e2c009bSjjc	}
2e2c009bSjjc	mem_node_physalign = 0;
2e2c009bSjjc	mem_node_pfn_shift = 0;
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjcint
2e2c009bSjjcplat_lgrphand_to_mem_node(lgrp_handle_t hand)
2e2c009bSjjc{
2e2c009bSjjc	if (max_mem_nodes == 1)
2e2c009bSjjc		return (0);
2e2c009bSjjc
2e2c009bSjjc	return ((int)hand);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
2e2c009bSjjc * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
2e2c009bSjjc * a crossing is found and returns 0 otherwise.
2e2c009bSjjc */
2e2c009bSjjcint
2e2c009bSjjcplat_mnode_xcheck(pfn_t pfncnt)
2e2c009bSjjc{
2e2c009bSjjc	int	node, prevnode = -1, basenode;
2e2c009bSjjc	pfn_t	ea, sa;
2e2c009bSjjc
2e2c009bSjjc	for (node = 0; node < lgrp_plat_node_cnt; node++) {
2e2c009bSjjc
2e2c009bSjjc		if (lgrp_plat_node_memory[node].exists == 0)
2e2c009bSjjc			continue;
2e2c009bSjjc
2e2c009bSjjc		if (prevnode == -1) {
2e2c009bSjjc			prevnode = node;
2e2c009bSjjc			basenode = node;
2e2c009bSjjc			continue;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/* assume x86 node pfn ranges are in increasing order */
2e2c009bSjjc		ASSERT(lgrp_plat_node_memory[node].start >
2e2c009bSjjc		    lgrp_plat_node_memory[prevnode].end);
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * continue if the starting address of node is not contiguous
2e2c009bSjjc		 * with the previous node.
2e2c009bSjjc		 */
2e2c009bSjjc
2e2c009bSjjc		if (lgrp_plat_node_memory[node].start !=
2e2c009bSjjc		    (lgrp_plat_node_memory[prevnode].end + 1)) {
2e2c009bSjjc			basenode = node;
2e2c009bSjjc			prevnode = node;
2e2c009bSjjc			continue;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/* check if the starting address of node is pfncnt aligned */
2e2c009bSjjc		if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) {
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * at this point, node starts at an unaligned boundary
2e2c009bSjjc			 * and is contiguous with the previous node(s) to
2e2c009bSjjc			 * basenode. Check if there is an aligned contiguous
2e2c009bSjjc			 * range of length pfncnt that crosses this boundary.
2e2c009bSjjc			 */
2e2c009bSjjc
2e2c009bSjjc			sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end,
2e2c009bSjjc			    pfncnt);
2e2c009bSjjc			ea = P2ROUNDUP((lgrp_plat_node_memory[node].start),
2e2c009bSjjc			    pfncnt);
2e2c009bSjjc
2e2c009bSjjc			ASSERT((ea - sa) == pfncnt);
2e2c009bSjjc			if (sa >= lgrp_plat_node_memory[basenode].start &&
2e2c009bSjjc			    ea <= (lgrp_plat_node_memory[node].end + 1))
2e2c009bSjjc				return (1);
2e2c009bSjjc		}
2e2c009bSjjc		prevnode = node;
2e2c009bSjjc	}
2e2c009bSjjc	return (0);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjclgrp_handle_t
2e2c009bSjjcplat_mem_node_to_lgrphand(int mnode)
2e2c009bSjjc{
2e2c009bSjjc	if (max_mem_nodes == 1)
2e2c009bSjjc		return (LGRP_DEFAULT_HANDLE);
2e2c009bSjjc
2e2c009bSjjc	return ((lgrp_handle_t)mnode);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjcint
2e2c009bSjjcplat_pfn_to_mem_node(pfn_t pfn)
2e2c009bSjjc{
2e2c009bSjjc	int	node;
2e2c009bSjjc
2e2c009bSjjc	if (max_mem_nodes == 1)
2e2c009bSjjc		return (0);
2e2c009bSjjc
2e2c009bSjjc	for (node = 0; node < lgrp_plat_node_cnt; node++) {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Skip nodes with no memory
2e2c009bSjjc		 */
2e2c009bSjjc		if (!lgrp_plat_node_memory[node].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc
2e2c009bSjjc		if (pfn >= lgrp_plat_node_memory[node].start &&
2e2c009bSjjc		    pfn <= lgrp_plat_node_memory[node].end)
2e2c009bSjjc			return (node);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Didn't find memnode where this PFN lives which should never happen
2e2c009bSjjc	 */
2e2c009bSjjc	ASSERT(node < lgrp_plat_node_cnt);
2e2c009bSjjc	return (-1);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * LGROUP PLATFORM INTERFACE ROUTINES
2e2c009bSjjc */
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Allocate additional space for an lgroup.
2e2c009bSjjc */
2e2c009bSjjc/* ARGSUSED */
2e2c009bSjjclgrp_t *
2e2c009bSjjclgrp_plat_alloc(lgrp_id_t lgrpid)
2e2c009bSjjc{
2e2c009bSjjc	lgrp_t *lgrp;
2e2c009bSjjc
2e2c009bSjjc	lgrp = &lgrp_space[nlgrps_alloc++];
2e2c009bSjjc	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
2e2c009bSjjc		return (NULL);
2e2c009bSjjc	return (lgrp);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Platform handling for (re)configuration changes
2e2c009bSjjc */
2e2c009bSjjc/* ARGSUSED */
2e2c009bSjjcvoid
2e2c009bSjjclgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
2e2c009bSjjc{
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return the platform handle for the lgroup containing the given CPU
2e2c009bSjjc */
2e2c009bSjjc/* ARGSUSED */
2e2c009bSjjclgrp_handle_t
2e2c009bSjjclgrp_plat_cpu_to_hand(processorid_t id)
2e2c009bSjjc{
2e2c009bSjjc	lgrp_handle_t	hand;
2e2c009bSjjc
2e2c009bSjjc	if (lgrp_plat_node_cnt == 1)
2e2c009bSjjc		return (LGRP_DEFAULT_HANDLE);
2e2c009bSjjc
2e2c009bSjjc	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
2e2c009bSjjc	    lgrp_plat_cpu_node);
2e2c009bSjjc
2e2c009bSjjc	ASSERT(hand != (lgrp_handle_t)-1);
2e2c009bSjjc	if (hand == (lgrp_handle_t)-1)
2e2c009bSjjc		return (LGRP_NULL_HANDLE);
2e2c009bSjjc
2e2c009bSjjc	return (hand);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Platform-specific initialization of lgroups
2e2c009bSjjc */
2e2c009bSjjcvoid
2e2c009bSjjclgrp_plat_init(void)
2e2c009bSjjc{
2e2c009bSjjc#if defined(__xpv)
2e2c009bSjjc	/*
2e2c009bSjjc	 * XXPV	For now, the hypervisor treats all memory equally.
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_plat_node_cnt = max_mem_nodes = 1;
2e2c009bSjjc#else	/* __xpv */
2e2c009bSjjc	uint_t	probe_op;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Initialize as a UMA machine
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_topo_ht_limit() == 1) {
2e2c009bSjjc		lgrp_plat_node_cnt = max_mem_nodes = 1;
2e2c009bSjjc		return;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Determine which CPUs and memory are local to each other and number
2e2c009bSjjc	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_plat_srat_error = lgrp_plat_process_srat(srat_ptr,
2e2c009bSjjc	    &lgrp_plat_node_cnt, lgrp_plat_node_domain, lgrp_plat_cpu_node,
2e2c009bSjjc	    lgrp_plat_node_memory);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Try to use PCI config space registers on Opteron if SRAT doesn't
2e2c009bSjjc	 * exist or there is some error processing the SRAT
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_plat_srat_error != 0 && is_opteron())
2e2c009bSjjc		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
2e2c009bSjjc		    lgrp_plat_node_memory);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Don't bother to setup system for multiple lgroups and only use one
2e2c009bSjjc	 * memory node when memory is interleaved between any nodes or there is
2e2c009bSjjc	 * only one NUMA node
2e2c009bSjjc	 *
2e2c009bSjjc	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
2e2c009bSjjc	 *	 when and if it happens for x86/x64
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
2e2c009bSjjc		lgrp_plat_node_cnt = max_mem_nodes = 1;
2e2c009bSjjc		(void) lgrp_topo_ht_limit_set(1);
2e2c009bSjjc		return;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Leaf lgroups on x86/x64 architectures contain one physical
2e2c009bSjjc	 * processor chip. Tune lgrp_expand_proc_thresh and
2e2c009bSjjc	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
2e2c009bSjjc	 * things out aggressively.
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
2e2c009bSjjc	lgrp_expand_proc_diff = 0;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * There should be one memnode (physical page free list(s)) for
2e2c009bSjjc	 * each node
2e2c009bSjjc	 */
2e2c009bSjjc	max_mem_nodes = lgrp_plat_node_cnt;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Determine how far each NUMA node is from each other by
2e2c009bSjjc	 * reading ACPI System Locality Information Table (SLIT) if it
2e2c009bSjjc	 * exists
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
2e2c009bSjjc	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
2e2c009bSjjc	    &lgrp_plat_lat_stats);
2e2c009bSjjc	if (lgrp_plat_slit_error == 0)
2e2c009bSjjc		return;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Probe to determine latency between NUMA nodes when SLIT
2e2c009bSjjc	 * doesn't exist or make sense
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Specify whether to probe using vendor ID register or page copy
2e2c009bSjjc	 * if hasn't been specified already or is overspecified
2e2c009bSjjc	 */
2e2c009bSjjc	probe_op = lgrp_plat_probe_flags &
2e2c009bSjjc	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
2e2c009bSjjc
2e2c009bSjjc	if (probe_op == 0 ||
2e2c009bSjjc	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
2e2c009bSjjc		lgrp_plat_probe_flags &=
2e2c009bSjjc		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
2e2c009bSjjc		if (is_opteron())
2e2c009bSjjc			lgrp_plat_probe_flags |=
2e2c009bSjjc			    LGRP_PLAT_PROBE_VENDOR;
2e2c009bSjjc		else
2e2c009bSjjc			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Probing errors can mess up the lgroup topology and
2e2c009bSjjc	 * force us fall back to a 2 level lgroup topology.
2e2c009bSjjc	 * Here we bound how tall the lgroup topology can grow
2e2c009bSjjc	 * in hopes of avoiding any anamolies in probing from
2e2c009bSjjc	 * messing up the lgroup topology by limiting the
2e2c009bSjjc	 * accuracy of the latency topology.
2e2c009bSjjc	 *
2e2c009bSjjc	 * Assume that nodes will at least be configured in a
2e2c009bSjjc	 * ring, so limit height of lgroup topology to be less
2e2c009bSjjc	 * than number of nodes on a system with 4 or more
2e2c009bSjjc	 * nodes
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
2e2c009bSjjc	    lgrp_topo_ht_limit_default())
2e2c009bSjjc		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
2e2c009bSjjc#endif	/* __xpv */
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return latency between "from" and "to" lgroups
2e2c009bSjjc *
2e2c009bSjjc * This latency number can only be used for relative comparison
2e2c009bSjjc * between lgroups on the running system, cannot be used across platforms,
2e2c009bSjjc * and may not reflect the actual latency.  It is platform and implementation
2e2c009bSjjc * specific, so platform gets to decide its value.  It would be nice if the
2e2c009bSjjc * number was at least proportional to make comparisons more meaningful though.
2e2c009bSjjc */
2e2c009bSjjc/* ARGSUSED */
2e2c009bSjjcint
2e2c009bSjjclgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
2e2c009bSjjc{
2e2c009bSjjc	lgrp_handle_t	src, dest;
2e2c009bSjjc	int		node;
2e2c009bSjjc
2e2c009bSjjc	if (max_mem_nodes == 1)
2e2c009bSjjc		return (0);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Return max latency for root lgroup
2e2c009bSjjc	 */
2e2c009bSjjc	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
2e2c009bSjjc		return (lgrp_plat_lat_stats.latency_max);
2e2c009bSjjc
2e2c009bSjjc	src = from;
2e2c009bSjjc	dest = to;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Return 0 for nodes (lgroup platform handles) out of range
2e2c009bSjjc	 */
2e2c009bSjjc	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
2e2c009bSjjc		return (0);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Probe from current CPU if its lgroup latencies haven't been set yet
2e2c009bSjjc	 * and we are trying to get latency from current CPU to some node
2e2c009bSjjc	 */
2e2c009bSjjc	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
2e2c009bSjjc	ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
2e2c009bSjjc	if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src)
2e2c009bSjjc		lgrp_plat_probe();
2e2c009bSjjc
2e2c009bSjjc	return (lgrp_plat_lat_stats.latencies[src][dest]);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Platform-specific initialization
2e2c009bSjjc */
2e2c009bSjjcvoid
2e2c009bSjjclgrp_plat_main_init(void)
2e2c009bSjjc{
2e2c009bSjjc	int	curnode;
2e2c009bSjjc	int	ht_limit;
2e2c009bSjjc	int	i;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Print a notice that MPO is disabled when memory is interleaved
2e2c009bSjjc	 * across nodes....Would do this when it is discovered, but can't
2e2c009bSjjc	 * because it happens way too early during boot....
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_plat_mem_intrlv)
2e2c009bSjjc		cmn_err(CE_NOTE,
2e2c009bSjjc		    "MPO disabled because memory is interleaved\n");
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Don't bother to do any probing if it is disabled, there is only one
2e2c009bSjjc	 * node, or the height of the lgroup topology less than or equal to 2
2e2c009bSjjc	 */
2e2c009bSjjc	ht_limit = lgrp_topo_ht_limit();
2e2c009bSjjc	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2e2c009bSjjc	    max_mem_nodes == 1 || ht_limit <= 2) {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Setup lgroup latencies for 2 level lgroup topology
2e2c009bSjjc		 * (ie. local and remote only) if they haven't been set yet
2e2c009bSjjc		 */
2e2c009bSjjc		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
2e2c009bSjjc		    lgrp_plat_lat_stats.latency_max == 0)
2e2c009bSjjc			lgrp_plat_2level_setup(lgrp_plat_node_memory,
2e2c009bSjjc			    &lgrp_plat_lat_stats);
2e2c009bSjjc		return;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Should have been able to probe from CPU 0 when it was added
2e2c009bSjjc		 * to lgroup hierarchy, but may not have been able to then
2e2c009bSjjc		 * because it happens so early in boot that gethrtime() hasn't
2e2c009bSjjc		 * been initialized.  (:-(
2e2c009bSjjc		 */
2e2c009bSjjc		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
2e2c009bSjjc		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
2e2c009bSjjc		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
2e2c009bSjjc			lgrp_plat_probe();
2e2c009bSjjc
2e2c009bSjjc		return;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * When probing memory, use one page for every sample to determine
2e2c009bSjjc	 * lgroup topology and taking multiple samples
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
2e2c009bSjjc		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
2e2c009bSjjc		    lgrp_plat_probe_nsamples;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Map memory in each node needed for probing to determine latency
2e2c009bSjjc	 * topology
2e2c009bSjjc	 */
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		int	mnode;
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Skip this node and leave its probe page NULL
2e2c009bSjjc		 * if it doesn't have any memory
2e2c009bSjjc		 */
2e2c009bSjjc		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
2e2c009bSjjc		if (!mem_node_config[mnode].exists) {
2e2c009bSjjc			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
2e2c009bSjjc			continue;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Allocate one kernel virtual page
2e2c009bSjjc		 */
2e2c009bSjjc		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
2e2c009bSjjc		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
2e2c009bSjjc		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
2e2c009bSjjc			cmn_err(CE_WARN,
2e2c009bSjjc			    "lgrp_plat_main_init: couldn't allocate memory");
2e2c009bSjjc			return;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Get PFN for first page in each node
2e2c009bSjjc		 */
2e2c009bSjjc		lgrp_plat_probe_mem_config.probe_pfn[i] =
2e2c009bSjjc		    mem_node_config[mnode].physbase;
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Map virtual page to first page in node
2e2c009bSjjc		 */
2e2c009bSjjc		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
2e2c009bSjjc		    lgrp_plat_probe_mem_config.probe_memsize,
2e2c009bSjjc		    lgrp_plat_probe_mem_config.probe_pfn[i],
2e2c009bSjjc		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
2e2c009bSjjc		    HAT_LOAD_NOCONSIST);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Probe from current CPU
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_plat_probe();
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return the maximum number of lgrps supported by the platform.
2e2c009bSjjc * Before lgrp topology is known it returns an estimate based on the number of
2e2c009bSjjc * nodes. Once topology is known it returns the actual maximim number of lgrps
2e2c009bSjjc * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and
2e2c009bSjjc * dynamic addition of new nodes, this number may not grow during system
2e2c009bSjjc * lifetime (yet).
2e2c009bSjjc */
2e2c009bSjjcint
2e2c009bSjjclgrp_plat_max_lgrps(void)
2e2c009bSjjc{
2e2c009bSjjc	return (lgrp_topo_initialized ?
2e2c009bSjjc	    lgrp_alloc_max + 1 :
2e2c009bSjjc	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return the number of free pages in an lgroup.
2e2c009bSjjc *
2e2c009bSjjc * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
2e2c009bSjjc * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
2e2c009bSjjc * number of allocatable base pagesize pages corresponding to the
2e2c009bSjjc * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
2e2c009bSjjc * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
2e2c009bSjjc * memory installed, regardless of whether or not it's usable.
2e2c009bSjjc */
2e2c009bSjjcpgcnt_t
2e2c009bSjjclgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
2e2c009bSjjc{
2e2c009bSjjc	int	mnode;
2e2c009bSjjc	pgcnt_t npgs = (pgcnt_t)0;
2e2c009bSjjc	extern struct memlist *phys_avail;
2e2c009bSjjc	extern struct memlist *phys_install;
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc	if (plathand == LGRP_DEFAULT_HANDLE)
2e2c009bSjjc		return (lgrp_plat_mem_size_default(plathand, query));
2e2c009bSjjc
2e2c009bSjjc	if (plathand != LGRP_NULL_HANDLE) {
2e2c009bSjjc		mnode = plat_lgrphand_to_mem_node(plathand);
2e2c009bSjjc		if (mnode >= 0 && mem_node_config[mnode].exists) {
2e2c009bSjjc			switch (query) {
2e2c009bSjjc			case LGRP_MEM_SIZE_FREE:
2e2c009bSjjc				npgs = MNODE_PGCNT(mnode);
2e2c009bSjjc				break;
2e2c009bSjjc			case LGRP_MEM_SIZE_AVAIL:
2e2c009bSjjc				npgs = mem_node_memlist_pages(mnode,
2e2c009bSjjc				    phys_avail);
2e2c009bSjjc				break;
2e2c009bSjjc			case LGRP_MEM_SIZE_INSTALL:
2e2c009bSjjc				npgs = mem_node_memlist_pages(mnode,
2e2c009bSjjc				    phys_install);
2e2c009bSjjc				break;
2e2c009bSjjc			default:
2e2c009bSjjc				break;
2e2c009bSjjc			}
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc	return (npgs);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return the platform handle of the lgroup that contains the physical memory
2e2c009bSjjc * corresponding to the given page frame number
2e2c009bSjjc */
2e2c009bSjjc/* ARGSUSED */
2e2c009bSjjclgrp_handle_t
2e2c009bSjjclgrp_plat_pfn_to_hand(pfn_t pfn)
2e2c009bSjjc{
2e2c009bSjjc	int	mnode;
2e2c009bSjjc
2e2c009bSjjc	if (max_mem_nodes == 1)
2e2c009bSjjc		return (LGRP_DEFAULT_HANDLE);
2e2c009bSjjc
2e2c009bSjjc	if (pfn > physmax)
2e2c009bSjjc		return (LGRP_NULL_HANDLE);
2e2c009bSjjc
2e2c009bSjjc	mnode = plat_pfn_to_mem_node(pfn);
2e2c009bSjjc	if (mnode < 0)
2e2c009bSjjc		return (LGRP_NULL_HANDLE);
2e2c009bSjjc
2e2c009bSjjc	return (MEM_NODE_2_LGRPHAND(mnode));
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Probe memory in each node from current CPU to determine latency topology
2e2c009bSjjc *
2e2c009bSjjc * The probing code will probe the vendor ID register on the Northbridge of
2e2c009bSjjc * Opteron processors and probe memory for other processors by default.
2e2c009bSjjc *
2e2c009bSjjc * Since probing is inherently error prone, the code takes laps across all the
2e2c009bSjjc * nodes probing from each node to each of the other nodes some number of
2e2c009bSjjc * times.  Furthermore, each node is probed some number of times before moving
2e2c009bSjjc * onto the next one during each lap.  The minimum latency gotten between nodes
2e2c009bSjjc * is kept as the latency between the nodes.
2e2c009bSjjc *
2e2c009bSjjc * After all that,  the probe times are adjusted by normalizing values that are
2e2c009bSjjc * close to each other and local latencies are made the same.  Lastly, the
2e2c009bSjjc * latencies are verified to make sure that certain conditions are met (eg.
2e2c009bSjjc * local < remote, latency(a, b) == latency(b, a), etc.).
2e2c009bSjjc *
2e2c009bSjjc * If any of the conditions aren't met, the code will export a NUMA
2e2c009bSjjc * configuration with the local CPUs and memory given by the SRAT or PCI config
2e2c009bSjjc * space registers and one remote memory latency since it can't tell exactly
2e2c009bSjjc * how far each node is from each other.
2e2c009bSjjc */
2e2c009bSjjcvoid
2e2c009bSjjclgrp_plat_probe(void)
2e2c009bSjjc{
2e2c009bSjjc	int				from;
2e2c009bSjjc	int				i;
2e2c009bSjjc	lgrp_plat_latency_stats_t	*lat_stats;
2e2c009bSjjc	hrtime_t			probe_time;
2e2c009bSjjc	int				to;
2e2c009bSjjc
2e2c009bSjjc	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2e2c009bSjjc	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
2e2c009bSjjc		return;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Determine ID of node containing current CPU
2e2c009bSjjc	 */
2e2c009bSjjc	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
2e2c009bSjjc	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
2e2c009bSjjc	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
2e2c009bSjjc		ASSERT(lgrp_plat_node_domain[from].exists);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Don't need to probe if got times already
2e2c009bSjjc	 */
2e2c009bSjjc	lat_stats = &lgrp_plat_lat_stats;
2e2c009bSjjc	if (lat_stats->latencies[from][from] != 0)
2e2c009bSjjc		return;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Read vendor ID in Northbridge or read and write page(s)
2e2c009bSjjc	 * in each node from current CPU and remember how long it takes,
2e2c009bSjjc	 * so we can build latency topology of machine later.
2e2c009bSjjc	 * This should approximate the memory latency between each node.
2e2c009bSjjc	 */
2e2c009bSjjc	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
2e2c009bSjjc		for (to = 0; to < lgrp_plat_node_cnt; to++) {
2e2c009bSjjc			/*
2e2c009bSjjc			 * Get probe time and bail out if can't get it yet
2e2c009bSjjc			 */
2e2c009bSjjc			probe_time = lgrp_plat_probe_time(to,
2e2c009bSjjc			    lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config,
2e2c009bSjjc			    &lgrp_plat_lat_stats, &lgrp_plat_probe_stats);
2e2c009bSjjc			if (probe_time == 0)
2e2c009bSjjc				return;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Keep lowest probe time as latency between nodes
2e2c009bSjjc			 */
2e2c009bSjjc			if (lat_stats->latencies[from][to] == 0 ||
2e2c009bSjjc			    probe_time < lat_stats->latencies[from][to])
2e2c009bSjjc				lat_stats->latencies[from][to] = probe_time;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Update overall minimum and maximum probe times
2e2c009bSjjc			 * across all nodes
2e2c009bSjjc			 */
2e2c009bSjjc			if (probe_time < lat_stats->latency_min ||
2e2c009bSjjc			    lat_stats->latency_min == -1)
2e2c009bSjjc				lat_stats->latency_min = probe_time;
2e2c009bSjjc			if (probe_time > lat_stats->latency_max)
2e2c009bSjjc				lat_stats->latency_max = probe_time;
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * - Fix up latencies such that local latencies are same,
2e2c009bSjjc	 *   latency(i, j) == latency(j, i), etc. (if possible)
2e2c009bSjjc	 *
2e2c009bSjjc	 * - Verify that latencies look ok
2e2c009bSjjc	 *
2e2c009bSjjc	 * - Fallback to just optimizing for local and remote if
2e2c009bSjjc	 *   latencies didn't look right
2e2c009bSjjc	 */
2e2c009bSjjc	lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats,
2e2c009bSjjc	    &lgrp_plat_probe_stats);
2e2c009bSjjc	lgrp_plat_probe_stats.probe_error_code =
2e2c009bSjjc	    lgrp_plat_latency_verify(lgrp_plat_node_memory,
2e2c009bSjjc	    &lgrp_plat_lat_stats);
2e2c009bSjjc	if (lgrp_plat_probe_stats.probe_error_code)
2e2c009bSjjc		lgrp_plat_2level_setup(lgrp_plat_node_memory,
2e2c009bSjjc		    &lgrp_plat_lat_stats);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return platform handle for root lgroup
2e2c009bSjjc */
2e2c009bSjjclgrp_handle_t
2e2c009bSjjclgrp_plat_root_hand(void)
2e2c009bSjjc{
2e2c009bSjjc	return (LGRP_DEFAULT_HANDLE);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * INTERNAL ROUTINES
2e2c009bSjjc */
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Update CPU to node mapping for given CPU and proximity domain (and returns
2e2c009bSjjc * negative numbers for errors and positive ones for success)
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
2e2c009bSjjc    cpu_node_map_t *cpu_node, uint32_t apicid, uint32_t domain)
2e2c009bSjjc{
2e2c009bSjjc	uint_t	i;
2e2c009bSjjc	uint_t	start;
2e2c009bSjjc	int	node;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Get node number for proximity domain
2e2c009bSjjc	 */
2e2c009bSjjc	node = lgrp_plat_domain_to_node(node_domain, domain);
2e2c009bSjjc	if (node == -1) {
2e2c009bSjjc		node = lgrp_plat_node_domain_update(node_domain, domain);
2e2c009bSjjc		if (node == -1)
2e2c009bSjjc			return (-1);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Hash given CPU APIC ID into CPU to node mapping table/array and
2e2c009bSjjc	 * enter it and its corresponding node and proximity domain IDs into
2e2c009bSjjc	 * first non-existent or matching entry
2e2c009bSjjc	 */
2e2c009bSjjc	i = start = CPU_NODE_HASH(apicid);
2e2c009bSjjc	do {
2e2c009bSjjc		if (cpu_node[i].exists) {
2e2c009bSjjc			/*
2e2c009bSjjc			 * Update already existing entry for CPU
2e2c009bSjjc			 */
2e2c009bSjjc			if (cpu_node[i].apicid == apicid) {
2e2c009bSjjc				/*
2e2c009bSjjc				 * Just return when everything same
2e2c009bSjjc				 */
2e2c009bSjjc				if (cpu_node[i].prox_domain == domain &&
2e2c009bSjjc				    cpu_node[i].node == node)
2e2c009bSjjc					return (1);
2e2c009bSjjc
2e2c009bSjjc				/*
2e2c009bSjjc				 * Assert that proximity domain and node IDs
2e2c009bSjjc				 * should be same and return error on non-debug
2e2c009bSjjc				 * kernel
2e2c009bSjjc				 */
2e2c009bSjjc				ASSERT(cpu_node[i].prox_domain == domain &&
2e2c009bSjjc				    cpu_node[i].node == node);
2e2c009bSjjc				return (-1);
2e2c009bSjjc			}
2e2c009bSjjc		} else {
2e2c009bSjjc			/*
2e2c009bSjjc			 * Create new entry for CPU
2e2c009bSjjc			 */
2e2c009bSjjc			cpu_node[i].exists = 1;
2e2c009bSjjc			cpu_node[i].apicid = apicid;
2e2c009bSjjc			cpu_node[i].prox_domain = domain;
2e2c009bSjjc			cpu_node[i].node = node;
2e2c009bSjjc			return (0);
2e2c009bSjjc		}
2e2c009bSjjc		i = CPU_NODE_HASH(i + 1);
2e2c009bSjjc	} while (i != start);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Ran out of supported number of entries which shouldn't happen....
2e2c009bSjjc	 */
2e2c009bSjjc	ASSERT(i != start);
2e2c009bSjjc	return (-1);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Get node ID for given CPU ID
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node)
2e2c009bSjjc{
2e2c009bSjjc	uint32_t	apicid;
2e2c009bSjjc	uint_t		i;
2e2c009bSjjc	uint_t		start;
2e2c009bSjjc
2e2c009bSjjc	if (cp == NULL)
2e2c009bSjjc		return (-1);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * SRAT doesn't exist, isn't enabled, or there was an error processing
2e2c009bSjjc	 * it, so return chip ID for Opteron and -1 otherwise.
2e2c009bSjjc	 */
2e2c009bSjjc	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
2e2c009bSjjc	    lgrp_plat_srat_error) {
2e2c009bSjjc		if (is_opteron())
2e2c009bSjjc			return (pg_plat_hw_instance_id(cp, PGHW_CHIP));
2e2c009bSjjc		return (-1);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * SRAT does exist, so get APIC ID for given CPU and map that to its
2e2c009bSjjc	 * node ID
2e2c009bSjjc	 */
2e2c009bSjjc	apicid = cpuid_get_apicid(cp);
2e2c009bSjjc	i = start = CPU_NODE_HASH(apicid);
2e2c009bSjjc	do {
2e2c009bSjjc		if (cpu_node[i].apicid == apicid && cpu_node[i].exists)
2e2c009bSjjc			return (cpu_node[i].node);
2e2c009bSjjc		i = CPU_NODE_HASH(i + 1);
2e2c009bSjjc	} while (i != start);
2e2c009bSjjc	return (-1);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return node number for given proximity domain/system locality
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_domain_to_node(node_domain_map_t *node_domain, uint32_t domain)
2e2c009bSjjc{
2e2c009bSjjc	uint_t	node;
2e2c009bSjjc	uint_t	start;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Hash proximity domain ID into node to domain mapping table (array),
2e2c009bSjjc	 * search for entry with matching proximity domain ID, and return index
2e2c009bSjjc	 * of matching entry as node ID.
2e2c009bSjjc	 */
2e2c009bSjjc	node = start = NODE_DOMAIN_HASH(domain);
2e2c009bSjjc	do {
2e2c009bSjjc		if (node_domain[node].prox_domain == domain &&
2e2c009bSjjc		    node_domain[node].exists)
2e2c009bSjjc			return (node);
2e2c009bSjjc		node = NODE_DOMAIN_HASH(node + 1);
2e2c009bSjjc	} while (node != start);
2e2c009bSjjc	return (-1);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
/*
 * Two latencies are considered the same when they differ by no more than
 * 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of the latency they are compared against
 * (ie. the difference is compared with "latency >> shift").
 *
 * lgrp_plat_probe_lt_shift holds the shift actually used by the latency
 * adjustment code and defaults to LGRP_LAT_TOLERANCE_SHIFT.
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Adjust latencies between nodes to be symmetric, normalize latencies between
2e2c009bSjjc * any nodes that are within some tolerance to be same, and make local
2e2c009bSjjc * latencies be same
2e2c009bSjjc */
2e2c009bSjjcstatic void
2e2c009bSjjclgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2e2c009bSjjc{
2e2c009bSjjc	int				i;
2e2c009bSjjc	int				j;
2e2c009bSjjc	int				k;
2e2c009bSjjc	int				l;
2e2c009bSjjc	u_longlong_t			max;
2e2c009bSjjc	u_longlong_t			min;
2e2c009bSjjc	u_longlong_t			t;
2e2c009bSjjc	u_longlong_t			t1;
2e2c009bSjjc	u_longlong_t			t2;
2e2c009bSjjc	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
2e2c009bSjjc	int				lat_corrected[MAX_NODES][MAX_NODES];
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Nothing to do when this is an UMA machine or don't have args needed
2e2c009bSjjc	 */
2e2c009bSjjc	if (max_mem_nodes == 1)
2e2c009bSjjc		return;
2e2c009bSjjc
2e2c009bSjjc	ASSERT(node_memory != NULL && lat_stats != NULL &&
2e2c009bSjjc	    probe_stats != NULL);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Make sure that latencies are symmetric between any two nodes
2e2c009bSjjc	 * (ie. latency(node0, node1) == latency(node1, node0))
2e2c009bSjjc	 */
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc
2e2c009bSjjc		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2e2c009bSjjc			if (!node_memory[j].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc
2e2c009bSjjc			t1 = lat_stats->latencies[i][j];
2e2c009bSjjc			t2 = lat_stats->latencies[j][i];
2e2c009bSjjc
2e2c009bSjjc			if (t1 == 0 || t2 == 0 || t1 == t2)
2e2c009bSjjc				continue;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Latencies should be same
2e2c009bSjjc			 * - Use minimum of two latencies which should be same
2e2c009bSjjc			 * - Track suspect probe times not within tolerance of
2e2c009bSjjc			 *   min value
2e2c009bSjjc			 * - Remember how much values are corrected by
2e2c009bSjjc			 */
2e2c009bSjjc			if (t1 > t2) {
2e2c009bSjjc				t = t2;
2e2c009bSjjc				probe_stats->probe_errors[i][j] += t1 - t2;
2e2c009bSjjc				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
2e2c009bSjjc					probe_stats->probe_suspect[i][j]++;
2e2c009bSjjc					probe_stats->probe_suspect[j][i]++;
2e2c009bSjjc				}
2e2c009bSjjc			} else if (t2 > t1) {
2e2c009bSjjc				t = t1;
2e2c009bSjjc				probe_stats->probe_errors[j][i] += t2 - t1;
2e2c009bSjjc				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
2e2c009bSjjc					probe_stats->probe_suspect[i][j]++;
2e2c009bSjjc					probe_stats->probe_suspect[j][i]++;
2e2c009bSjjc				}
2e2c009bSjjc			}
2e2c009bSjjc
2e2c009bSjjc			lat_stats->latencies[i][j] =
2e2c009bSjjc			    lat_stats->latencies[j][i] = t;
2e2c009bSjjc			lgrp_config(cflag, t1, t);
2e2c009bSjjc			lgrp_config(cflag, t2, t);
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Keep track of which latencies get corrected
2e2c009bSjjc	 */
2e2c009bSjjc	for (i = 0; i < MAX_NODES; i++)
2e2c009bSjjc		for (j = 0; j < MAX_NODES; j++)
2e2c009bSjjc			lat_corrected[i][j] = 0;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * For every two nodes, see whether there is another pair of nodes which
2e2c009bSjjc	 * are about the same distance apart and make the latencies be the same
2e2c009bSjjc	 * if they are close enough together
2e2c009bSjjc	 */
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2e2c009bSjjc			if (!node_memory[j].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc			/*
2e2c009bSjjc			 * Pick one pair of nodes (i, j)
2e2c009bSjjc			 * and get latency between them
2e2c009bSjjc			 */
2e2c009bSjjc			t1 = lat_stats->latencies[i][j];
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Skip this pair of nodes if there isn't a latency
2e2c009bSjjc			 * for it yet
2e2c009bSjjc			 */
2e2c009bSjjc			if (t1 == 0)
2e2c009bSjjc				continue;
2e2c009bSjjc
2e2c009bSjjc			for (k = 0; k < lgrp_plat_node_cnt; k++) {
2e2c009bSjjc				if (!node_memory[k].exists)
2e2c009bSjjc					continue;
2e2c009bSjjc				for (l = 0; l < lgrp_plat_node_cnt; l++) {
2e2c009bSjjc					if (!node_memory[l].exists)
2e2c009bSjjc						continue;
2e2c009bSjjc					/*
2e2c009bSjjc					 * Pick another pair of nodes (k, l)
2e2c009bSjjc					 * not same as (i, j) and get latency
2e2c009bSjjc					 * between them
2e2c009bSjjc					 */
2e2c009bSjjc					if (k == i && l == j)
2e2c009bSjjc						continue;
2e2c009bSjjc
2e2c009bSjjc					t2 = lat_stats->latencies[k][l];
2e2c009bSjjc
2e2c009bSjjc					/*
2e2c009bSjjc					 * Skip this pair of nodes if there
2e2c009bSjjc					 * isn't a latency for it yet
2e2c009bSjjc					 */
2e2c009bSjjc
2e2c009bSjjc					if (t2 == 0)
2e2c009bSjjc						continue;
2e2c009bSjjc
2e2c009bSjjc					/*
2e2c009bSjjc					 * Skip nodes (k, l) if they already
2e2c009bSjjc					 * have same latency as (i, j) or
2e2c009bSjjc					 * their latency isn't close enough to
2e2c009bSjjc					 * be considered/made the same
2e2c009bSjjc					 */
2e2c009bSjjc					if (t1 == t2 || (t1 > t2 && t1 - t2 >
2e2c009bSjjc					    t1 >> lgrp_plat_probe_lt_shift) ||
2e2c009bSjjc					    (t2 > t1 && t2 - t1 >
2e2c009bSjjc					    t2 >> lgrp_plat_probe_lt_shift))
2e2c009bSjjc						continue;
2e2c009bSjjc
2e2c009bSjjc					/*
2e2c009bSjjc					 * Make latency(i, j) same as
2e2c009bSjjc					 * latency(k, l), try to use latency
2e2c009bSjjc					 * that has been adjusted already to get
2e2c009bSjjc					 * more consistency (if possible), and
2e2c009bSjjc					 * remember which latencies were
2e2c009bSjjc					 * adjusted for next time
2e2c009bSjjc					 */
2e2c009bSjjc					if (lat_corrected[i][j]) {
2e2c009bSjjc						t = t1;
2e2c009bSjjc						lgrp_config(cflag, t2, t);
2e2c009bSjjc						t2 = t;
2e2c009bSjjc					} else if (lat_corrected[k][l]) {
2e2c009bSjjc						t = t2;
2e2c009bSjjc						lgrp_config(cflag, t1, t);
2e2c009bSjjc						t1 = t;
2e2c009bSjjc					} else {
2e2c009bSjjc						if (t1 > t2)
2e2c009bSjjc							t = t2;
2e2c009bSjjc						else
2e2c009bSjjc							t = t1;
2e2c009bSjjc						lgrp_config(cflag, t1, t);
2e2c009bSjjc						lgrp_config(cflag, t2, t);
2e2c009bSjjc						t1 = t2 = t;
2e2c009bSjjc					}
2e2c009bSjjc
2e2c009bSjjc					lat_stats->latencies[i][j] =
2e2c009bSjjc					    lat_stats->latencies[k][l] = t;
2e2c009bSjjc
2e2c009bSjjc					lat_corrected[i][j] =
2e2c009bSjjc					    lat_corrected[k][l] = 1;
2e2c009bSjjc				}
2e2c009bSjjc			}
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Local latencies should be same
2e2c009bSjjc	 * - Find min and max local latencies
2e2c009bSjjc	 * - Make all local latencies be minimum
2e2c009bSjjc	 */
2e2c009bSjjc	min = -1;
2e2c009bSjjc	max = 0;
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc		t = lat_stats->latencies[i][i];
2e2c009bSjjc		if (t == 0)
2e2c009bSjjc			continue;
2e2c009bSjjc		if (min == -1 || t < min)
2e2c009bSjjc			min = t;
2e2c009bSjjc		if (t > max)
2e2c009bSjjc			max = t;
2e2c009bSjjc	}
2e2c009bSjjc	if (min != max) {
2e2c009bSjjc		for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc			int	local;
2e2c009bSjjc
2e2c009bSjjc			if (!node_memory[i].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc
2e2c009bSjjc			local = lat_stats->latencies[i][i];
2e2c009bSjjc			if (local == 0)
2e2c009bSjjc				continue;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Track suspect probe times that aren't within
2e2c009bSjjc			 * tolerance of minimum local latency and how much
2e2c009bSjjc			 * probe times are corrected by
2e2c009bSjjc			 */
2e2c009bSjjc			if (local - min > min >> lgrp_plat_probe_lt_shift)
2e2c009bSjjc				probe_stats->probe_suspect[i][i]++;
2e2c009bSjjc
2e2c009bSjjc			probe_stats->probe_errors[i][i] += local - min;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Make local latencies be minimum
2e2c009bSjjc			 */
2e2c009bSjjc			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
2e2c009bSjjc			lat_stats->latencies[i][i] = min;
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Determine max probe time again since just adjusted latencies
2e2c009bSjjc	 */
2e2c009bSjjc	lat_stats->latency_max = 0;
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2e2c009bSjjc			if (!node_memory[j].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc			t = lat_stats->latencies[i][j];
2e2c009bSjjc			if (t > lat_stats->latency_max)
2e2c009bSjjc				lat_stats->latency_max = t;
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Verify following about latencies between nodes:
2e2c009bSjjc *
2e2c009bSjjc * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
2e2c009bSjjc * - Local latencies same
2e2c009bSjjc * - Local < remote
2e2c009bSjjc * - Number of latencies seen is reasonable
2e2c009bSjjc * - Number of occurrences of a given latency should be more than 1
2e2c009bSjjc *
2e2c009bSjjc * Returns:
2e2c009bSjjc *	0	Success
2e2c009bSjjc *	-1	Not symmetric
2e2c009bSjjc *	-2	Local latencies not same
2e2c009bSjjc *	-3	Local >= remote
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats)
2e2c009bSjjc{
2e2c009bSjjc	int				i;
2e2c009bSjjc	int				j;
2e2c009bSjjc	u_longlong_t			t1;
2e2c009bSjjc	u_longlong_t			t2;
2e2c009bSjjc
2e2c009bSjjc	ASSERT(node_memory != NULL && lat_stats != NULL);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Nothing to do when this is an UMA machine, lgroup topology is
2e2c009bSjjc	 * limited to 2 levels, or there aren't any probe times yet
2e2c009bSjjc	 */
2e2c009bSjjc	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
2e2c009bSjjc	    lat_stats->latencies[0][0] == 0)
2e2c009bSjjc		return (0);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Make sure that latencies are symmetric between any two nodes
2e2c009bSjjc	 * (ie. latency(node0, node1) == latency(node1, node0))
2e2c009bSjjc	 */
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2e2c009bSjjc			if (!node_memory[j].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc			t1 = lat_stats->latencies[i][j];
2e2c009bSjjc			t2 = lat_stats->latencies[j][i];
2e2c009bSjjc
2e2c009bSjjc			if (t1 == 0 || t2 == 0 || t1 == t2)
2e2c009bSjjc				continue;
2e2c009bSjjc
2e2c009bSjjc			return (-1);
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Local latencies should be same
2e2c009bSjjc	 */
2e2c009bSjjc	t1 = lat_stats->latencies[0][0];
2e2c009bSjjc	for (i = 1; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc
2e2c009bSjjc		t2 = lat_stats->latencies[i][i];
2e2c009bSjjc		if (t2 == 0)
2e2c009bSjjc			continue;
2e2c009bSjjc
2e2c009bSjjc		if (t1 == 0) {
2e2c009bSjjc			t1 = t2;
2e2c009bSjjc			continue;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		if (t1 != t2)
2e2c009bSjjc			return (-2);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Local latencies should be less than remote
2e2c009bSjjc	 */
2e2c009bSjjc	if (t1) {
2e2c009bSjjc		for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc			if (!node_memory[i].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc			for (j = 0; j < lgrp_plat_node_cnt; j++) {
2e2c009bSjjc				if (!node_memory[j].exists)
2e2c009bSjjc					continue;
2e2c009bSjjc				t2 = lat_stats->latencies[i][j];
2e2c009bSjjc				if (i == j || t2 == 0)
2e2c009bSjjc					continue;
2e2c009bSjjc
2e2c009bSjjc				if (t1 >= t2)
2e2c009bSjjc					return (-3);
2e2c009bSjjc			}
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	return (0);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return the number of free, allocatable, or installed
2e2c009bSjjc * pages in an lgroup
2e2c009bSjjc * This is a copy of the MAX_MEM_NODES == 1 version of the routine
2e2c009bSjjc * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
2e2c009bSjjc */
2e2c009bSjjc/* ARGSUSED */
2e2c009bSjjcstatic pgcnt_t
2e2c009bSjjclgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
2e2c009bSjjc{
2e2c009bSjjc	struct memlist *mlist;
2e2c009bSjjc	pgcnt_t npgs = 0;
2e2c009bSjjc	extern struct memlist *phys_avail;
2e2c009bSjjc	extern struct memlist *phys_install;
2e2c009bSjjc
2e2c009bSjjc	switch (query) {
2e2c009bSjjc	case LGRP_MEM_SIZE_FREE:
2e2c009bSjjc		return ((pgcnt_t)freemem);
2e2c009bSjjc	case LGRP_MEM_SIZE_AVAIL:
2e2c009bSjjc		memlist_read_lock();
2e2c009bSjjc		for (mlist = phys_avail; mlist; mlist = mlist->next)
2e2c009bSjjc			npgs += btop(mlist->size);
2e2c009bSjjc		memlist_read_unlock();
2e2c009bSjjc		return (npgs);
2e2c009bSjjc	case LGRP_MEM_SIZE_INSTALL:
2e2c009bSjjc		memlist_read_lock();
2e2c009bSjjc		for (mlist = phys_install; mlist; mlist = mlist->next)
2e2c009bSjjc			npgs += btop(mlist->size);
2e2c009bSjjc		memlist_read_unlock();
2e2c009bSjjc		return (npgs);
2e2c009bSjjc	default:
2e2c009bSjjc		return ((pgcnt_t)0);
2e2c009bSjjc	}
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Update node to proximity domain mappings for given domain and return node ID
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_node_domain_update(node_domain_map_t *node_domain, uint32_t domain)
2e2c009bSjjc{
2e2c009bSjjc	uint_t	node;
2e2c009bSjjc	uint_t	start;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Hash proximity domain ID into node to domain mapping table (array)
2e2c009bSjjc	 * and add entry for it into first non-existent or matching entry found
2e2c009bSjjc	 */
2e2c009bSjjc	node = start = NODE_DOMAIN_HASH(domain);
2e2c009bSjjc	do {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Entry doesn't exist yet, so create one for this proximity
2e2c009bSjjc		 * domain and return node ID which is index into mapping table.
2e2c009bSjjc		 */
2e2c009bSjjc		if (!node_domain[node].exists) {
2e2c009bSjjc			node_domain[node].exists = 1;
2e2c009bSjjc			node_domain[node].prox_domain = domain;
2e2c009bSjjc			return (node);
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Entry exists for this proximity domain already, so just
2e2c009bSjjc		 * return node ID (index into table).
2e2c009bSjjc		 */
2e2c009bSjjc		if (node_domain[node].prox_domain == domain)
2e2c009bSjjc			return (node);
2e2c009bSjjc		node = NODE_DOMAIN_HASH(node + 1);
2e2c009bSjjc	} while (node != start);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Ran out of supported number of entries which shouldn't happen....
2e2c009bSjjc	 */
2e2c009bSjjc	ASSERT(node != start);
2e2c009bSjjc	return (-1);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Update node memory information for given proximity domain with specified
2e2c009bSjjc * starting and ending physical address range (and return positive numbers for
2e2c009bSjjc * success and negative ones for errors)
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_node_memory_update(node_domain_map_t *node_domain,
*e9dd3ea3Sjjc    node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
2e2c009bSjjc    uint32_t domain)
2e2c009bSjjc{
2e2c009bSjjc	int	node;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Get node number for proximity domain
2e2c009bSjjc	 */
2e2c009bSjjc	node = lgrp_plat_domain_to_node(node_domain, domain);
2e2c009bSjjc	if (node == -1) {
2e2c009bSjjc		node = lgrp_plat_node_domain_update(node_domain, domain);
2e2c009bSjjc		if (node == -1)
2e2c009bSjjc			return (-1);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Create entry in table for node if it doesn't exist
2e2c009bSjjc	 */
2e2c009bSjjc	if (!node_memory[node].exists) {
2e2c009bSjjc		node_memory[node].exists = 1;
2e2c009bSjjc		node_memory[node].start = btop(start);
2e2c009bSjjc		node_memory[node].end = btop(end);
2e2c009bSjjc		node_memory[node].prox_domain = domain;
2e2c009bSjjc		return (0);
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Entry already exists for this proximity domain
2e2c009bSjjc	 *
2e2c009bSjjc	 * There may be more than one SRAT memory entry for a domain, so we may
2e2c009bSjjc	 * need to update existing start or end address for the node.
2e2c009bSjjc	 */
2e2c009bSjjc	if (node_memory[node].prox_domain == domain) {
2e2c009bSjjc		if (btop(start) < node_memory[node].start)
2e2c009bSjjc			node_memory[node].start = btop(start);
2e2c009bSjjc		if (btop(end) > node_memory[node].end)
2e2c009bSjjc			node_memory[node].end = btop(end);
2e2c009bSjjc		return (1);
2e2c009bSjjc	}
2e2c009bSjjc	return (-2);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return time needed to probe from current CPU to memory in given node
2e2c009bSjjc */
2e2c009bSjjcstatic hrtime_t
2e2c009bSjjclgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
2e2c009bSjjc    lgrp_plat_probe_mem_config_t *probe_mem_config,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2e2c009bSjjc{
2e2c009bSjjc	caddr_t			buf;
2e2c009bSjjc	hrtime_t		elapsed;
2e2c009bSjjc	hrtime_t		end;
2e2c009bSjjc	int			from;
2e2c009bSjjc	int			i;
2e2c009bSjjc	int			ipl;
2e2c009bSjjc	hrtime_t		max;
2e2c009bSjjc	hrtime_t		min;
2e2c009bSjjc	hrtime_t		start;
2e2c009bSjjc	extern int		use_sse_pagecopy;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Determine ID of node containing current CPU
2e2c009bSjjc	 */
2e2c009bSjjc	from = lgrp_plat_cpu_to_node(CPU, cpu_node);
2e2c009bSjjc	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Do common work for probing main memory
2e2c009bSjjc	 */
2e2c009bSjjc	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Skip probing any nodes without memory and
2e2c009bSjjc		 * set probe time to 0
2e2c009bSjjc		 */
2e2c009bSjjc		if (probe_mem_config->probe_va[to] == NULL) {
2e2c009bSjjc			lat_stats->latencies[from][to] = 0;
2e2c009bSjjc			return (0);
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Invalidate caches once instead of once every sample
2e2c009bSjjc		 * which should cut cost of probing by a lot
2e2c009bSjjc		 */
2e2c009bSjjc		probe_stats->flush_cost = gethrtime();
2e2c009bSjjc		invalidate_cache();
2e2c009bSjjc		probe_stats->flush_cost = gethrtime() -
2e2c009bSjjc		    probe_stats->flush_cost;
2e2c009bSjjc		probe_stats->probe_cost_total += probe_stats->flush_cost;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Probe from current CPU to given memory using specified operation
2e2c009bSjjc	 * and take specified number of samples
2e2c009bSjjc	 */
2e2c009bSjjc	max = 0;
2e2c009bSjjc	min = -1;
2e2c009bSjjc	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
2e2c009bSjjc		probe_stats->probe_cost = gethrtime();
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Can't measure probe time if gethrtime() isn't working yet
2e2c009bSjjc		 */
2e2c009bSjjc		if (probe_stats->probe_cost == 0 && gethrtime() == 0)
2e2c009bSjjc			return (0);
2e2c009bSjjc
2e2c009bSjjc		if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2e2c009bSjjc			/*
2e2c009bSjjc			 * Measure how long it takes to read vendor ID from
2e2c009bSjjc			 * Northbridge
2e2c009bSjjc			 */
2e2c009bSjjc			elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
2e2c009bSjjc		} else {
2e2c009bSjjc			/*
2e2c009bSjjc			 * Measure how long it takes to copy page
2e2c009bSjjc			 * on top of itself
2e2c009bSjjc			 */
2e2c009bSjjc			buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
2e2c009bSjjc
2e2c009bSjjc			kpreempt_disable();
2e2c009bSjjc			ipl = splhigh();
2e2c009bSjjc			start = gethrtime();
2e2c009bSjjc			if (use_sse_pagecopy)
2e2c009bSjjc				hwblkpagecopy(buf, buf);
2e2c009bSjjc			else
2e2c009bSjjc				bcopy(buf, buf, PAGESIZE);
2e2c009bSjjc			end = gethrtime();
2e2c009bSjjc			elapsed = end - start;
2e2c009bSjjc			splx(ipl);
2e2c009bSjjc			kpreempt_enable();
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		probe_stats->probe_cost = gethrtime() -
2e2c009bSjjc		    probe_stats->probe_cost;
2e2c009bSjjc		probe_stats->probe_cost_total += probe_stats->probe_cost;
2e2c009bSjjc
2e2c009bSjjc		if (min == -1 || elapsed < min)
2e2c009bSjjc			min = elapsed;
2e2c009bSjjc		if (elapsed > max)
2e2c009bSjjc			max = elapsed;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Update minimum and maximum probe times between
2e2c009bSjjc	 * these two nodes
2e2c009bSjjc	 */
2e2c009bSjjc	if (min < probe_stats->probe_min[from][to] ||
2e2c009bSjjc	    probe_stats->probe_min[from][to] == 0)
2e2c009bSjjc		probe_stats->probe_min[from][to] = min;
2e2c009bSjjc
2e2c009bSjjc	if (max > probe_stats->probe_max[from][to])
2e2c009bSjjc		probe_stats->probe_max[from][to] = max;
2e2c009bSjjc
2e2c009bSjjc	return (min);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Read ACPI System Locality Information Table (SLIT) to determine how far each
2e2c009bSjjc * NUMA node is from each other
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
2e2c009bSjjc    node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats)
2e2c009bSjjc{
2e2c009bSjjc	int		i;
2e2c009bSjjc	int		j;
2e2c009bSjjc	int		localities;
2e2c009bSjjc	hrtime_t	max;
2e2c009bSjjc	hrtime_t	min;
2e2c009bSjjc	int		retval;
2e2c009bSjjc	uint8_t		*slit_entries;
2e2c009bSjjc
2e2c009bSjjc	if (tp == NULL || !lgrp_plat_slit_enable)
2e2c009bSjjc		return (1);
2e2c009bSjjc
2e2c009bSjjc	if (lat_stats == NULL)
2e2c009bSjjc		return (2);
2e2c009bSjjc
2e2c009bSjjc	localities = tp->number;
2e2c009bSjjc	if (localities != node_cnt)
2e2c009bSjjc		return (3);
2e2c009bSjjc
2e2c009bSjjc	min = lat_stats->latency_min;
2e2c009bSjjc	max = lat_stats->latency_max;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Fill in latency matrix based on SLIT entries
2e2c009bSjjc	 */
2e2c009bSjjc	slit_entries = tp->entry;
2e2c009bSjjc	for (i = 0; i < localities; i++) {
2e2c009bSjjc		for (j = 0; j < localities; j++) {
2e2c009bSjjc			uint8_t	latency;
2e2c009bSjjc
2e2c009bSjjc			latency = slit_entries[(i * localities) + j];
2e2c009bSjjc			lat_stats->latencies[i][j] = latency;
2e2c009bSjjc			if (latency < min)
2e2c009bSjjc				min = latency;
2e2c009bSjjc			if (latency > max)
2e2c009bSjjc				max = latency;
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Verify that latencies/distances given in SLIT look reasonable
2e2c009bSjjc	 */
2e2c009bSjjc	retval = lgrp_plat_latency_verify(node_memory, lat_stats);
2e2c009bSjjc
2e2c009bSjjc	if (retval) {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Reinitialize (zero) latency table since SLIT doesn't look
2e2c009bSjjc		 * right
2e2c009bSjjc		 */
2e2c009bSjjc		for (i = 0; i < localities; i++) {
2e2c009bSjjc			for (j = 0; j < localities; j++)
2e2c009bSjjc				lat_stats->latencies[i][j] = 0;
2e2c009bSjjc		}
2e2c009bSjjc	} else {
2e2c009bSjjc		/*
2e2c009bSjjc		 * Update min and max latencies seen since SLIT looks valid
2e2c009bSjjc		 */
2e2c009bSjjc		lat_stats->latency_min = min;
2e2c009bSjjc		lat_stats->latency_max = max;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	return (retval);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
2e2c009bSjjc * and memory are local to each other in the same NUMA node
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt,
2e2c009bSjjc    node_domain_map_t *node_domain, cpu_node_map_t *cpu_node,
2e2c009bSjjc    node_phys_addr_map_t *node_memory)
2e2c009bSjjc{
2e2c009bSjjc	struct srat_item	*end;
2e2c009bSjjc	int			i;
2e2c009bSjjc	struct srat_item	*item;
2e2c009bSjjc
2e2c009bSjjc	if (tp == NULL || !lgrp_plat_srat_enable)
2e2c009bSjjc		return (1);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Determine number of nodes by counting number of proximity domains in
2e2c009bSjjc	 * SRAT
2e2c009bSjjc	 */
2e2c009bSjjc	if (node_cnt) {
2e2c009bSjjc		int	nodes;
2e2c009bSjjc
2e2c009bSjjc		nodes = lgrp_plat_srat_domains(tp);
2e2c009bSjjc		if (nodes < 0) {
2e2c009bSjjc			*node_cnt = 1;
2e2c009bSjjc			return (2);
2e2c009bSjjc		}
2e2c009bSjjc		*node_cnt = nodes;
2e2c009bSjjc	}
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Walk through SRAT, examining each CPU and memory entry to determine
2e2c009bSjjc	 * which CPUs and memory belong to which node.
2e2c009bSjjc	 */
2e2c009bSjjc	item = tp->list;
2e2c009bSjjc	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2e2c009bSjjc	while (item < end) {
2e2c009bSjjc		uint32_t	apic_id;
2e2c009bSjjc		uint32_t	domain;
2e2c009bSjjc		uint64_t	end;
2e2c009bSjjc		uint64_t	length;
2e2c009bSjjc		uint64_t	start;
2e2c009bSjjc
2e2c009bSjjc		switch (item->type) {
2e2c009bSjjc		case SRAT_PROCESSOR:	/* CPU entry */
2e2c009bSjjc			if (!(item->i.p.flags & SRAT_ENABLED) ||
2e2c009bSjjc			    cpu_node == NULL)
2e2c009bSjjc				break;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Calculate domain (node) ID and fill in APIC ID to
2e2c009bSjjc			 * domain/node mapping table
2e2c009bSjjc			 */
2e2c009bSjjc			domain = item->i.p.domain1;
2e2c009bSjjc			for (i = 0; i < 3; i++) {
2e2c009bSjjc				domain += item->i.p.domain2[i] <<
2e2c009bSjjc				    ((i + 1) * 8);
2e2c009bSjjc			}
2e2c009bSjjc			apic_id = item->i.p.apic_id;
2e2c009bSjjc
2e2c009bSjjc			if (lgrp_plat_cpu_node_update(node_domain, cpu_node,
2e2c009bSjjc			    apic_id, domain) < 0)
2e2c009bSjjc				return (3);
2e2c009bSjjc			break;
2e2c009bSjjc
2e2c009bSjjc		case SRAT_MEMORY:	/* memory entry */
2e2c009bSjjc			if (!(item->i.m.flags & SRAT_ENABLED) ||
2e2c009bSjjc			    node_memory == NULL)
2e2c009bSjjc				break;
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Get domain (node) ID and fill in domain/node
2e2c009bSjjc			 * to memory mapping table
2e2c009bSjjc			 */
2e2c009bSjjc			domain = item->i.m.domain;
2e2c009bSjjc			start = item->i.m.base_addr;
2e2c009bSjjc			length = item->i.m.len;
2e2c009bSjjc			end = start + length - 1;
2e2c009bSjjc
2e2c009bSjjc			if (lgrp_plat_node_memory_update(node_domain,
2e2c009bSjjc			    node_memory, start, end, domain) < 0)
2e2c009bSjjc				return (4);
2e2c009bSjjc			break;
2e2c009bSjjc
2e2c009bSjjc		default:
2e2c009bSjjc			break;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		item = (struct srat_item *)((uintptr_t)item + item->len);
2e2c009bSjjc	}
2e2c009bSjjc	return (0);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Return number of proximity domains given in ACPI SRAT
2e2c009bSjjc */
2e2c009bSjjcstatic int
2e2c009bSjjclgrp_plat_srat_domains(struct srat *tp)
2e2c009bSjjc{
2e2c009bSjjc	int			domain_cnt;
2e2c009bSjjc	struct srat_item	*end;
2e2c009bSjjc	int			i;
2e2c009bSjjc	struct srat_item	*item;
2e2c009bSjjc	node_domain_map_t	node_domain[MAX_NODES];
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc	if (tp == NULL || !lgrp_plat_srat_enable)
2e2c009bSjjc		return (1);
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * Walk through SRAT, examining each CPU and memory entry to determine
2e2c009bSjjc	 * proximity domain ID for each.
2e2c009bSjjc	 */
2e2c009bSjjc	domain_cnt = 0;
2e2c009bSjjc	item = tp->list;
2e2c009bSjjc	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2e2c009bSjjc	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
2e2c009bSjjc	while (item < end) {
2e2c009bSjjc		uint32_t	domain;
2e2c009bSjjc		boolean_t	overflow;
2e2c009bSjjc		uint_t		start;
2e2c009bSjjc
2e2c009bSjjc		switch (item->type) {
2e2c009bSjjc		case SRAT_PROCESSOR:	/* CPU entry */
2e2c009bSjjc			if (!(item->i.p.flags & SRAT_ENABLED))
2e2c009bSjjc				break;
2e2c009bSjjc			domain = item->i.p.domain1;
2e2c009bSjjc			for (i = 0; i < 3; i++) {
2e2c009bSjjc				domain += item->i.p.domain2[i] <<
2e2c009bSjjc				    ((i + 1) * 8);
2e2c009bSjjc			}
2e2c009bSjjc			break;
2e2c009bSjjc
2e2c009bSjjc		case SRAT_MEMORY:	/* memory entry */
2e2c009bSjjc			if (!(item->i.m.flags & SRAT_ENABLED))
2e2c009bSjjc				break;
2e2c009bSjjc			domain = item->i.m.domain;
2e2c009bSjjc			break;
2e2c009bSjjc
2e2c009bSjjc		default:
2e2c009bSjjc			break;
2e2c009bSjjc		}
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Count and keep track of which proximity domain IDs seen
2e2c009bSjjc		 */
2e2c009bSjjc		start = i = domain % MAX_NODES;
2e2c009bSjjc		overflow = B_TRUE;
2e2c009bSjjc		do {
2e2c009bSjjc			/*
2e2c009bSjjc			 * Create entry for proximity domain and increment
2e2c009bSjjc			 * count when no entry exists where proximity domain
2e2c009bSjjc			 * hashed
2e2c009bSjjc			 */
2e2c009bSjjc			if (!node_domain[i].exists) {
2e2c009bSjjc				node_domain[i].exists = 1;
2e2c009bSjjc				node_domain[i].prox_domain = domain;
2e2c009bSjjc				domain_cnt++;
2e2c009bSjjc				overflow = B_FALSE;
2e2c009bSjjc				break;
2e2c009bSjjc			}
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Nothing to do when proximity domain seen already
2e2c009bSjjc			 * and its entry exists
2e2c009bSjjc			 */
2e2c009bSjjc			if (node_domain[i].prox_domain == domain) {
2e2c009bSjjc				overflow = B_FALSE;
2e2c009bSjjc				break;
2e2c009bSjjc			}
2e2c009bSjjc
2e2c009bSjjc			/*
2e2c009bSjjc			 * Entry exists where proximity domain hashed, but for
2e2c009bSjjc			 * different proximity domain so keep search for empty
2e2c009bSjjc			 * slot to put it or matching entry whichever comes
2e2c009bSjjc			 * first.
2e2c009bSjjc			 */
2e2c009bSjjc			i = (i + 1) % MAX_NODES;
2e2c009bSjjc		} while (i != start);
2e2c009bSjjc
2e2c009bSjjc		/*
2e2c009bSjjc		 * Didn't find empty or matching entry which means have more
2e2c009bSjjc		 * proximity domains than supported nodes (:-(
2e2c009bSjjc		 */
2e2c009bSjjc		ASSERT(overflow != B_TRUE);
2e2c009bSjjc		if (overflow == B_TRUE)
2e2c009bSjjc			return (-1);
2e2c009bSjjc
2e2c009bSjjc		item = (struct srat_item *)((uintptr_t)item + item->len);
2e2c009bSjjc	}
2e2c009bSjjc	return (domain_cnt);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Set lgroup latencies for 2 level lgroup topology
2e2c009bSjjc */
2e2c009bSjjcstatic void
2e2c009bSjjclgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
2e2c009bSjjc    lgrp_plat_latency_stats_t *lat_stats)
2e2c009bSjjc{
2e2c009bSjjc	int	i;
2e2c009bSjjc
2e2c009bSjjc	ASSERT(node_memory != NULL && lat_stats != NULL);
2e2c009bSjjc
2e2c009bSjjc	if (lgrp_plat_node_cnt >= 4)
2e2c009bSjjc		cmn_err(CE_NOTE,
2e2c009bSjjc		    "MPO only optimizing for local and remote\n");
2e2c009bSjjc	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2e2c009bSjjc		int	j;
2e2c009bSjjc
2e2c009bSjjc		if (!node_memory[i].exists)
2e2c009bSjjc			continue;
2e2c009bSjjc		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2e2c009bSjjc			if (!node_memory[j].exists)
2e2c009bSjjc				continue;
2e2c009bSjjc			if (i == j)
2e2c009bSjjc				lat_stats->latencies[i][j] = 2;
2e2c009bSjjc			else
2e2c009bSjjc				lat_stats->latencies[i][j] = 3;
2e2c009bSjjc		}
2e2c009bSjjc	}
2e2c009bSjjc	lat_stats->latency_min = 2;
2e2c009bSjjc	lat_stats->latency_max = 3;
2e2c009bSjjc	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
2e2c009bSjjc}
2e2c009bSjjc
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * The following Opteron specific constants, macros, types, and routines define
2e2c009bSjjc * PCI configuration space registers and how to read them to determine the NUMA
2e2c009bSjjc * configuration of *supported* Opteron processors.  They provide the same
2e2c009bSjjc * information that may be gotten from the ACPI System Resource Affinity Table
2e2c009bSjjc * (SRAT) if it exists on the machine of interest.
2e2c009bSjjc *
2e2c009bSjjc * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
2e2c009bSjjc * of interest describes all of these registers and their contents.  The main
2e2c009bSjjc * registers used by this code to determine the NUMA configuration of the
2e2c009bSjjc * machine are the node ID register for the number of NUMA nodes and the DRAM
2e2c009bSjjc * address map registers for the physical address range of each node.
2e2c009bSjjc *
2e2c009bSjjc * NOTE: The format and how to determine the NUMA configuration using PCI
2e2c009bSjjc *	 config space registers may change or may not be supported in future
2e2c009bSjjc *	 Opteron processor families.
7c478bd9Sstevel@tonic-gate */
7c478bd9Sstevel@tonic-gate
/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 *
 * The register argument is parenthesized so that an expression (e.g.
 * "a | b") is converted and masked as a whole instead of having the cast and
 * mask bind to only part of it.
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)(reg) & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

/*
 * Combine the high and low register values into one 64-bit address
 */
#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7		/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700		/* interleave select */
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate
/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU)  */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 *
 * The register argument is parenthesized so that an expression argument is
 * masked as a whole ("&" binds tighter than "|" and "^").
 */
#define	OPT_NODE_CNT(reg) \
	(((reg) & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
7c478bd9Sstevel@tonic-gate
/*
 * Macro to setup PCI Extended Configuration Space (ECS) address to give to
 * "in/out" instructions
 *
 * The address is built from PCI_CONE, the low 8 bits of the bus, the low 5
 * bits of the device, the low 3 bits of the function, bits 7-2 of the
 * register offset, and bits 11-8 of the register offset placed at bits 27-24.
 * Every argument is parenthesized so that expression arguments are masked as
 * a whole.
 *
 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
 *	 other uses should just do MMIO to access PCI ECS.
 *	 Must enable special bit in Northbridge Configuration Register on
 *	 Greyhound for extended CF8 space access to be able to access PCI ECS
 *	 using "in/out" instructions and restore special bit after done
 *	 accessing PCI ECS.
 */
#define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
	(PCI_CONE | (((bus) & 0xff) << 16) | (((device) & 0x1f) << 11)  | \
	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
	    ((((reg) >> 8) & 0xf) << 24))
7c478bd9Sstevel@tonic-gate
/*
 * A PCI configuration space register is named by a bus, device, function,
 * and register offset.  The constants below are the ones needed to reach the
 * Opteron K8 configuration registers that describe its node topology.
 */

/*
 * Bus number
 */
#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Device number; node 0 is found here
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */

/*
 * Function numbers
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * Register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x000	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x040	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x060	/* Node ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate
/*
 * Raw contents of the Opteron DRAM address map registers for one node, which
 * together give the base and limit of the physical memory in that node
 */
typedef	struct opt_dram_addr_map	opt_dram_addr_map_t;

struct opt_dram_addr_map {
	uint32_t	base_hi;	/* high DRAM base register */
	uint32_t	base_lo;	/* low DRAM base register */
	uint32_t	limit_hi;	/* high DRAM limit register */
	uint32_t	limit_lo;	/* low DRAM limit register */
};
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate/*
f78a91cdSjjc * Supported AMD processor families
f78a91cdSjjc */
f78a91cdSjjc#define	AMD_FAMILY_HAMMER	15
f78a91cdSjjc#define	AMD_FAMILY_GREYHOUND	16
7c478bd9Sstevel@tonic-gate
f78a91cdSjjc/*
2e2c009bSjjc * Whether to have is_opteron() return 1 even when processor isn't supported
f78a91cdSjjc */
f78a91cdSjjcuint_t	is_opteron_override = 0;
f78a91cdSjjc
f78a91cdSjjc/*
f78a91cdSjjc * AMD processor family for current CPU
f78a91cdSjjc */
7c478bd9Sstevel@tonic-gateuint_t	opt_family = 0;
f78a91cdSjjc
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate/*
f78a91cdSjjc * Determine whether we're running on a supported AMD Opteron since reading
f78a91cdSjjc * node count and DRAM address map registers may have different format or
2e2c009bSjjc * may not be supported across processor families
7c478bd9Sstevel@tonic-gate */
2e2c009bSjjcstatic int
7c478bd9Sstevel@tonic-gateis_opteron(void)
7c478bd9Sstevel@tonic-gate{
f78a91cdSjjc
7c478bd9Sstevel@tonic-gate	if (x86_vendor != X86_VENDOR_AMD)
7c478bd9Sstevel@tonic-gate		return (0);
7c478bd9Sstevel@tonic-gate
f78a91cdSjjc	opt_family = cpuid_getfamily(CPU);
f78a91cdSjjc	if (opt_family == AMD_FAMILY_HAMMER ||
f78a91cdSjjc	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
7c478bd9Sstevel@tonic-gate		return (1);
7c478bd9Sstevel@tonic-gate	else
7c478bd9Sstevel@tonic-gate		return (0);
7c478bd9Sstevel@tonic-gate}
7c478bd9Sstevel@tonic-gate
2e2c009bSjjc
2e2c009bSjjc/*
2e2c009bSjjc * Determine NUMA configuration for Opteron from registers that live in PCI
2e2c009bSjjc * configuration space
2e2c009bSjjc */
2e2c009bSjjcstatic void
2e2c009bSjjcopt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
2e2c009bSjjc    node_phys_addr_map_t *node_memory)
7c478bd9Sstevel@tonic-gate{
7c478bd9Sstevel@tonic-gate	uint_t				bus;
7c478bd9Sstevel@tonic-gate	uint_t				dev;
2e2c009bSjjc	struct opt_dram_addr_map	dram_map[MAX_NODES];
7c478bd9Sstevel@tonic-gate	uint_t				node;
2e2c009bSjjc	uint_t				node_info[MAX_NODES];
f78a91cdSjjc	uint_t				off_hi;
f78a91cdSjjc	uint_t				off_lo;
f78a91cdSjjc	uint64_t			nb_cfg_reg;
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate	/*
7c478bd9Sstevel@tonic-gate	 * Read configuration registers from PCI configuration space to
7c478bd9Sstevel@tonic-gate	 * determine node information, which memory is in each node, etc.
7c478bd9Sstevel@tonic-gate	 *
7c478bd9Sstevel@tonic-gate	 * Write to PCI configuration space address register to specify
7c478bd9Sstevel@tonic-gate	 * which configuration register to read and read/write PCI
7c478bd9Sstevel@tonic-gate	 * configuration space data register to get/set contents
7c478bd9Sstevel@tonic-gate	 */
7c478bd9Sstevel@tonic-gate	bus = OPT_PCS_BUS_CONFIG;
7c478bd9Sstevel@tonic-gate	dev = OPT_PCS_DEV_NODE0;
f78a91cdSjjc	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
f78a91cdSjjc	off_lo = OPT_PCS_OFF_DRAMBASE_LO;
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate	/*
7c478bd9Sstevel@tonic-gate	 * Read node ID register for node 0 to get node count
7c478bd9Sstevel@tonic-gate	 */
2e2c009bSjjc	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
ef50d8c0Sesaxe	    OPT_PCS_OFF_NODEID);
2e2c009bSjjc	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
2e2c009bSjjc
2e2c009bSjjc	/*
2e2c009bSjjc	 * If number of nodes is more than maximum supported, then set node
2e2c009bSjjc	 * count to 1 and treat system as UMA instead of NUMA.
2e2c009bSjjc	 */
2e2c009bSjjc	if (*node_cnt > MAX_NODES) {
2e2c009bSjjc		*node_cnt = 1;
2e2c009bSjjc		return;
2e2c009bSjjc	}
7c478bd9Sstevel@tonic-gate
f78a91cdSjjc	/*
f78a91cdSjjc	 * For Greyhound, PCI Extended Configuration Space must be enabled to
f78a91cdSjjc	 * read high DRAM address map base and limit registers
f78a91cdSjjc	 */
f78a91cdSjjc	if (opt_family == AMD_FAMILY_GREYHOUND) {
f78a91cdSjjc		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
f78a91cdSjjc		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
f78a91cdSjjc			wrmsr(MSR_AMD_NB_CFG,
f78a91cdSjjc			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
f78a91cdSjjc	}
f78a91cdSjjc
2e2c009bSjjc	for (node = 0; node < *node_cnt; node++) {
f78a91cdSjjc		uint32_t	base_hi;
f78a91cdSjjc		uint32_t	base_lo;
f78a91cdSjjc		uint32_t	limit_hi;
f78a91cdSjjc		uint32_t	limit_lo;
f78a91cdSjjc
7c478bd9Sstevel@tonic-gate		/*
7c478bd9Sstevel@tonic-gate		 * Read node ID register (except for node 0 which we just read)
7c478bd9Sstevel@tonic-gate		 */
7c478bd9Sstevel@tonic-gate		if (node > 0) {
2e2c009bSjjc			node_info[node] = pci_getl_func(bus, dev,
ef50d8c0Sesaxe			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
7c478bd9Sstevel@tonic-gate		}
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate		/*
7c478bd9Sstevel@tonic-gate		 * Read DRAM base and limit registers which specify
7c478bd9Sstevel@tonic-gate		 * physical memory range of each node
7c478bd9Sstevel@tonic-gate		 */
f78a91cdSjjc		if (opt_family != AMD_FAMILY_GREYHOUND)
f78a91cdSjjc			base_hi = 0;
f78a91cdSjjc		else {
f78a91cdSjjc			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
f78a91cdSjjc			    OPT_PCS_FUNC_ADDRMAP, off_hi));
2e2c009bSjjc			base_hi = dram_map[node].base_hi =
f78a91cdSjjc			    inl(PCI_CONFDATA);
f78a91cdSjjc		}
2e2c009bSjjc		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
f78a91cdSjjc		    OPT_PCS_FUNC_ADDRMAP, off_lo);
f78a91cdSjjc
2e2c009bSjjc		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
2e2c009bSjjc		    mem_intrlv)
2e2c009bSjjc			*mem_intrlv = *mem_intrlv + 1;
7c478bd9Sstevel@tonic-gate
f78a91cdSjjc		off_hi += 4;	/* high limit register offset */
f78a91cdSjjc		if (opt_family != AMD_FAMILY_GREYHOUND)
f78a91cdSjjc			limit_hi = 0;
f78a91cdSjjc		else {
f78a91cdSjjc			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
f78a91cdSjjc			    OPT_PCS_FUNC_ADDRMAP, off_hi));
2e2c009bSjjc			limit_hi = dram_map[node].limit_hi =
f78a91cdSjjc			    inl(PCI_CONFDATA);
f78a91cdSjjc		}
f78a91cdSjjc
f78a91cdSjjc		off_lo += 4;	/* low limit register offset */
2e2c009bSjjc		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
f78a91cdSjjc		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate		/*
f78a91cdSjjc		 * Increment device number to next node and register offsets
f78a91cdSjjc		 * for DRAM base register of next node
7c478bd9Sstevel@tonic-gate		 */
f78a91cdSjjc		off_hi += 4;
f78a91cdSjjc		off_lo += 4;
7c478bd9Sstevel@tonic-gate		dev++;
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate		/*
a940d195Sjjc		 * Both read and write enable bits must be enabled in DRAM
a940d195Sjjc		 * address map base register for physical memory to exist in
a940d195Sjjc		 * node
a940d195Sjjc		 */
f78a91cdSjjc		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
f78a91cdSjjc		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
a940d195Sjjc			/*
a940d195Sjjc			 * Mark node memory as non-existent and set start and
2e2c009bSjjc			 * end addresses to be same in node_memory[]
a940d195Sjjc			 */
2e2c009bSjjc			node_memory[node].exists = 0;
2e2c009bSjjc			node_memory[node].start = node_memory[node].end =
2e2c009bSjjc			    (pfn_t)-1;
a940d195Sjjc			continue;
a940d195Sjjc		}
a940d195Sjjc
a940d195Sjjc		/*
a940d195Sjjc		 * Mark node memory as existing and remember physical address
a940d195Sjjc		 * range of each node for use later
7c478bd9Sstevel@tonic-gate		 */
2e2c009bSjjc		node_memory[node].exists = 1;
f78a91cdSjjc
2e2c009bSjjc		node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
f78a91cdSjjc
2e2c009bSjjc		node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
f78a91cdSjjc		    OPT_DRAMADDR_LO_MASK_OFF);
f78a91cdSjjc	}
f78a91cdSjjc
f78a91cdSjjc	/*
f78a91cdSjjc	 * Restore PCI Extended Configuration Space enable bit
f78a91cdSjjc	 */
f78a91cdSjjc	if (opt_family == AMD_FAMILY_GREYHOUND) {
f78a91cdSjjc		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
f78a91cdSjjc			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
7c478bd9Sstevel@tonic-gate	}
7c478bd9Sstevel@tonic-gate}
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate
7c478bd9Sstevel@tonic-gate/*
2e2c009bSjjc * Return average amount of time to read vendor ID register on Northbridge
2e2c009bSjjc * N times on specified destination node from current CPU
7c478bd9Sstevel@tonic-gate */
7c478bd9Sstevel@tonic-gatestatic hrtime_t
2e2c009bSjjcopt_probe_vendor(int dest_node, int nreads)
7c478bd9Sstevel@tonic-gate{
2e2c009bSjjc	int		cnt;
7c478bd9Sstevel@tonic-gate	uint_t		dev;
7c478bd9Sstevel@tonic-gate	/* LINTED: set but not used in function */
7c478bd9Sstevel@tonic-gate	volatile uint_t	dev_vendor;
7c478bd9Sstevel@tonic-gate	hrtime_t	elapsed;
7c478bd9Sstevel@tonic-gate	hrtime_t	end;
7c478bd9Sstevel@tonic-gate	int		ipl;
7c478bd9Sstevel@tonic-gate	hrtime_t	start;
7c478bd9Sstevel@tonic-gate
2e2c009bSjjc	dev = OPT_PCS_DEV_NODE0 + dest_node;
7c478bd9Sstevel@tonic-gate	kpreempt_disable();
7c478bd9Sstevel@tonic-gate	ipl = spl8();
2e2c009bSjjc	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
7c478bd9Sstevel@tonic-gate	    OPT_PCS_OFF_VENDOR));
7c478bd9Sstevel@tonic-gate	start = gethrtime();
2e2c009bSjjc	for (cnt = 0; cnt < nreads; cnt++)
7c478bd9Sstevel@tonic-gate		dev_vendor = inl(PCI_CONFDATA);
7c478bd9Sstevel@tonic-gate	end = gethrtime();
2e2c009bSjjc	elapsed = (end - start) / nreads;
7c478bd9Sstevel@tonic-gate	splx(ipl);
7c478bd9Sstevel@tonic-gate	kpreempt_enable();
2e2c009bSjjc	return (elapsed);
7c478bd9Sstevel@tonic-gate}