1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2022 Oxide Computer Company
28  * Copyright 2022 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or other hardware features that the kernel itself relies on.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports simultaneous
54  *    multi-threading (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to provide, in a
62  * programmatic fashion, information about the CPU that previously had to be
63  * guessed at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To read a given leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
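 *
 * As a purely illustrative sketch (not how this file is structured), discovery
 * of the maximum basic leaf could look like the following C fragment; the
 * cpuid_raw() helper is hypothetical and simply wraps the instruction using
 * GCC-style inline assembly:
 *
 *	#include <stdint.h>
 *
 *	static void
 *	cpuid_raw(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
 *	{
 *		// Only %eax (leaf) and %ecx (sub-leaf) act as inputs; the
 *		// other registers are outputs of the instruction.
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (regs[0]), "=b" (regs[1]),
 *		      "=c" (regs[2]), "=d" (regs[3])
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	static uint32_t
 *	max_basic_leaf(void)
 *	{
 *		uint32_t regs[4];
 *
 *		cpuid_raw(0, 0, regs);	// leaf 0: %eax is the max basic leaf
 *		return (regs[0]);
 *	}
 *
 * The same query against leaf 0x80000000 reports the maximum valid extended
 * leaf in %eax.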
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that way the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * placing a 12 character string across %ebx, %edx, and %ecx (in that byte
141  * order). On AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
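 *
 * For illustration, and reusing the hypothetical cpuid_raw() helper sketched
 * earlier, the vendor string could be assembled as follows; note that the
 * twelve bytes come out of %ebx, %edx, and %ecx in that order:
 *
 *	#include <string.h>
 *
 *	static void
 *	cpuid_vendor(char buf[13])
 *	{
 *		uint32_t regs[4];
 *
 *		cpuid_raw(0, 0, regs);
 *		// e.g. "Genu" / "ineI" / "ntel" or "Auth" / "enti" / "cAMD"
 *		(void) memcpy(buf + 0, &regs[1], 4);	// %ebx
 *		(void) memcpy(buf + 4, &regs[3], 4);	// %edx
 *		(void) memcpy(buf + 8, &regs[2], 4);	// %ecx
 *		buf[12] = '\0';
 *	}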
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within each
161  * family, the model number is used to help identify specific processors.
162  *
163  * The stepping is used to refer to a revision of a specific microprocessor. The
164  * term comes from equipment used to produce masks that are used to create
165  * integrated circuits.
166  *
167  * The information is present in leaf 1, %eax. In technical documentation you
168  * will see the terms extended model and extended family. The original family,
169  * model, and stepping fields were each 4 bits wide. If the base family field is
170  * 0xf, then one must consult the extended family and extended model fields,
171  * which occupy previously reserved bits; the extended family is added to the
172  * base family, while the extended model bits sit above the base model bits.
173  *
174  * When we process this information, we store the full family, model, and
175  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176  * cpi_step, respectively. Whenever you are performing comparisons with the
177  * family, model, and stepping, you should use these members and not the raw
178  * values from cpuid. If you must use the raw values from cpuid directly, you
179  * must make sure that you add the extended model and family to the base model
180  * and family.
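 *
 * The following hedged sketch shows how the full values could be computed from
 * leaf 1 %eax; the field positions follow the vendor documentation, but the
 * helper itself is illustrative and is not the code used later in this file:
 *
 *	static void
 *	cpuid_fms(uint32_t eax, uint32_t *family, uint32_t *model,
 *	    uint32_t *step)
 *	{
 *		uint32_t base_family = (eax >> 8) & 0xf;
 *		uint32_t base_model = (eax >> 4) & 0xf;
 *		uint32_t ext_family = (eax >> 20) & 0xff;
 *		uint32_t ext_model = (eax >> 16) & 0xf;
 *
 *		*step = eax & 0xf;
 *		*family = base_family;
 *		*model = base_model;
 *		if (base_family == 0xf)
 *			*family += ext_family;
 *		// Intel also honors the extended model for family 0x6; AMD
 *		// only does so for family 0xf.
 *		if (base_family == 0xf || base_family == 0x6)
 *			*model += ext_model << 4;
 *	}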
181  *
182  * In general, we do not use information about the family, model, and stepping
183  * to determine whether or not a feature is present; that is generally driven by
184  * specific leaves. However, when something we care about on the processor is
185  * not considered 'architectural', meaning that it is specific to a set of
186  * processors and not promised in the architecture model to be consistent from
187  * generation to generation, then we will fall back on this information. The
188  * most common cases where this comes up are when we have to work around errata
189  * in the processor, are dealing with processor-specific features such as CPU
190  * performance counters, or we want to provide additional information for things
191  * such as fault management.
192  *
193  * While processors also do have a brand string, which is the name that people
194  * are familiar with when buying the processor, they are not meant for
195  * programmatic consumption. That is what the family, model, and stepping are
196  * for.
197  *
198  * ------------
199  * CPUID Passes
200  * ------------
201  *
202  * As part of performing feature detection, we break this into several different
203  * passes. There used to be a pass 0 that was done from assembly in locore.s to
204  * support processors that have a missing or broken cpuid instruction (notably
205  * certain Cyrix processors) but those were all 32-bit processors which are no
206  * longer supported. Passes are no longer numbered explicitly to make it easier
207  * to break them up or move them around as needed; however, they still have a
208  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
209  * x86_archext.h. The external interface to execute a cpuid pass or determine
210  * whether a pass has been completed consists of cpuid_execpass() and
211  * cpuid_checkpass() respectively.  The passes now, in that execution order,
212  * are as follows:
213  *
214  *	PRELUDE		This pass does not have any dependencies on system
215  *			setup; in particular, unlike all subsequent passes it is
216  *			guaranteed not to require PCI config space access.  It
217  *			sets the flag indicating that the processor we are
218  *			running on supports the cpuid instruction, which all
219  *			64-bit processors do.  This would also be the place to
220  *			add any other basic state that is required later on and
221  *			can be learned without dependencies.
222  *
223  *	IDENT		Determine which vendor manufactured the CPU, the family,
224  *			model, and stepping information, and compute basic
225  *			identifying tags from those values.  This is done first
226  *			so that machine-dependent code can control the features
227  *			the cpuid instruction will report during subsequent
228  *			passes if needed, and so that any intervening
229  *			machine-dependent code that needs basic identity will
230  *			have it available.
231  *
232  *	BASIC		This is the primary pass and is responsible for doing a
233  *			large number of different things:
234  *
235  *			1. Gathering a large number of feature flags to
236  *			determine which features the CPU supports and which
237  *			indicate that we need to do additional work in the OS
238  *			to enable them. Features detected this way are added to
239  *			the x86_featureset, which can be queried to
240  *			determine what we should do. This includes processing
241  *			all of the basic and extended CPU features that we care
242  *			about.
243  *
244  *			2. Determining the CPU's topology. This includes
245  *			information about how many cores and threads are present
246  *			in the package. It also is responsible for figuring out
247  *			which logical CPUs are potentially part of the same core
248  *			and what other resources they might share. For more
249  *			information see the 'Topology' section.
250  *
251  *			3. Determining the set of CPU security-specific features
252  *			that we need to worry about and determine the
253  *			appropriate set of workarounds.
254  *
255  *			This pass on the boot CPU occurs before KMDB is started.
256  *
257  *	EXTENDED	The second pass is done after startup(). Here, we check
258  *			other miscellaneous features. Most of this is gathering
259  *			additional basic and extended features that we'll use in
260  *			later passes or for debugging support.
261  *
262  *	DYNAMIC		The third pass occurs after the kernel memory allocator
263  *			has been fully initialized. This gathers information
264  *			where we might need dynamic memory available for our
265  *			uses. This includes several varying width leaves that
266  *			have cache information and the processor's brand string.
267  *
268  *	RESOLVE		The fourth and final normal pass is performed after the
269  *			kernel has brought most everything online. This is
270  *			invoked from post_startup(). In this pass, we go through
271  *			the set of features that we have enabled and turn that
272  *			into the hardware auxiliary vector features that
273  *			userland receives. This is used by userland, primarily
274  *			by the run-time link-editor (RTLD), though userland
275  *			software could also refer to it directly.
276  *
277  * The function that performs a pass is currently assumed to be infallible, and
278  * all existing implementations are.  This simplifies callers by allowing
279  * cpuid_execpass() to return void. Similarly, implementers do not need to check
280  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
281  * Both of these assumptions can be relaxed if needed by future developments.
282  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
283  * error to attempt to execute a pass before all previous passes have been
284  * completed on the specified CPU, or to request cpuid information before the
285  * pass that captures it has been executed.  These conditions can be tested
286  * using cpuid_checkpass().
287  *
288  * The Microcode Pass
289  *
290  * After a microcode update, we do a selective rescan of the cpuid leaves to
291  * determine what features have changed. Microcode updates can provide more
292  * details about security related features to deal with issues like Spectre and
293  * L1TF. On occasion, vendors have violated their contract and removed bits.
294  * However, we don't try to detect that because that puts us in a situation that
295  * we really can't deal with. As such, the only thing we rescan are security
296  * related features today. See cpuid_pass_ucode().  This pass may be run in a
297  * different sequence on APs and therefore is not part of the sequential order;
298  * it is invoked directly instead of by cpuid_execpass() and its completion
299  * status cannot be checked by cpuid_checkpass().  This could be integrated with
300  * a more complex dependency mechanism if warranted by future developments.
301  *
302  * All of the passes are run on all CPUs. However, for the most part we only
303  * care about what the boot CPU says about this information and use the other
304  * CPUs as a rough guide to sanity check that we have the same feature set.
305  *
306  * We do not support running multiple logical CPUs with different, let alone
307  * disjoint, feature sets.
308  *
309  * ------------------
310  * Processor Topology
311  * ------------------
312  *
313  * One of the important things that we need to do is to understand the topology
314  * of the underlying processor. When we say topology in this case, we're trying
315  * to understand the relationship between the logical CPUs that the operating
316  * system sees and the underlying physical layout. Different logical CPUs may
317  * share different resources which can have important consequences for the
318  * performance of the system. For example, they may share caches, execution
319  * units, and more.
320  *
321  * The topology of the processor changes from generation to generation and
322  * vendor to vendor.  Along with that, different vendors use different
323  * terminology, and the operating system itself uses occasionally overlapping
324  * terminology. It's important to understand what this topology looks like so
325  * one can understand the different things that we try to calculate and
326  * determine.
327  *
328  * To get started, let's talk about a little bit of terminology that we've used
329  * so far, is used throughout this file, and is fairly generic across multiple
330  * vendors:
331  *
332  * CPU
333  *	A central processing unit (CPU) refers to a logical and/or virtual
334  *	entity that the operating system can execute instructions on. The
335  *	underlying resources for this CPU may be shared between multiple
336  *	entities; however, to the operating system it is a discrete unit.
337  *
338  * PROCESSOR and PACKAGE
339  *
340  *	Generally, when we use the term 'processor' on its own, we are referring
341  *	to the physical entity that one buys and plugs into a board. However,
342  *	because processor has been overloaded and one might see it used to mean
343  *	multiple different levels, we will instead use the term 'package' for
344  *	the rest of this file. The term package comes from the electrical
345  *	engineering side and refers to the physical entity that encloses the
346  *	electronics inside. Strictly speaking the package can contain more than
347  *	just the CPU, for example, on many processors it may also have what's
348  *	called an 'integrated graphics processing unit (GPU)'. Because the
349  *	package can encapsulate multiple units, it is the largest physical unit
350  *	that we refer to.
351  *
352  * SOCKET
353  *
354  *	A socket refers to a unit on a system board (generally the motherboard)
355  *	that can receive a package. A single package, or processor, is plugged
356  *	into a single socket. A system may have multiple sockets. Often times,
357  *	the term socket is used interchangeably with package and refers to the
358  *	electrical component that has been plugged in, and not the receptacle itself.
359  *
360  * CORE
361  *
362  *	A core refers to the physical instantiation of a CPU, generally, with a
363  *	full set of hardware resources available to it. A package may contain
364  *	multiple cores inside of it or it may just have a single one. A
365  *	processor with more than one core is often referred to as 'multi-core'.
366  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
367  *	that has 'multi-core' processors.
368  *
369  *	A core may expose a single logical CPU to the operating system, or it
370  *	may expose multiple CPUs, which we call threads, defined below.
371  *
372  *	Some resources may still be shared by cores in the same package. For
373  *	example, many processors will share the level 3 cache between cores.
374  *	Some AMD generations share hardware resources between cores. For more
375  *	information on that see the section 'AMD Topology'.
376  *
377  * THREAD and STRAND
378  *
379  *	In this file, a thread generally refers to a hardware resource and not
380  *	the operating system's logical abstraction. A thread is always exposed
381  *	as an independent logical CPU to the operating system. A thread belongs
382  *	to a specific core. A core may have more than one thread. When that is
383  *	the case, the threads that are part of the same core are often referred
384  *	to as 'siblings'.
385  *
386  *	When multiple threads exist, this is generally referred to as
387  *	simultaneous multi-threading (SMT). When Intel introduced this in their
388  *	processors they called it hyper-threading (HT). When multiple threads
389  *	are active in a core, they split the resources of the core. For example,
390  *	two threads may share the same set of hardware execution units.
391  *
392  *	The operating system often uses the term 'strand' to refer to a thread.
393  *	This helps disambiguate it from the software concept.
394  *
395  * CHIP
396  *
397  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
398  *	base meaning, it is used to refer to a single integrated circuit, which
399  *	may or may not be the only thing in the package. In illumos, when you
400  *	see the term 'chip' it is almost always referring to the same thing as
401  *	the 'package'. However, many vendors may use chip to refer to one of
402  *	many integrated circuits that have been placed in the package. As an
403  *	example, see the subsequent definition.
404  *
405  *	To try and keep things consistent, we will only use chip when referring
406  *	to the entire integrated circuit package, with the exception of the
407  *	definition of multi-chip module (because it is in the name) and use the
408  *	term 'die' when we want the more general, potential sub-component
409  *	definition.
410  *
411  * DIE
412  *
413  *	A die refers to an integrated circuit. Inside of the package there may
414  *	be a single die or multiple dies. This is sometimes called a 'chip' in
415  *	vendor's parlance, but in this file, we use the term die to refer to a
416  *	subcomponent.
417  *
418  * MULTI-CHIP MODULE
419  *
420  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
421  *	are connected together in the same package. When a multi-chip design is
422  *	used, generally each chip is manufactured independently and then joined
423  *	together in the package. For example, on AMD's Zen microarchitecture
424  *	(family 0x17), the package contains several dies (the second meaning of
425  *	chip from above) that are connected together.
426  *
427  * CACHE
428  *
429  *	A cache is a part of the processor that maintains copies of recently
430  *	accessed memory. Caches are split into levels and then into types.
431  *	Commonly there are one to three levels, called level one, two, and
432  *	three. The lower the level, the smaller it is, the closer it is to the
433  *	execution units of the CPU, and the faster it is to access. The layout
434  *	and design of the cache come in many different flavors, consult other
435  *	resources for a discussion of those.
436  *
437  *	Caches are generally split into two types, the instruction and data
438  *	cache. The caches contain what their names suggest, the instruction
439  *	cache has executable program text, while the data cache has all other
440  *	memory that the processor accesses. As of this writing, data is kept
441  *	coherent between all of the caches on x86, so if one modifies program
442  *	text before it is executed, that will be in the data cache, and the
443  *	instruction cache will be synchronized with that change when the
444  *	processor actually executes those instructions. This coherency also
445  *	covers the fact that data could show up in multiple caches.
446  *
447  *	Generally, the lowest level caches are specific to a core. However, the
448  *	last level cache is shared between some number of cores. The number of
449  *	CPUs sharing this last level cache is important. This has implications
450  *	for the choices that the scheduler makes, as accessing memory that might
451  *	be in a remote cache after thread migration can be quite expensive.
452  *
453  *	Sometimes, the word cache is abbreviated with a '$', because in US
454  *	English the word cache is pronounced the same as cash. So L1D$ refers to
455  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
456  *	in the rest of this theory statement for clarity.
457  *
458  * MEMORY CONTROLLER
459  *
460  *	The memory controller is a component that provides access to DRAM. Each
461  *	memory controller can access a set number of DRAM channels. Each channel
462  *	can have a number of DIMMs (sticks of memory) associated with it. A
463  *	given package may have more than one memory controller. The association
464  *	of the memory controller to a group of cores is important as it is
465  *	cheaper to access memory on the controller that you are associated with.
466  *
467  * NUMA
468  *
469  *	NUMA or non-uniform memory access, describes a way that systems are
470  *	built. On x86, any processor core can address all of the memory in the
471  *	system. However, when using multiple sockets or possibly within a
472  *	multi-chip module, some of that memory is physically closer and some of
473  *	it is further. Memory that is further away is more expensive to access.
474  *	Consider the following image of multiple sockets with memory:
475  *
476  *	+--------+                                                +--------+
477  *	| DIMM A |         +----------+      +----------+         | DIMM D |
478  *	+--------+-+       |          |      |          |       +-+------+-+
479  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
480  *	  +--------+-+     |          |      |          |     +-+------+-+
481  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
482  *	    +--------+                                        +--------+
483  *
484  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
485  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
486  *	access DIMMs A-C and more expensive to access D-F as it has to go
487  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
488  *	D-F are cheaper than A-C. While the socket form is the most common, when
489  *	using multi-chip modules, this can also sometimes occur. For another
490  *	example of this that's more involved, see the AMD topology section.
491  *
492  *
493  * Intel Topology
494  * --------------
495  *
496  * Most Intel processors since Nehalem (as of this writing the current generation
497  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
498  * the package is a single monolithic die. MCMs currently aren't used. Most
499  * parts have three levels of caches, with the L3 cache being shared between
500  * all of the cores on the package. The L1/L2 cache is generally specific to
501  * an individual core. The following image shows at a simplified level what
502  * this looks like. The memory controller is commonly part of something called
503  * the 'Uncore', which used to be separate physical chips that were not a part of
504  * the package, but are now part of the same chip.
505  *
506  *  +-----------------------------------------------------------------------+
507  *  | Package                                                               |
508  *  |  +-------------------+  +-------------------+  +-------------------+  |
509  *  |  | Core              |  | Core              |  | Core              |  |
510  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
511  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
512  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
513  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
514  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
515  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
516  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
517  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
518  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
519  *  |  +-------------------+  +-------------------+  +-------------------+  |
520  *  | +-------------------------------------------------------------------+ |
521  *  | |                         Shared L3 Cache                           | |
522  *  | +-------------------------------------------------------------------+ |
523  *  | +-------------------------------------------------------------------+ |
524  *  | |                        Memory Controller                          | |
525  *  | +-------------------------------------------------------------------+ |
526  *  +-----------------------------------------------------------------------+
527  *
528  * A side effect of this current architecture is that what we care about from a
529  * scheduling and topology perspective, is simplified. In general we care about
530  * understanding which logical CPUs are part of the same core and socket.
531  *
532  * To determine the relationship between threads and cores, Intel initially used
533  * the identifier in the advanced programmable interrupt controller (APIC). They
534  * also added cpuid leaf 4 to give additional information about the number of
535  * threads and CPUs in the processor. With the addition of x2apic (which
536  * increased the number of addressable logical CPUs from 8 bits to 32 bits), an
537  * additional cpuid topology leaf 0xB was added.
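 *
 * As a hedged sketch of how leaf 0xB is walked (field positions per the Intel
 * SDM; the function is illustrative and reuses the hypothetical cpuid_raw()
 * helper from the earlier sketches):
 *
 *	// %ecx[15:8] of each sub-leaf is the level type (0 invalid, 1 SMT,
 *	// 2 core); %eax[4:0] is how many low APIC ID bits to shift away to
 *	// reach the next level; %edx is the logical CPU's x2APIC ID.
 *	static void
 *	topo_leaf_b(uint32_t *smt_shift, uint32_t *core_shift)
 *	{
 *		uint32_t regs[4], subleaf;
 *
 *		*smt_shift = *core_shift = 0;
 *		for (subleaf = 0; ; subleaf++) {
 *			cpuid_raw(0xb, subleaf, regs);
 *			uint32_t type = (regs[2] >> 8) & 0xff;
 *
 *			if (type == 0)
 *				break;
 *			if (type == 1)
 *				*smt_shift = regs[0] & 0x1f;
 *			else if (type == 2)
 *				*core_shift = regs[0] & 0x1f;
 *		}
 *	}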
538  *
539  * AMD Topology
540  * ------------
541  *
542  * When discussing AMD topology, we want to break this into three distinct
543  * generations of topology. There's the basic topology that has been used in
544  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
545  * with family 0x15 (Bulldozer), and there's the topology that was introduced
546  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
547  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
548  * additional terminology that's worth talking about.
549  *
550  * Until the introduction of family 0x17 (Zen), AMD did not implement something
551  * that they considered SMT. Whether or not the AMD processors have SMT
552  * influences many things including scheduling and reliability, availability,
553  * and serviceability (RAS) features.
554  *
555  * NODE
556  *
557  *	AMD uses the term node to refer to a die that contains a number of cores
558  *	and I/O resources. Depending on the processor family and model, more
559  *	than one node can be present in the package. When there is more than one
560  *	node this indicates a multi-chip module. Usually each node has its own
561  *	access to memory and I/O devices. This is important and generally
562  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
563  *	result, we track this relationship in the operating system.
564  *
565  *	In processors with an L3 cache, the L3 cache is generally shared across
566  *	the entire node, though the way this is carved up varies from generation
567  *	to generation.
568  *
569  * BULLDOZER
570  *
571  *	Starting with the Bulldozer family (0x15) and continuing until the
572  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
573  *	compute unit. In a compute unit, two traditional cores share a number of
574  *	hardware resources. Critically, they share the FPU, L1 instruction
575  *	cache, and the L2 cache. Several compute units were then combined inside
576  *	of a single node.  Because the integer execution units, L1 data cache,
577  *	and some other resources were not shared between the cores, AMD never
578  *	considered this to be SMT.
579  *
580  * ZEN
581  *
582  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
583  *	is called Zeppelin. These modules are similar to the idea of nodes used
584  *	previously. Each of these nodes has two DRAM channels which all of the
585  *	cores in the node can access uniformly. These nodes are linked together
586  *	in the package, creating a NUMA environment.
587  *
588  *	The Zeppelin die itself contains two different 'core complexes'. Each
589  *	core complex consists of four cores which each have two threads, for a
590  *	total of 8 logical CPUs per complex. Unlike other generations,
591  *	where all the logical CPUs in a given node share the L3 cache, here each
592  *	core complex has its own shared L3 cache.
593  *
594  *	A further thing that we need to consider is that in some configurations,
595  *	particularly with the Threadripper line of processors, not every die
596  *	actually has its memory controllers wired up to actual memory channels.
597  *	This means that some cores have memory attached to them and others
598  *	don't.
599  *
600  *	To put Zen in perspective, consider the following images:
601  *
602  *      +--------------------------------------------------------+
603  *      | Core Complex                                           |
604  *      | +-------------------+    +-------------------+  +---+  |
605  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
606  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
607  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
608  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
609  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
610  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
611  *      | +-------------------+    +-------------------+  | C |  |
612  *      | +-------------------+    +-------------------+  | a |  |
613  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
614  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
615  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
616  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
617  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
618  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
619  *      | +-------------------+    +-------------------+  +---+  |
620  *      |                                                        |
621  *	+--------------------------------------------------------+
622  *
623  *  This first image represents a single Zen core complex that consists of four
624  *  cores.
625  *
626  *
627  *	+--------------------------------------------------------+
628  *	| Zeppelin Die                                           |
629  *	|  +--------------------------------------------------+  |
630  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
631  *	|  +--------------------------------------------------+  |
632  *      |                           HH                           |
633  *	|          +-----------+    HH    +-----------+          |
634  *	|          |           |    HH    |           |          |
635  *	|          |    Core   |==========|    Core   |          |
636  *	|          |  Complex  |==========|  Complex  |          |
637  *	|          |           |    HH    |           |          |
638  *	|          +-----------+    HH    +-----------+          |
639  *      |                           HH                           |
640  *	|  +--------------------------------------------------+  |
641  *	|  |                Memory Controller                 |  |
642  *	|  +--------------------------------------------------+  |
643  *      |                                                        |
644  *	+--------------------------------------------------------+
645  *
646  *  This image represents a single Zeppelin Die. Note how both cores are
647  *  connected to the same memory controller and I/O units. While each core
648  *  complex has its own L3 cache as seen in the first image, they both have
649  *  uniform access to memory.
650  *
651  *
652  *                      PP                     PP
653  *                      PP                     PP
654  *           +----------PP---------------------PP---------+
655  *           |          PP                     PP         |
656  *           |    +-----------+          +-----------+    |
657  *           |    |           |          |           |    |
658  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
659  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
660  *           |    |           |          |           |    |
661  *           |    +-----------+ooo    ...+-----------+    |
662  *           |          HH      ooo  ...       HH         |
663  *           |          HH        oo..         HH         |
664  *           |          HH        ..oo         HH         |
665  *           |          HH      ...  ooo       HH         |
666  *           |    +-----------+...    ooo+-----------+    |
667  *           |    |           |          |           |    |
668  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
669  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
670  *           |    |           |          |           |    |
671  *           |    +-----------+          +-----------+    |
672  *           |          PP                     PP         |
673  *           +----------PP---------------------PP---------+
674  *                      PP                     PP
675  *                      PP                     PP
676  *
677  *  This image represents a single Zen package. In this example, it has four
678  *  Zeppelin dies, though some configurations only have a single one. In this
679  *  example, each die is directly connected to the next. Also, each die is
680  *  represented as being connected to memory by the 'M' character and connected
681  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
682  *  die is made up of two core complexes, we have multiple different NUMA
683  *  domains that we care about for these systems.
684  *
685  * ZEN 2
686  *
687  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
688  *	each Zeppelin Die had its own I/O die, that has been moved out of the
689  *	core complex in Zen 2. The actual core complex looks pretty similar, but
690  *	now the die actually looks much simpler:
691  *
692  *      +--------------------------------------------------------+
693  *      | Zen 2 Core Complex Die    HH                           |
694  *      |                           HH                           |
695  *      |          +-----------+    HH    +-----------+          |
696  *      |          |           |    HH    |           |          |
697  *      |          |    Core   |==========|    Core   |          |
698  *      |          |  Complex  |==========|  Complex  |          |
699  *      |          |           |    HH    |           |          |
700  *      |          +-----------+    HH    +-----------+          |
701  *      |                           HH                           |
702  *      |                           HH                           |
703  *      +--------------------------------------------------------+
704  *
705  *	From here, when we add the central I/O die, this changes things a bit.
706  *	Each die is connected to the I/O die, rather than trying to interconnect
707  *	them directly. The following image takes the same Zen 1 image that we
708  *	had earlier and shows what it looks like with the I/O die instead:
709  *
710  *                                 PP    PP
711  *                                 PP    PP
712  *           +---------------------PP----PP---------------------+
713  *           |                     PP    PP                     |
714  *           |  +-----------+      PP    PP      +-----------+  |
715  *           |  |           |      PP    PP      |           |  |
716  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
717  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
718  *           |  |         |o|oooo|          |oooo|o|         |  |
719  *           |  +-----------+    |          |    +-----------+  |
720  *           |                   |   I/O    |                   |
721  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
722  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
723  *           |                   |          |                   |
724  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
725  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
726  *           |                   |          |                   |
727  *           |  +-----------+    |          |    +-----------+  |
728  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
729  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
730  *           |  |    Die    |      PP    PP      |    Die    |  |
731  *           |  |           |      PP    PP      |           |  |
732  *           |  +-----------+      PP    PP      +-----------+  |
733  *           |                     PP    PP                     |
734  *           +---------------------PP----PP---------------------+
735  *                                 PP    PP
736  *                                 PP    PP
737  *
738  *	The above has four core complex dies installed, though the Zen 2 EPYC
739  *	and Threadripper parts allow for up to eight, while the Ryzen parts
740  *	generally only have one to two. The more notable difference here is how
741  *	everything communicates. Note that memory and PCIe come out of the
742  *	central die. This changes the way that one die accesses a resource. It
743  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
744  *	satisfied it locally. In general, this ends up being a better strategy
745  *	for most things, though it is possible to still treat everything in four
746  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
747  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
748  *	now there is only one 'node' present.
749  *
750  * ZEN 3
751  *
752  *	From an architectural perspective, Zen 3 is a much smaller change from
753  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
754  *	its microarchitectural changes. The biggest thing for us is how the die
755  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
756  *	cache. However, in Zen 3, the L3 is now shared between the entire core
757  *	complex die and is no longer partitioned between each core complex. This
758  *	means that all cores on the die can share the same L3 cache. Otherwise,
759  *	the general layout of the overall package with various core complexes
760  *	and an I/O die stays the same. Here's what the Core Complex Die looks
761  *	like in a bit more detail:
762  *
763  *               +-------------------------------------------------+
764  *               | Zen 3 Core Complex Die                          |
765  *               | +-------------------+    +-------------------+  |
766  *               | | Core       +----+ |    | Core       +----+ |  |
767  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
768  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
769  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
770  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
771  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
772  *               | +-------------------+    +-------------------+  |
773  *               | +-------------------+    +-------------------+  |
774  *               | | Core       +----+ |    | Core       +----+ |  |
775  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
776  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
777  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
778  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
779  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
780  *               | +-------------------+    +-------------------+  |
781  *               |                                                 |
782  *               | +--------------------------------------------+  |
783  *               | |                 L3 Cache                   |  |
784  *               | +--------------------------------------------+  |
785  *               |                                                 |
786  *               | +-------------------+    +-------------------+  |
787  *               | | Core       +----+ |    | Core       +----+ |  |
788  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
789  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
790  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
791  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
792  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
793  *               | +-------------------+    +-------------------+  |
794  *               | +-------------------+    +-------------------+  |
795  *               | | Core       +----+ |    | Core       +----+ |  |
796  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
797  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
798  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
799  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
800  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
801  *               | +-------------------+    +-------------------+  |
802  *               +-------------------------------------------------+
803  *
804  *	While it is not pictured, there are connections from the die to the
805  *	broader data fabric and additional functional blocks to support that
806  *	communication and coherency.
807  *
808  * CPUID LEAVES
809  *
810  * There are a few different CPUID leaves that we can use to try and understand
811  * the actual state of the world. As part of the introduction of family 0xf, AMD
812  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
813  * processors that are in the system. Because families before Zen didn't have
814  * SMT, this was always the number of cores that were in the system. However, it
815  * should always be thought of as the number of logical threads to be consistent
816  * between generations. In addition we also get the size of the APIC ID that is
817  * used to represent the number of logical processors. This is important for
818  * deriving topology information.
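 *
 * A hedged sketch of pulling those two values out of leaf 0x80000008 (field
 * positions per the AMD documentation; the names are illustrative and it
 * reuses the hypothetical cpuid_raw() helper):
 *
 *	static void
 *	amd_leaf_80000008(uint32_t *nthreads, uint32_t *apic_id_bits)
 *	{
 *		uint32_t regs[4];
 *
 *		cpuid_raw(0x80000008, 0, regs);
 *		*nthreads = (regs[2] & 0xff) + 1;	// %ecx[7:0] is NC
 *		*apic_id_bits = (regs[2] >> 12) & 0xf;	// %ecx[15:12]
 *		// A zero APIC ID size field means the width must instead be
 *		// derived from the thread count itself.
 *	}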
819  *
820  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
821  * bit between Bulldozer and later families, but it is quite useful in
822  * determining the topology information. Because this information has changed
823  * across family generations, it's worth calling out what these mean
824  * explicitly. The registers have the following meanings:
825  *
826  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
827  *		APIC ID, even though on systems without x2apic support, it will
828  *		be limited to 8 bits.
829  *
830  *	%ebx	On Bulldozer-era systems this contains information about the
831  *		number of cores that are in a compute unit (cores that share
832  *		resources). It also contains a per-package compute unit ID that
833  *		identifies which compute unit the logical CPU is a part of.
834  *
835  *		On Zen-era systems this instead contains the number of threads
836  *		per core and the ID of the core that the logical CPU is a part
837  *		of. Note, this ID is unique only to the package, it is not
838  *		globally unique across the entire system.
839  *
840  *	%ecx	This contains the number of nodes that exist in the package. It
841  *		also contains an ID that identifies which node the logical CPU
842  *		is a part of.
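 *
 * A hedged sketch of decoding leaf 0x8000001E using the Zen-era meanings just
 * described (field positions per the AMD documentation; the function is
 * illustrative and reuses the hypothetical cpuid_raw() helper):
 *
 *	static void
 *	amd_leaf_8000001e(uint32_t *apicid, uint32_t *coreid,
 *	    uint32_t *threads_per_core, uint32_t *nodeid, uint32_t *nnodes)
 *	{
 *		uint32_t regs[4];
 *
 *		cpuid_raw(0x8000001e, 0, regs);
 *		*apicid = regs[0];			// %eax
 *		*coreid = regs[1] & 0xff;		// %ebx[7:0]
 *		*threads_per_core = ((regs[1] >> 8) & 0xff) + 1;
 *		*nodeid = regs[2] & 0xff;		// %ecx[7:0]
 *		*nnodes = ((regs[2] >> 8) & 0x7) + 1;	// %ecx[10:8]
 *	}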
843  *
844  * Finally, we also use cpuid leaf 0x8000001D to gather information about the
845  * cache layout and to determine which logical CPUs are sharing which caches.
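 *
 * As a hedged sketch, leaf 0x8000001D follows the same sub-leaf shape as the
 * Intel deterministic cache leaf: %eax[4:0] is the cache type (zero means no
 * more caches), %eax[7:5] is the level, and %eax[25:14] is one less than the
 * number of logical CPUs sharing that cache. Reusing the hypothetical
 * cpuid_raw() helper:
 *
 *	static uint32_t
 *	amd_last_cache_sharing(void)
 *	{
 *		uint32_t regs[4], subleaf, sharing = 1;
 *
 *		for (subleaf = 0; ; subleaf++) {
 *			cpuid_raw(0x8000001d, subleaf, regs);
 *			if ((regs[0] & 0x1f) == 0)
 *				break;
 *			// The last cache enumerated is the one furthest from
 *			// the CPU, so keep overwriting as we go.
 *			sharing = ((regs[0] >> 14) & 0xfff) + 1;
 *		}
 *		return (sharing);
 *	}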
846  *
847  * illumos Topology
848  * ----------------
849  *
850  * Based on the above we synthesize the information into several different
851  * variables that we store in the 'struct cpuid_info'. We'll go into the details
852  * of what each member is supposed to represent and their uniqueness. In
853  * general, there are two levels of uniqueness that we care about. We care about
854  * an ID that is globally unique. That means that it will be unique across all
855  * entities in the system. For example, the default logical CPU ID is globally
856  * unique. On the other hand, there is some information that we only care about
857  * being unique within the context of a single package / socket. Here are the
858  * variables that we keep track of and their meaning.
859  *
860  * Several of the values that represent an identifier, with the exception
861  * of cpi_apicid, are allowed to be synthetic.
862  *
863  *
864  * cpi_apicid
865  *
866  *	This is the value of the CPU's APIC id. This should be the full 32-bit
867  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
868  *	APIC ID. This value is globally unique between all logical CPUs across
869  *	all packages. This is usually required by the APIC.
870  *
871  * cpi_chipid
872  *
873  *	This value indicates the ID of the package that the logical CPU is a
874  *	part of. This value is allowed to be synthetic. It is usually derived by
875  *	taking the CPU's APIC ID and determining how many bits are used to
876  *	represent CPU cores in the package. All logical CPUs that are part of
877  *	the same package must have the same value.
878  *
879  * cpi_coreid
880  *
881  *	This represents the ID of a CPU core. Two logical CPUs should only have
882  *	the same cpi_coreid value if they are part of the same core. These
883  *	values may be synthetic. On systems that support SMT, this value is
884  *	usually derived from the APIC ID, otherwise it is often synthetic and
885  *	just set to the value of the cpu_id in the cpu_t.
886  *
887  * cpi_pkgcoreid
888  *
889  *	This is similar to the cpi_coreid in that logical CPUs that are part of
890  *	the same core should have the same ID. The main difference is that these
891  *	values are only required to be unique to a given socket.
892  *
893  * cpi_clogid
894  *
895  *	This represents the logical ID of a logical CPU. This value should be
896  *	unique within a given socket for each logical CPU. This is allowed to be
897  *	synthetic, though it is usually based on the CPU's APIC ID. The
898  *	broader system expects that logical CPUs that are part of the same core
899  *	have contiguous numbers. For example, with two threads per core, the two
900  *	IDs divided by two should be equal, the first ID modulo two should be
901  *	zero, and the second should be one. Thus IDs 4 and 5 indicate two logical
902  *	CPUs that are part of the same core, while IDs 5 and 6 represent two
903  *	logical CPUs that are part of different cores (see the sketch below).
904  *
905  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
906  *	from the same source, strictly speaking, they don't have to be and the
907  *	two values should be considered logically independent. One should not
908  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
909  *	some kind of relationship. While this is tempting, we've seen cases on
910  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
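 *
 *	A minimal sketch of the contiguity rule above, assuming the number of
 *	threads per core is a power of two; the function name is illustrative:
 *
 *		static int
 *		same_core(uint32_t clogid_a, uint32_t clogid_b,
 *		    uint32_t threads_per_core)
 *		{
 *			return ((clogid_a / threads_per_core) ==
 *			    (clogid_b / threads_per_core));
 *		}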
911  *
912  * cpi_ncpu_per_chip
913  *
914  *	This value indicates the total number of logical CPUs that exist in the
915  *	physical package. Critically, this is not the number of logical CPUs
916  *	that exist for just the single core.
917  *
918  *	This value should be the same for all logical CPUs in the same package.
919  *
920  * cpi_ncore_per_chip
921  *
922  *	This value indicates the total number of physical CPU cores that exist
923  *	in the package. The system compares this value with cpi_ncpu_per_chip to
924  *	determine if simultaneous multi-threading (SMT) is enabled. When
925  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
926  *	the X86FSET_HTT feature is not set. If this value is greater than one,
927  *	then we consider the processor to have the feature X86FSET_CMP, to
928  *	indicate that there is support for more than one core (sketch below).
929  *
930  *	This value should be the same for all logical CPUs in the same package.
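 *
 *	A minimal sketch of the checks described above; the names are
 *	illustrative, not this file's interfaces:
 *
 *		static void
 *		classify_package(uint32_t ncpu_per_chip,
 *		    uint32_t ncore_per_chip, int *has_htt, int *has_cmp)
 *		{
 *			// More logical CPUs than cores implies SMT; more than
 *			// one core implies a multi-core (CMP) package.
 *			*has_htt = ncpu_per_chip > ncore_per_chip;
 *			*has_cmp = ncore_per_chip > 1;
 *		}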
931  *
932  * cpi_procnodes_per_pkg
933  *
934  *	This value indicates the number of 'nodes' that exist in the package.
935  *	When processors are actually a multi-chip module, this represents the
936  *	number of such modules that exist in the package. Currently, on Intel
937  *	based systems this member is always set to 1.
938  *
939  *	This value should be the same for all logical CPUs in the same package.
940  *
941  * cpi_procnodeid
942  *
943  *	This value indicates the ID of the node that the logical CPU is a part
944  *	of. All logical CPUs that are in the same node must have the same value
945  *	here. This value must be unique across all of the packages in the
946  *	system.  On Intel based systems, this is currently set to the value in
947  *	cpi_chipid because there is only one node.
948  *
949  * cpi_cores_per_compunit
950  *
951  *	This value indicates the number of cores that are part of a compute
952  *	unit. See the AMD topology section for this. This member only has real
953  *	meaning currently for AMD Bulldozer family processors. For all other
954  *	processors, this should currently be set to 1.
955  *
956  * cpi_compunitid
957  *
958  *	This indicates the compute unit that the logical CPU belongs to. For
959  *	processors without AMD Bulldozer-style compute units this should be set
960  *	to the value of cpi_coreid.
961  *
962  * cpi_ncpu_shr_last_cache
963  *
964  *	This indicates the number of logical CPUs that are sharing the same last
965  *	level cache. This value should be the same for all CPUs that are sharing
966  *	that cache. The last cache refers to the cache that is closest to memory
967  *	and furthest away from the CPU.
968  *
969  * cpi_last_lvl_cacheid
970  *
971  *	This indicates the ID of the last cache that the logical CPU uses. This
972  *	cache is often shared between multiple logical CPUs and is the cache
973  *	that is closest to memory and furthest away from the CPU. This value
974  *	should be the same for a group of logical CPUs only if they actually
975  *	share the same last level cache. IDs should not overlap between
976  *	packages.
977  *
978  * cpi_ncore_bits
979  *
980  *	This indicates the number of bits that are required to represent all of
981  *	the cores in the system. As cores are derived based on their APIC IDs,
982  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
983  *	this value to be larger than the actual number of IDs that are present
984  *	in the system. This is used to size tables by the CMI framework. It is
985  *	only filled in for Intel and AMD CPUs.
986  *
987  * cpi_nthread_bits
988  *
989  *	This indicates the number of bits required to represent all of the IDs
990  *	that cover the logical CPUs that exist on a given core. It's OK for this
991  *	value to be larger than the actual number of IDs that are present in the
992  *	system.  This is used to size tables by the CMI framework. It is
993  *	only filled in for Intel and AMD CPUs.
994  *
995  * -----------
996  * Hypervisors
997  * -----------
998  *
999  * If trying to manage the differences between vendors wasn't bad enough, it can
1000  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1001  * the ability to interpose on all cpuid instructions and change them to suit
1002  * their purposes. In general, this is necessary as the hypervisor wants to be
1003  * able to present a more uniform set of features or not necessarily give the
1004  * guest operating system kernel knowledge of all features so it can be
1005  * more easily migrated between systems.
1006  *
1007  * When it comes to trying to determine topology information, this can be a
1008  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1009  * leaf, it'll often return all zeros. Because of that, you'll often see various
1010  * checks scattered about that verify fields are non-zero before we assume we
1011  * can use them.
1012  *
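 * As an illustrative sketch of that pattern (the same check shows up in
 * cpuid_leafB_getids() later in this file; 'apicid' is just a stand-in
 * local), leaf B output is only trusted when %ebx comes back non-zero:
 *
 *	(void) __cpuid_insn(cp);
 *	if (cp->cp_ebx != 0)
 *		apicid = cp->cp_edx;
 *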
1013  * When it comes to topology information, the hypervisor is often incentivized
1014  * to lie to you about topology. This is because it doesn't always actually
1015  * guarantee that topology at all. The topology path we take in the system
1016  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1017  * or AMD CPU, then we basically do our normal path. However, when they don't
1018  * use an actual vendor, then we usually end up enumerating multiple one-core
1019  * CPUs that are often on different sockets. The actual behavior
1020  * depends greatly on what the hypervisor actually exposes to us.
1021  *
1022  * --------------------
1023  * Exposing Information
1024  * --------------------
1025  *
1026  * We expose CPUID information in three different forms in the system.
1027  *
1028  * The first is through the x86_featureset variable. This is used in conjunction
1029  * with the is_x86_feature() function. This is queried by x86-specific functions
1030  * to determine which features are or aren't present in the system and to make
1031  * decisions based upon them. For example, users of this include everything from
1032  * parts of the system dedicated to reliability, availability, and
1033  * serviceability (RAS), to making decisions about how to handle security
1034  * mitigations, to various x86-specific drivers. General purpose or
1035  * architecture independent drivers should never be calling this function.
1036  *
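 * As a minimal sketch of this first form (X86FSET_SMEP is just one example
 * feature bit, and enable_smep_specific_handling() is a hypothetical
 * consumer):
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SMEP))
 *		enable_smep_specific_handling();
 *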
1037  * The second means is through the auxiliary vector. The auxiliary vector is a
1038  * series of tagged data that the kernel passes down to a user program when it
1039  * begins executing. This information is used to indicate to programs what
1040  * instruction set extensions are present. For example, information about the
1041  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1042  * since user programs cannot make use of it. However, things like the AVX
1043  * instruction sets are. Programs use this information to make run-time
1044  * decisions about what features they should use. As an example, the run-time
1045  * link-editor (rtld) can relocate different functions depending on the hardware
1046  * support available.
1047  *
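 * As a sketch of how a user program consumes this (AV_386_AVX is one example
 * flag from sys/auxv_386.h, and getisax(2) is the usual way to query it;
 * use_avx_implementation() stands in for whatever hardware-specific path the
 * program wants to take):
 *
 *	uint32_t isa;
 *
 *	(void) getisax(&isa, 1);
 *	if (isa & AV_386_AVX)
 *		use_avx_implementation();
 *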
1048  * The final form is through a series of accessor functions that all have the
1049  * form cpuid_get*. This is used by a number of different subsystems in the
1050  * kernel to determine more detailed information about what we're running on,
1051  * topology information, etc. Some of these subsystems include processor groups
1052  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1053  * microcode, and performance monitoring. These functions all ASSERT that the
1054  * CPU they're being called on has reached a certain cpuid pass. If the passes
1055  * are rearranged, then this needs to be adjusted.
1056  *
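 * As a brief sketch of this third form (cpuid_getvendor() and
 * cpuid_getfamily() are two such accessors; the family check and
 * take_zen_specific_path() are purely illustrative):
 *
 *	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD &&
 *	    cpuid_getfamily(CPU) >= 0x17)
 *		take_zen_specific_path();
 *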
1057  * -----------------------------------------------
1058  * Speculative Execution CPU Side Channel Security
1059  * -----------------------------------------------
1060  *
1061  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1062  * execution in the CPU to create side channels there have been a number of
1063  * different attacks and corresponding issues that the operating system needs to
1064  * mitigate against. The following list covers some of the common, though not
1065  * exhaustive, issues that we know about and for which we have done some
1066  * mitigation work in the system or still need to do more:
1067  *
1068  *   - Spectre v1
1069  *   - swapgs (Spectre v1 variant)
1070  *   - Spectre v2
1071  *   - Meltdown (Spectre v3)
1072  *   - Rogue Register Read (Spectre v3a)
1073  *   - Speculative Store Bypass (Spectre v4)
1074  *   - ret2spec, SpectreRSB
1075  *   - L1 Terminal Fault (L1TF)
1076  *   - Microarchitectural Data Sampling (MDS)
1077  *
1078  * Each of these requires different sets of mitigations and has different attack
1079  * surfaces. For the most part, this discussion is about protecting the kernel
1080  * from non-kernel executing environments such as user processes and hardware
1081  * virtual machines. Unfortunately, there are a number of user vs. user
1082  * scenarios that exist with these. The rest of this section will describe the
1083  * overall approach that the system has taken to address these as well as their
1084  * shortcomings. Unfortunately, not all of the above have been handled today.
1085  *
1086  * SPECTRE v2, ret2spec, SpectreRSB
1087  *
1088  * The second variant of the spectre attack focuses on performing branch target
1089  * injection. This generally impacts indirect call instructions in the system.
1090  * There are three different ways to mitigate this issue that are commonly
1091  * described today:
1092  *
1093  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1094  *  2. Using Retpolines and RSB Stuffing
1095  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1096  *
1097  * IBRS uses a feature added to microcode to restrict speculation, among other
1098  * things. This form of mitigation has not been used as it has been generally
1099  * seen as too expensive and requires reactivation upon various transitions in
1100  * the system.
1101  *
1102  * As a less impactful alternative to IBRS, retpolines were developed by
1103  * Google. These basically require one to replace indirect calls with a specific
1104  * trampoline that will cause speculation to fail and break the attack.
1105  * Retpolines require compiler support. We always build with retpolines in the
1106  * external thunk mode. This means that a traditional indirect call is replaced
1107  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1108  * of this is that all indirect function calls are performed through a register.
1109  *
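 * As a rough sketch, an indirect call that would traditionally be emitted as
 *
 *	call	*%rax
 *
 * is instead emitted as
 *
 *	call	__x86_indirect_thunk_rax
 *
 * with the register (%rax here) chosen by the compiler.
 *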
1110  * We have to use a common external location for the thunk and not inline it
1111  * into the callsite so that we have a single place to patch these functions.
1112  * As it turns out, we currently have two different forms of retpolines that
1113  * exist in the system:
1114  *
1115  *  1. A full retpoline
1116  *  2. A no-op version
1117  *
1118  * The first one is used in the general case. Historically, there was an
1119  * AMD-specific optimized retpoline variant that was based around using a
1120  * serializing lfence instruction; however, in March 2022 it was announced that
1121  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1122  * use it and it is no longer available in the system.
1123  *
1124  * The second form described above, the no-op version, is the most curious. It
1125  * turns out that the way retpolines are implemented relies on how speculation
1126  * is performed on a 'ret' instruction. Intel has continued to optimize this
1127  * process (which is partly why we need to have return stack buffer stuffing,
1128  * but more on that in a bit) and in processors starting with Cascade Lake
1129  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1130  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1131  *
1132  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1133  * physical core. However, if this is the case, we don't want to use retpolines
1134  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1135  * function (called a thunk) into a jmp instruction. This means that we're still
1136  * paying the cost of an extra jump to the external thunk, but it gives us
1137  * flexibility and the ability to have a single kernel image that works across a
1138  * wide variety of systems and hardware features.
1139  *
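 * To make that concrete, a simplified sketch of the two thunk bodies for
 * %rax follows; the real implementations live in the kernel's retpoline
 * assembly rather than here. The full retpoline form:
 *
 *	__x86_indirect_thunk_rax:
 *		call	2f
 *	1:	pause
 *		lfence
 *		jmp	1b
 *	2:	movq	%rax, (%rsp)
 *		ret
 *
 * The eIBRS form, where the thunk is patched into a plain indirect jump:
 *
 *	__x86_indirect_thunk_rax:
 *		jmp	*%rax
 *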
1140  * Unfortunately, this alone is insufficient. First, Skylake systems have
1141  * additional speculation for the Return Stack Buffer (RSB) which is used to
1142  * return from call instructions which retpolines take advantage of. However,
1143  * this problem is not just limited to Skylake and is actually more pernicious.
1144  * The SpectreRSB paper introduces several more problems that can arise with
1145  * dealing with this. The RSB can be poisoned just like the indirect branch
1146  * predictor. This means that one needs to clear the RSB when transitioning
1147  * between two different privilege domains. Some examples include:
1148  *
1149  *  - Switching between two different user processes
1150  *  - Going between user land and the kernel
1151  *  - Returning to the kernel from a hardware virtual machine
1152  *
1153  * Mitigating this involves combining a couple of different things. The first is
1154  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1155  * Bridge. When an RSB entry refers to a user address and we're executing in the
1156  * kernel, speculation through it will be stopped when SMEP is enabled. This
1157  * protects against a number of the different cases that we would normally be
1158  * worried about such as when we enter the kernel from user land.
1159  *
1160  * To prevent against additional manipulation of the RSB from other contexts
1161  * such as a non-root VMX context attacking the kernel we first look to
1162  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1163  * nothing else that we need to do to protect the kernel at this time.
1164  *
1165  * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1166  * the return stack buffer. We do this through the x86_rsb_stuff() function.
1167  * Currently this is employed on context switch and vmx_exit. The
1168  * x86_rsb_stuff() function is disabled only when mitigations in general are.
1169  *
1170  * If SMEP is not present, then we would have to stuff the RSB every time we
1171  * transitioned from user mode to the kernel, which isn't very practical right
1172  * now.
1173  *
1174  * To fully protect user to user and vmx to vmx attacks from these classes of
1175  * issues, we would also need to allow them to opt into performing an Indirect
1176  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1177  *
1178  * By default, the system will enable RSB stuffing and the required variant of
1179  * retpolines and store that information in the x86_spectrev2_mitigation value.
1180  * This will be evaluated after a microcode update as well, though it is
1181  * expected that microcode updates will not take away features. This may mean
1182  * that a late loaded microcode may not end up in the optimal configuration
1183  * (though this should be rare).
1184  *
1185  * Currently we do not build kmdb with retpolines or perform any additional side
1186  * channel security mitigations for it. One complication with kmdb is that it
1187  * requires its own retpoline thunks and it would need to adjust itself based on
1188  * what the kernel does. The threat model of kmdb is more limited and therefore
1189  * it may make more sense to investigate using prediction barriers as the whole
1190  * system is only executing a single instruction at a time while in kmdb.
1191  *
1192  * SPECTRE v1, v4
1193  *
1194  * The v1 and v4 variants of spectre are not currently mitigated in the
1195  * system and require other classes of changes to occur in the code.
1196  *
1197  * SPECTRE v1 (SWAPGS VARIANT)
1198  *
1199  * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1200  * can generally affect any branch-dependent code. The swapgs issue is one
1201  * variant of this. If we are coming in from userspace, we can have code like
1202  * this:
1203  *
1204  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1205  *	je	1f
1206  *	movq	$0, REGOFF_SAVFP(%rsp)
1207  *	swapgs
1208  *	1:
1209  *	movq	%gs:CPU_THREAD, %rax
1210  *
1211  * If an attacker can cause a mis-speculation of the branch here, we could skip
1212  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1213  * load. If subsequent code can act as the usual Spectre cache gadget, this
1214  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1215  * any use of the %gs override.
1216  *
1217  * The other case is also an issue: if we're coming into a trap from kernel
1218  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1219  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1220  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1221  * case, and the fix is the same in both cases (an lfence at the branch target
1222  * 1: in this example), we'll just do it unconditionally.
1223  *
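 * Putting that together, the mitigated version of the earlier example simply
 * gains an lfence at the branch target:
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax
 *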
1224  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1225  * harder for user-space to actually set a useful %gsbase value: although it's
1226  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1227  * mitigate anyway.
1228  *
1229  * MELTDOWN
1230  *
1231  * Meltdown, or spectre v3, allowed a user process to read any data in its
1232  * address space regardless of whether or not the page tables in question
1233  * allowed the user to read it. The solution to meltdown
1234  * is kernel page table isolation. In this world, there are two page tables that
1235  * are used for a process, one in user land and one in the kernel. To implement
1236  * this we use per-CPU page tables and switch between the user and kernel
1237  * variants when entering and exiting the kernel.  For more information about
1238  * this process and how the trampolines work, please see the big theory
1239  * statements and additional comments in:
1240  *
1241  *  - uts/i86pc/ml/kpti_trampolines.s
1242  *  - uts/i86pc/vm/hat_i86.c
1243  *
1244  * While Meltdown only impacted Intel systems and there are also Intel systems
1245  * that have Meltdown (Intel's Rogue Data Cache Load) fixed, we always have
1246  * kernel page table isolation enabled. While this may at first seem weird, an
1247  * important thing to remember is that you can't speculatively read an address
1248  * if it's never in your page table at all. Having user processes without kernel
1249  * pages present provides us with an important layer of defense in the kernel
1250  * against any other side channel attacks that exist and have yet to be
1251  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1252  * default, no matter the x86 system.
1253  *
1254  * L1 TERMINAL FAULT
1255  *
1256  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1257  * execution uses page table entries. Effectively, it is two different problems.
1258  * The first is that it ignores the not present bit in the page table entries
1259  * when performing speculative execution. This means that something can
1260  * speculatively read the listed physical address if it's present in the L1
1261  * cache under certain conditions (see Intel's documentation for the full set of
1262  * conditions). Secondly, this can be used to bypass hardware virtualization
1263  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1264  * instructions.
1265  *
1266  * For the non-hardware virtualized case, this is relatively easy to deal with.
1267  * We must make sure that all unmapped pages have an address of zero. This means
1268  * that they could read the first 4k of physical memory; however, we never use
1269  * that first page in the operating system and always skip putting it in our
1270  * memory map, even if firmware tells us we can use it in our memory map. While
1271  * other systems try to put extra metadata in the address and reserved bits,
1272  * which made this problematic for them, we do not.
1273  *
1274  * For hardware virtual machines things are more complicated. Because they can
1275  * construct their own page tables, it isn't hard for them to perform this
1276  * attack against any physical address. The one wrinkle is that this physical
1277  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1278  * to flush the L1 data cache. We wrap this up in the function
1279  * spec_uarch_flush(). This function is also used in the mitigation of
1280  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1281  * hypervisors such as KVM or bhyve are responsible for performing this before
1282  * entering the guest.
1283  *
1284  * Because this attack takes place in the L1 cache, there's another wrinkle
1285  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1286  * designs. This means that when a thread enters a hardware virtualized context
1287  * and flushes the L1 data cache, the other thread on the processor may then go
1288  * ahead and put new data in it that can be potentially attacked. While one
1289  * solution is to disable SMT on the system, another option that is available is
1290  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1291  * goes through and makes sure that if a HVM is being scheduled on one thread,
1292  * then the thing on the other thread is from the same hardware virtual machine.
1293  * If an interrupt comes in or the guest exits to the broader system, then the
1294  * other SMT thread will be kicked out.
1295  *
1296  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1297  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1298  * perform L1TF related mitigations.
1299  *
1300  * MICROARCHITECTURAL DATA SAMPLING
1301  *
1302  * Microarchitectural data sampling (MDS) is a combination of four discrete
1303  * vulnerabilities that are similar issues affecting various parts of the CPU's
1304  * microarchitectural implementation around load, store, and fill buffers.
1305  * Specifically it is made up of the following subcomponents:
1306  *
1307  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1308  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1309  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1310  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1311  *
1312  * To begin addressing these, Intel has introduced another feature in microcode
1313  * called MD_CLEAR. This changes the verw instruction to operate in a different
1314  * way. This allows us to execute the verw instruction in a particular way to
1315  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1316  * updated when this microcode is present to flush this state.
1317  *
1318  * Primarily we need to flush this state whenever we transition from the kernel
1319  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1320  * little bit different. Here the structures are statically sized when a logical
1321  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1322  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1323  * mwait, or another ACPI method. To perform these flushes, we call
1324  * x86_md_clear() at all of these transition points.
1325  *
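 * As a minimal illustration of the kind of sequence involved (this is a
 * sketch, not the kernel's actual x86_md_clear() implementation): with the
 * updated microcode loaded, issuing verw against any valid selector flushes
 * the affected buffers, e.g.
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp
 *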
1326  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1327  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1328  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1329  * a no-op.
1330  *
1331  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1332  * particular, everything we've discussed above is only valid for a single
1333  * thread executing on a core. In the case where you have hyper-threading
1334  * present, this attack can be performed between threads. The theoretical fix
1335  * for this is to ensure that both threads are always in the same security
1336  * domain. This means that they are executing in the same ring and mutually
1337  * trust each other. Practically speaking, this would mean that a system call
1338  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1339  * Rather than implement this, we recommend that one disables hyper-threading
1340  * through the use of psradm -aS.
1341  *
1342  * TSX ASYNCHRONOUS ABORT
1343  *
1344  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1345  * behaves like MDS, but leverages Intel's transactional instructions as another
1346  * vector. Effectively, when a transaction hits one of these cases (unmapped
1347  * page, various cache snoop activity, etc.) then the same data can be exposed
1348  * as in the case of MDS. This means that you can attack your twin.
1349  *
1350  * Intel has described that there are two different ways that we can mitigate
1351  * this problem on affected processors:
1352  *
1353  *   1) We can use the same techniques used to deal with MDS. Flushing the
1354  *      microarchitectural buffers and disabling hyperthreading will mitigate
1355  *      this in the same way.
1356  *
1357  *   2) Using microcode to disable TSX.
1358  *
1359  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1360  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1361  * That's OK as we're already doing all such mitigations. On the other hand,
1362  * processors with MDS_NO are all supposed to receive microcode updates that
1363  * enumerate support for disabling TSX. In general, we'd rather use this method
1364  * when available as it doesn't require disabling hyperthreading to be
1365  * effective. Currently we basically are relying on microcode for processors
1366  * that enumerate MDS_NO.
1367  *
1368  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1369  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1370  * different powers. The first allows us to cause all transactions to
1371  * immediately abort. The second gives us a means of disabling TSX completely,
1372  * which includes removing it from cpuid. If we have support for this in
1373  * microcode during the first cpuid pass, then we'll disable TSX completely such
1374  * that user land never has a chance to observe the bit. However, if we are late
1375  * loading the microcode, then we must use the functionality to cause
1376  * transactions to automatically abort. This is necessary for user land's sake.
1377  * Once a program sees a cpuid bit, it must not be taken away.
1378  *
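 * As a rough sketch of the MSR manipulation involved (the constant names
 * follow x86_archext.h but should be treated as illustrative here, and
 * "late_load" is a hypothetical flag standing in for the pass-based decision
 * described above):
 *
 *	uint64_t val = rdmsr(MSR_IA32_TSX_CTRL);
 *	val |= IA32_TSX_CTRL_RTM_DISABLE;
 *	if (!late_load)
 *		val |= IA32_TSX_CTRL_CPUID_CLEAR;
 *	wrmsr(MSR_IA32_TSX_CTRL, val);
 *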
1379  * We track whether or not we should do this based on what cpuid pass we're in.
1380  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1381  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1382  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1383  * second time after we do the initial microcode update.  As a result we need to
1384  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1385  * suitable microcode on the current CPU (which happens prior to
1386  * cpuid_pass_ucode()).
1387  *
1388  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1389  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1390  * unfortunate feature in a number of ways, and taking the opportunity to
1391  * finally be able to turn it off is likely to be of benefit in the future.
1392  *
1393  * SUMMARY
1394  *
1395  * The following table attempts to summarize the mitigations for various issues
1396  * and what's done in various places:
1397  *
1398  *  - Spectre v1: Not currently mitigated
1399  *  - swapgs: lfences after swapgs paths
1400  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS if HW support
1401  *  - Meltdown: Kernel Page Table Isolation
1402  *  - Spectre v3a: Updated CPU microcode
1403  *  - Spectre v4: Not currently mitigated
1404  *  - SpectreRSB: SMEP and RSB Stuffing
1405  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1406  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1407  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1408  *
1409  * The following table indicates the x86 feature set bits that indicate that a
1410  * given problem has been solved or a notable feature is present:
1411  *
1412  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1413  *  - MDS_NO: All forms of MDS
1414  *  - TAA_NO: TAA
1415  */
1416 
1417 #include <sys/types.h>
1418 #include <sys/archsystm.h>
1419 #include <sys/x86_archext.h>
1420 #include <sys/kmem.h>
1421 #include <sys/systm.h>
1422 #include <sys/cmn_err.h>
1423 #include <sys/sunddi.h>
1424 #include <sys/sunndi.h>
1425 #include <sys/cpuvar.h>
1426 #include <sys/processor.h>
1427 #include <sys/sysmacros.h>
1428 #include <sys/pg.h>
1429 #include <sys/fp.h>
1430 #include <sys/controlregs.h>
1431 #include <sys/bitmap.h>
1432 #include <sys/auxv_386.h>
1433 #include <sys/memnode.h>
1434 #include <sys/pci_cfgspace.h>
1435 #include <sys/comm_page.h>
1436 #include <sys/mach_mmu.h>
1437 #include <sys/ucode.h>
1438 #include <sys/tsc.h>
1439 #include <sys/kobj.h>
1440 #include <sys/asm_misc.h>
1441 
1442 #ifdef __xpv
1443 #include <sys/hypervisor.h>
1444 #else
1445 #include <sys/ontrap.h>
1446 #endif
1447 
1448 uint_t x86_vendor = X86_VENDOR_IntelClone;
1449 uint_t x86_type = X86_TYPE_OTHER;
1450 uint_t x86_clflush_size = 0;
1451 
1452 #if defined(__xpv)
1453 int x86_use_pcid = 0;
1454 int x86_use_invpcid = 0;
1455 #else
1456 int x86_use_pcid = -1;
1457 int x86_use_invpcid = -1;
1458 #endif
1459 
1460 typedef enum {
1461 	X86_SPECTREV2_RETPOLINE,
1462 	X86_SPECTREV2_ENHANCED_IBRS,
1463 	X86_SPECTREV2_DISABLED
1464 } x86_spectrev2_mitigation_t;
1465 
1466 uint_t x86_disable_spectrev2 = 0;
1467 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1468     X86_SPECTREV2_RETPOLINE;
1469 
1470 /*
1471  * The mitigation status for TAA:
1472  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1473  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1474  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1475  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1476  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1477  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1478  */
1479 typedef enum {
1480 	X86_TAA_NOTHING,
1481 	X86_TAA_DISABLED,
1482 	X86_TAA_MD_CLEAR,
1483 	X86_TAA_TSX_FORCE_ABORT,
1484 	X86_TAA_TSX_DISABLE,
1485 	X86_TAA_HW_MITIGATED
1486 } x86_taa_mitigation_t;
1487 
1488 uint_t x86_disable_taa = 0;
1489 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1490 
1491 uint_t pentiumpro_bug4046376;
1492 
1493 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1494 
1495 static char *x86_feature_names[NUM_X86_FEATURES] = {
1496 	"lgpg",
1497 	"tsc",
1498 	"msr",
1499 	"mtrr",
1500 	"pge",
1501 	"de",
1502 	"cmov",
1503 	"mmx",
1504 	"mca",
1505 	"pae",
1506 	"cv8",
1507 	"pat",
1508 	"sep",
1509 	"sse",
1510 	"sse2",
1511 	"htt",
1512 	"asysc",
1513 	"nx",
1514 	"sse3",
1515 	"cx16",
1516 	"cmp",
1517 	"tscp",
1518 	"mwait",
1519 	"sse4a",
1520 	"cpuid",
1521 	"ssse3",
1522 	"sse4_1",
1523 	"sse4_2",
1524 	"1gpg",
1525 	"clfsh",
1526 	"64",
1527 	"aes",
1528 	"pclmulqdq",
1529 	"xsave",
1530 	"avx",
1531 	"vmx",
1532 	"svm",
1533 	"topoext",
1534 	"f16c",
1535 	"rdrand",
1536 	"x2apic",
1537 	"avx2",
1538 	"bmi1",
1539 	"bmi2",
1540 	"fma",
1541 	"smep",
1542 	"smap",
1543 	"adx",
1544 	"rdseed",
1545 	"mpx",
1546 	"avx512f",
1547 	"avx512dq",
1548 	"avx512pf",
1549 	"avx512er",
1550 	"avx512cd",
1551 	"avx512bw",
1552 	"avx512vl",
1553 	"avx512fma",
1554 	"avx512vbmi",
1555 	"avx512_vpopcntdq",
1556 	"avx512_4vnniw",
1557 	"avx512_4fmaps",
1558 	"xsaveopt",
1559 	"xsavec",
1560 	"xsaves",
1561 	"sha",
1562 	"umip",
1563 	"pku",
1564 	"ospke",
1565 	"pcid",
1566 	"invpcid",
1567 	"ibrs",
1568 	"ibpb",
1569 	"stibp",
1570 	"ssbd",
1571 	"ssbd_virt",
1572 	"rdcl_no",
1573 	"ibrs_all",
1574 	"rsba",
1575 	"ssb_no",
1576 	"stibp_all",
1577 	"flush_cmd",
1578 	"l1d_vmentry_no",
1579 	"fsgsbase",
1580 	"clflushopt",
1581 	"clwb",
1582 	"monitorx",
1583 	"clzero",
1584 	"xop",
1585 	"fma4",
1586 	"tbm",
1587 	"avx512_vnni",
1588 	"amd_pcec",
1589 	"md_clear",
1590 	"mds_no",
1591 	"core_thermal",
1592 	"pkg_thermal",
1593 	"tsx_ctrl",
1594 	"taa_no",
1595 	"ppin",
1596 	"vaes",
1597 	"vpclmulqdq",
1598 	"lfence_serializing"
1599 };
1600 
1601 boolean_t
1602 is_x86_feature(void *featureset, uint_t feature)
1603 {
1604 	ASSERT(feature < NUM_X86_FEATURES);
1605 	return (BT_TEST((ulong_t *)featureset, feature));
1606 }
1607 
1608 void
1609 add_x86_feature(void *featureset, uint_t feature)
1610 {
1611 	ASSERT(feature < NUM_X86_FEATURES);
1612 	BT_SET((ulong_t *)featureset, feature);
1613 }
1614 
1615 void
1616 remove_x86_feature(void *featureset, uint_t feature)
1617 {
1618 	ASSERT(feature < NUM_X86_FEATURES);
1619 	BT_CLEAR((ulong_t *)featureset, feature);
1620 }
1621 
1622 boolean_t
1623 compare_x86_featureset(void *setA, void *setB)
1624 {
1625 	/*
1626 	 * We assume that the unused bits of the bitmap are always zero.
1627 	 */
1628 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1629 		return (B_TRUE);
1630 	} else {
1631 		return (B_FALSE);
1632 	}
1633 }
1634 
1635 void
1636 print_x86_featureset(void *featureset)
1637 {
1638 	uint_t i;
1639 
1640 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1641 		if (is_x86_feature(featureset, i)) {
1642 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1643 			    x86_feature_names[i]);
1644 		}
1645 	}
1646 }
1647 
1648 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1649 static size_t xsave_state_size = 0;
1650 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1651 boolean_t xsave_force_disable = B_FALSE;
1652 extern int disable_smap;
1653 
1654 /*
1655  * This is set to platform type we are running on.
1656  */
1657 static int platform_type = -1;
1658 
1659 #if !defined(__xpv)
1660 /*
1661  * Variable to patch if hypervisor platform detection needs to be
1662  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1663  */
1664 int enable_platform_detection = 1;
1665 #endif
1666 
1667 /*
1668  * monitor/mwait info.
1669  *
1670  * size_actual and buf_actual are the real address and size allocated to get
1671  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1672  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1673  * processor cache-line alignment, but this is not guaranteed in the future.
1674  */
1675 struct mwait_info {
1676 	size_t		mon_min;	/* min size to avoid missed wakeups */
1677 	size_t		mon_max;	/* size to avoid false wakeups */
1678 	size_t		size_actual;	/* size actually allocated */
1679 	void		*buf_actual;	/* memory actually allocated */
1680 	uint32_t	support;	/* processor support of monitor/mwait */
1681 };
1682 
1683 /*
1684  * xsave/xrestor info.
1685  *
1686  * This structure contains HW feature bits and the size of the xsave save area.
1687  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1688  * (xsave_state) to describe the xsave layout. However, at runtime the
1689  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1690  * xsave_state structure simply represents the legacy layout of the beginning
1691  * of the xsave area.
1692  */
1693 struct xsave_info {
1694 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1695 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1696 	size_t		xsav_max_size;  /* max size save area for HW features */
1697 	size_t		ymm_size;	/* AVX: size of ymm save area */
1698 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1699 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1700 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1701 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1702 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1703 	size_t		opmask_size;	/* AVX512: size of opmask save */
1704 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1705 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1706 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1707 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1708 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1709 };
1710 
1711 
1712 /*
1713  * These constants determine how many of the elements of the
1714  * cpuid we cache in the cpuid_info data structure; the
1715  * remaining elements are accessible via the cpuid instruction.
1716  */
1717 
1718 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1719 #define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1720 
1721 /*
1722  * See the big theory statement for a more detailed explanation of what some of
1723  * these members mean.
1724  */
1725 struct cpuid_info {
1726 	uint_t cpi_pass;		/* last pass completed */
1727 	/*
1728 	 * standard function information
1729 	 */
1730 	uint_t cpi_maxeax;		/* fn 0: %eax */
1731 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1732 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1733 
1734 	uint_t cpi_family;		/* fn 1: extended family */
1735 	uint_t cpi_model;		/* fn 1: extended model */
1736 	uint_t cpi_step;		/* fn 1: stepping */
1737 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1738 					/*		AMD: package/socket # */
1739 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1740 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1741 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1742 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1743 	uint_t cpi_ncache;		/* fn 2: number of elements */
1744 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1745 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1746 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1747 					/* Intel fn: 4, AMD fn: 8000001d */
1748 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1749 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1750 	/*
1751 	 * extended function information
1752 	 */
1753 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1754 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1755 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1756 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1757 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1758 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1759 
1760 	id_t cpi_coreid;		/* same coreid => strands share core */
1761 	int cpi_pkgcoreid;		/* core number within single package */
1762 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1763 					/* Intel: fn 4: %eax[31-26] */
1764 
1765 	/*
1766 	 * These values represent the number of bits that are required to store
1767 	 * information about the number of cores and threads.
1768 	 */
1769 	uint_t cpi_ncore_bits;
1770 	uint_t cpi_nthread_bits;
1771 	/*
1772 	 * supported feature information
1773 	 */
1774 	uint32_t cpi_support[6];
1775 #define	STD_EDX_FEATURES	0
1776 #define	AMD_EDX_FEATURES	1
1777 #define	TM_EDX_FEATURES		2
1778 #define	STD_ECX_FEATURES	3
1779 #define	AMD_ECX_FEATURES	4
1780 #define	STD_EBX_FEATURES	5
1781 	/*
1782 	 * Synthesized information, where known.
1783 	 */
1784 	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1785 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1786 	uint32_t cpi_socket;		/* Chip package/socket type */
1787 
1788 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1789 	uint32_t cpi_apicid;
1790 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1791 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1792 					/* Intel: 1 */
1793 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1794 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1795 
1796 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1797 };
1798 
1799 
1800 static struct cpuid_info cpuid_info0;
1801 
1802 /*
1803  * These bit fields are defined by the Intel Application Note AP-485
1804  * "Intel Processor Identification and the CPUID Instruction"
1805  */
1806 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1807 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1808 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1809 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1810 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1811 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1812 
1813 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1814 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1815 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1816 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1817 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1818 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1819 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1820 
1821 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1822 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1823 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1824 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1825 
1826 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1827 #define	CPI_XMAXEAX_MAX		0x80000100
1828 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1829 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1830 
1831 /*
1832  * Function 4 (Deterministic Cache Parameters) macros
1833  * Defined by Intel Application Note AP-485
1834  */
1835 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1836 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1837 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1838 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1839 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1840 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1841 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1842 
1843 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1844 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1845 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1846 
1847 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1848 
1849 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1850 
1851 
1852 /*
1853  * A couple of shorthand macros to identify "later" P6-family chips
1854  * like the Pentium M and Core.  First, the "older" P6-based stuff
1855  * (loosely defined as "pre-Pentium-4"):
1856  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1857  */
1858 #define	IS_LEGACY_P6(cpi) (			\
1859 	cpi->cpi_family == 6 &&			\
1860 		(cpi->cpi_model == 1 ||		\
1861 		cpi->cpi_model == 3 ||		\
1862 		cpi->cpi_model == 5 ||		\
1863 		cpi->cpi_model == 6 ||		\
1864 		cpi->cpi_model == 7 ||		\
1865 		cpi->cpi_model == 8 ||		\
1866 		cpi->cpi_model == 0xA ||	\
1867 		cpi->cpi_model == 0xB)		\
1868 )
1869 
1870 /* A "new F6" is everything with family 6 that's not the above */
1871 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1872 
1873 /* Extended family/model support */
1874 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1875 	cpi->cpi_family >= 0xf)
1876 
1877 /*
1878  * Info for monitor/mwait idle loop.
1879  *
1880  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1881  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1882  * 2006.
1883  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1884  * Documentation Updates" #33633, Rev 2.05, December 2006.
1885  */
1886 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1887 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extensions supported */
1888 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1889 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1890 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1891 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1892 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1893 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1894 /*
1895  * Number of sub-cstates for a given c-state.
1896  */
1897 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1898 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1899 
1900 /*
1901  * XSAVE leaf 0xD enumeration
1902  */
1903 #define	CPUID_LEAFD_2_YMM_OFFSET	576
1904 #define	CPUID_LEAFD_2_YMM_SIZE		256
1905 
1906 /*
1907  * Common extended leaf names to cut down on typos.
1908  */
1909 #define	CPUID_LEAF_EXT_0		0x80000000
1910 #define	CPUID_LEAF_EXT_8		0x80000008
1911 #define	CPUID_LEAF_EXT_1d		0x8000001d
1912 #define	CPUID_LEAF_EXT_1e		0x8000001e
1913 
1914 /*
1915  * Functions we consume from cpuid_subr.c;  don't publish these in a header
1916  * file to try and keep people using the expected cpuid_* interfaces.
1917  */
1918 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1919 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1920 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1921 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1922 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1923 
1924 /*
1925  * Apply various platform-dependent restrictions where the
1926  * underlying platform restrictions mean the CPU can be marked
1927  * as less capable than its cpuid instruction would imply.
1928  */
1929 #if defined(__xpv)
1930 static void
1931 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1932 {
1933 	switch (eax) {
1934 	case 1: {
1935 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1936 		    0 : CPUID_INTC_EDX_MCA;
1937 		cp->cp_edx &=
1938 		    ~(mcamask |
1939 		    CPUID_INTC_EDX_PSE |
1940 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1941 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1942 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1943 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1944 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1945 		break;
1946 	}
1947 
1948 	case 0x80000001:
1949 		cp->cp_edx &=
1950 		    ~(CPUID_AMD_EDX_PSE |
1951 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1952 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1953 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1954 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1955 		    CPUID_AMD_EDX_TSCP);
1956 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1957 		break;
1958 	default:
1959 		break;
1960 	}
1961 
1962 	switch (vendor) {
1963 	case X86_VENDOR_Intel:
1964 		switch (eax) {
1965 		case 4:
1966 			/*
1967 			 * Zero out the (ncores-per-chip - 1) field
1968 			 */
1969 			cp->cp_eax &= 0x03ffffff;
1970 			break;
1971 		default:
1972 			break;
1973 		}
1974 		break;
1975 	case X86_VENDOR_AMD:
1976 	case X86_VENDOR_HYGON:
1977 		switch (eax) {
1978 
1979 		case 0x80000001:
1980 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1981 			break;
1982 
1983 		case CPUID_LEAF_EXT_8:
1984 			/*
1985 			 * Zero out the (ncores-per-chip - 1) field
1986 			 */
1987 			cp->cp_ecx &= 0xffffff00;
1988 			break;
1989 		default:
1990 			break;
1991 		}
1992 		break;
1993 	default:
1994 		break;
1995 	}
1996 }
1997 #else
1998 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1999 #endif
2000 
2001 /*
2002  *  Some undocumented ways of patching the results of the cpuid
2003  *  instruction to permit running Solaris 10 on future cpus that
2004  *  we don't currently support.  Could be set to non-zero values
2005  *  via settings in eeprom.
2006  */
2007 
2008 uint32_t cpuid_feature_ecx_include;
2009 uint32_t cpuid_feature_ecx_exclude;
2010 uint32_t cpuid_feature_edx_include;
2011 uint32_t cpuid_feature_edx_exclude;
2012 
2013 /*
2014  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2015  */
2016 void
2017 cpuid_alloc_space(cpu_t *cpu)
2018 {
2019 	/*
2020 	 * By convention, cpu0 is the boot cpu, which is set up
2021 	 * before memory allocation is available.  All other cpus get
2022 	 * their cpuid_info struct allocated here.
2023 	 */
2024 	ASSERT(cpu->cpu_id != 0);
2025 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2026 	cpu->cpu_m.mcpu_cpi =
2027 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2028 }
2029 
2030 void
2031 cpuid_free_space(cpu_t *cpu)
2032 {
2033 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2034 	int i;
2035 
2036 	ASSERT(cpi != NULL);
2037 	ASSERT(cpi != &cpuid_info0);
2038 
2039 	/*
2040 	 * Free up any cache leaf related dynamic storage. The first entry was
2041 	 * cached from the standard cpuid storage, so we should not free it.
2042 	 */
2043 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2044 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2045 	if (cpi->cpi_cache_leaf_size > 0)
2046 		kmem_free(cpi->cpi_cache_leaves,
2047 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2048 
2049 	kmem_free(cpi, sizeof (*cpi));
2050 	cpu->cpu_m.mcpu_cpi = NULL;
2051 }
2052 
2053 #if !defined(__xpv)
2054 /*
2055  * Determine the type of the underlying platform. This is used to customize
2056  * initialization of various subsystems (e.g. TSC). determine_platform() must
2057  * only ever be called once to prevent two processors from seeing different
2058  * values of platform_type. Must be called before cpuid_pass_ident(), the
2059  * earliest consumer to execute; the identification pass will call
2060  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2061  */
2062 void
2063 determine_platform(void)
2064 {
2065 	struct cpuid_regs cp;
2066 	uint32_t base;
2067 	uint32_t regs[4];
2068 	char *hvstr = (char *)regs;
2069 
2070 	ASSERT(platform_type == -1);
2071 
2072 	platform_type = HW_NATIVE;
2073 
2074 	if (!enable_platform_detection)
2075 		return;
2076 
2077 	/*
2078 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2079 	 * vendor signature, and set platform type accordingly.
2080 	 *
2081 	 * References:
2082 	 * http://lkml.org/lkml/2008/10/1/246
2083 	 * http://kb.vmware.com/kb/1009458
2084 	 */
2085 	cp.cp_eax = 0x1;
2086 	(void) __cpuid_insn(&cp);
2087 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2088 		cp.cp_eax = 0x40000000;
2089 		(void) __cpuid_insn(&cp);
2090 		regs[0] = cp.cp_ebx;
2091 		regs[1] = cp.cp_ecx;
2092 		regs[2] = cp.cp_edx;
2093 		regs[3] = 0;
2094 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2095 			platform_type = HW_XEN_HVM;
2096 			return;
2097 		}
2098 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2099 			platform_type = HW_VMWARE;
2100 			return;
2101 		}
2102 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2103 			platform_type = HW_KVM;
2104 			return;
2105 		}
2106 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2107 			platform_type = HW_BHYVE;
2108 			return;
2109 		}
2110 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2111 			platform_type = HW_MICROSOFT;
2112 	} else {
2113 		/*
2114 		 * Check older VMware hardware versions. VMware hypervisor is
2115 		 * detected by performing an IN operation to VMware hypervisor
2116 		 * port and checking that the value returned in %ebx is the
2117 		 * VMware hypervisor magic value.
2118 		 *
2119 		 * References: http://kb.vmware.com/kb/1009458
2120 		 */
2121 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2122 		if (regs[1] == VMWARE_HVMAGIC) {
2123 			platform_type = HW_VMWARE;
2124 			return;
2125 		}
2126 	}
2127 
2128 	/*
2129 	 * Check Xen hypervisor. In a fully virtualized domain,
2130 	 * Xen's pseudo-cpuid function returns a string representing the
2131 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2132 	 * supported cpuid function. We need at least a (base + 2) leaf value
2133 	 * to do what we want to do. Try different base values, since the
2134 	 * hypervisor might use a different one depending on whether Hyper-V
2135 	 * emulation is switched on by default or not.
2136 	 */
2137 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2138 		cp.cp_eax = base;
2139 		(void) __cpuid_insn(&cp);
2140 		regs[0] = cp.cp_ebx;
2141 		regs[1] = cp.cp_ecx;
2142 		regs[2] = cp.cp_edx;
2143 		regs[3] = 0;
2144 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2145 		    cp.cp_eax >= (base + 2)) {
2146 			platform_type &= ~HW_NATIVE;
2147 			platform_type |= HW_XEN_HVM;
2148 			return;
2149 		}
2150 	}
2151 }
2152 
2153 int
2154 get_hwenv(void)
2155 {
2156 	ASSERT(platform_type != -1);
2157 	return (platform_type);
2158 }
2159 
2160 int
2161 is_controldom(void)
2162 {
2163 	return (0);
2164 }
2165 
2166 #else
2167 
2168 int
2169 get_hwenv(void)
2170 {
2171 	return (HW_XEN_PV);
2172 }
2173 
2174 int
2175 is_controldom(void)
2176 {
2177 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2178 }
2179 
2180 #endif	/* __xpv */
2181 
2182 /*
2183  * Make sure that we have gathered all of the CPUID leaves that we might need to
2184  * determine topology. We assume that the standard leaf 1 has already been done
2185  * and that xmaxeax has already been calculated.
2186  */
2187 static void
2188 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2189 {
2190 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2191 
2192 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2193 		struct cpuid_regs *cp;
2194 
2195 		cp = &cpi->cpi_extd[8];
2196 		cp->cp_eax = CPUID_LEAF_EXT_8;
2197 		(void) __cpuid_insn(cp);
2198 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2199 	}
2200 
2201 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2202 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2203 		struct cpuid_regs *cp;
2204 
2205 		cp = &cpi->cpi_extd[0x1e];
2206 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2207 		(void) __cpuid_insn(cp);
2208 	}
2209 }
2210 
2211 /*
2212  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2213  * it to everything else. If not, and we're on an AMD system where 8000001e is
2214  * valid, then we use that. Otherwise, we fall back to the default value for the
2215  * APIC ID in leaf 1.
2216  */
2217 static uint32_t
2218 cpuid_gather_apicid(struct cpuid_info *cpi)
2219 {
2220 	/*
2221 	 * Leaf B changes based on the arguments to it. Because we don't cache
2222 	 * it, we need to gather it again.
2223 	 */
2224 	if (cpi->cpi_maxeax >= 0xB) {
2225 		struct cpuid_regs regs;
2226 		struct cpuid_regs *cp;
2227 
2228 		cp = &regs;
2229 		cp->cp_eax = 0xB;
2230 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2231 		(void) __cpuid_insn(cp);
2232 
2233 		if (cp->cp_ebx != 0) {
2234 			return (cp->cp_edx);
2235 		}
2236 	}
2237 
2238 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2239 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2240 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2241 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2242 		return (cpi->cpi_extd[0x1e].cp_eax);
2243 	}
2244 
2245 	return (CPI_APIC_ID(cpi));
2246 }
2247 
2248 /*
2249  * For AMD processors, attempt to calculate the number of chips and cores that
2250  * exist. The way that we do this varies based on the generation, because the
2251  * generations themselves have changed dramatically.
2252  *
2253  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2254  * However, with the advent of family 17h (Zen) it actually tells us the number
2255  * of threads, so we need to look at leaf 0x8000001e if available to determine
2256  * its value. Otherwise, for all prior families, the number of enabled cores is
2257  * the same as threads.
2258  *
2259  * If we do not have leaf 0x80000008, then we assume that this processor does
2260  * not have anything. AMD's older CPUID specification says there's no reason to
2261  * fall back to leaf 1.
2262  *
2263  * In some virtualization cases we will not have leaf 8000001e or it will be
2264  * zero. When that happens we assume the number of threads is one.
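 *
 * As a purely illustrative worked example: if leaf 0x80000008 %ecx[7:0] reads
 * 15, nthreads comes out as 16; if leaf 0x8000001e %ebx[15:8] then reads 1,
 * there are two threads per core, and so *ncores ends up as 16 / 2 = 8.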
2265  */
2266 static void
2267 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2268 {
2269 	uint_t nthreads, nthread_per_core;
2270 
2271 	nthreads = nthread_per_core = 1;
2272 
2273 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2274 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2275 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2276 		nthreads = CPI_CPU_COUNT(cpi);
2277 	}
2278 
2279 	/*
2280 	 * For us to have SMT threads, and know about it, we have to be at least at
2281 	 * family 17h and have the cpuid bit that says we have extended
2282 	 * topology.
2283 	 */
2284 	if (cpi->cpi_family >= 0x17 &&
2285 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2286 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2287 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2288 	}
2289 
2290 	*ncpus = nthreads;
2291 	*ncores = nthreads / nthread_per_core;
2292 }
2293 
2294 /*
2295  * Seed the initial values for the cores and threads for an Intel based
2296  * processor. These values will be overwritten if we detect that the processor
2297  * supports CPUID leaf 0xb.
2298  */
2299 static void
2300 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2301 {
2302 	/*
2303 	 * Only seed the number of physical cores from the first level leaf 4
2304 	 * information. The number of threads there indicates how many share the
2305 	 * L1 cache, which may or may not have anything to do with the number of
2306 	 * logical CPUs per core.
2307 	 */
2308 	if (cpi->cpi_maxeax >= 4) {
2309 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2310 	} else {
2311 		*ncores = 1;
2312 	}
2313 
2314 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2315 		*ncpus = CPI_CPU_COUNT(cpi);
2316 	} else {
2317 		*ncpus = *ncores;
2318 	}
2319 }
2320 
2321 static boolean_t
2322 cpuid_leafB_getids(cpu_t *cpu)
2323 {
2324 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2325 	struct cpuid_regs regs;
2326 	struct cpuid_regs *cp;
2327 
2328 	if (cpi->cpi_maxeax < 0xB)
2329 		return (B_FALSE);
2330 
2331 	cp = &regs;
2332 	cp->cp_eax = 0xB;
2333 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2334 
2335 	(void) __cpuid_insn(cp);
2336 
2337 	/*
2338 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2339 	 * indicates that the extended topology enumeration leaf is
2340 	 * available.
2341 	 */
2342 	if (cp->cp_ebx != 0) {
2343 		uint32_t x2apic_id = 0;
2344 		uint_t coreid_shift = 0;
2345 		uint_t ncpu_per_core = 1;
2346 		uint_t chipid_shift = 0;
2347 		uint_t ncpu_per_chip = 1;
2348 		uint_t i;
2349 		uint_t level;
2350 
2351 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2352 			cp->cp_eax = 0xB;
2353 			cp->cp_ecx = i;
2354 
2355 			(void) __cpuid_insn(cp);
2356 			level = CPI_CPU_LEVEL_TYPE(cp);
2357 
2358 			if (level == 1) {
2359 				x2apic_id = cp->cp_edx;
2360 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2361 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2362 			} else if (level == 2) {
2363 				x2apic_id = cp->cp_edx;
2364 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2365 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2366 			}
2367 		}
2368 
2369 		/*
2370 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2371 		 */
2372 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2373 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2374 		    ncpu_per_core;
2375 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2376 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2377 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2378 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2379 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2380 		cpi->cpi_compunitid = cpi->cpi_coreid;
2381 
2382 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2383 			cpi->cpi_nthread_bits = coreid_shift;
2384 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2385 		}
2386 
2387 		return (B_TRUE);
2388 	} else {
2389 		return (B_FALSE);
2390 	}
2391 }
2392 
2393 static void
2394 cpuid_intel_getids(cpu_t *cpu, void *feature)
2395 {
2396 	uint_t i;
2397 	uint_t chipid_shift = 0;
2398 	uint_t coreid_shift = 0;
2399 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2400 
2401 	/*
2402 	 * There are no compute units or processor nodes currently on Intel.
2403 	 * Always set these to one.
2404 	 */
2405 	cpi->cpi_procnodes_per_pkg = 1;
2406 	cpi->cpi_cores_per_compunit = 1;
2407 
2408 	/*
2409 	 * If cpuid Leaf B is present, use that to try and get this information.
2410 	 * It will be the most accurate for Intel CPUs.
2411 	 */
2412 	if (cpuid_leafB_getids(cpu))
2413 		return;
2414 
2415 	/*
2416 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2417 	 * and ncore_per_chip. These are upper bounds on the IDs that we need to
2418 	 * cover in the system. Therefore, we use those values to seed the number
2419 	 * of bits needed to cover that information in the case when leaf B is
2420 	 * not available. These values will probably
2421 	 * be larger than required, but that's OK.
2422 	 */
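	/*
	 * ddi_fls() returns the one-based index of the most significant bit
	 * set, which gives us at least as many bits as there are IDs to
	 * represent (one more than strictly necessary when the count is an
	 * exact power of two).
	 */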
2423 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2424 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2425 
2426 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2427 		chipid_shift++;
2428 
2429 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2430 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2431 
2432 	if (is_x86_feature(feature, X86FSET_CMP)) {
2433 		/*
2434 		 * Multi-core (and possibly multi-threaded)
2435 		 * processors.
2436 		 */
2437 		uint_t ncpu_per_core = 0;
2438 
2439 		if (cpi->cpi_ncore_per_chip == 1)
2440 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2441 		else if (cpi->cpi_ncore_per_chip > 1)
2442 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2443 			    cpi->cpi_ncore_per_chip;
2444 		/*
2445 		 * 8-bit APIC IDs on dual-core Pentiums
2446 		 * look like this:
2447 		 *
2448 		 * +-----------------------+------+------+
2449 		 * | Physical Package ID   |  MC  |  HT  |
2450 		 * +-----------------------+------+------+
2451 		 * <------- chipid -------->
2452 		 * <------- coreid --------------->
2453 		 *			   <--- clogid -->
2454 		 *			   <------>
2455 		 *			   pkgcoreid
2456 		 *
2457 		 * Where the number of bits necessary to
2458 		 * represent MC and HT fields together equals
2459 		 * the minimum number of bits necessary to
2460 		 * store the value of cpi->cpi_ncpu_per_chip.
2461 		 * Of those bits, the MC part uses the number
2462 		 * of bits necessary to store the value of
2463 		 * cpi->cpi_ncore_per_chip.
2464 		 */
2465 		for (i = 1; i < ncpu_per_core; i <<= 1)
2466 			coreid_shift++;
2467 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2468 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2469 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2470 		/*
2471 		 * Single-core multi-threaded processors.
2472 		 */
2473 		cpi->cpi_coreid = cpi->cpi_chipid;
2474 		cpi->cpi_pkgcoreid = 0;
2475 	} else {
2476 		/*
2477 		 * Single-core single-thread processors.
2478 		 */
2479 		cpi->cpi_coreid = cpu->cpu_id;
2480 		cpi->cpi_pkgcoreid = 0;
2481 	}
2482 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2483 	cpi->cpi_compunitid = cpi->cpi_coreid;
2484 }
2485 
2486 /*
2487  * Historically, AMD has had CMP chips with only a single thread per core.
2488  * However, starting in family 17h (Zen), this has changed and they now have
2489  * multiple threads. Our internal core id needs to be a unique value.
2490  *
2491  * To determine the core id of an AMD system, if we're from a family before 17h,
2492  * then we just use the cpu id, as that gives us a good value that will be
2493  * unique for each core. If instead, we're on family 17h or later, then we need
2494  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2495 	 * how many threads share a core. Based on that, we'll shift the APIC ID.
2496  * We can't use the normal core id in that leaf as it's only unique within the
2497  * socket, which is perfect for cpi_pkgcoreid, but not us.
2498  */
2499 static id_t
2500 cpuid_amd_get_coreid(cpu_t *cpu)
2501 {
2502 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2503 
2504 	if (cpi->cpi_family >= 0x17 &&
2505 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2506 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2507 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
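		/*
		 * With two threads per core, the low bit of the APIC ID
		 * distinguishes the two threads, so shifting it off yields a
		 * value that is unique per core.
		 */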
2508 		if (nthreads > 1) {
2509 			VERIFY3U(nthreads, ==, 2);
2510 			return (cpi->cpi_apicid >> 1);
2511 		}
2512 	}
2513 
2514 	return (cpu->cpu_id);
2515 }
2516 
2517 /*
2518  * Computing IDs on AMD is more challenging. This is notable because of the
2519  * following two facts:
2520  *
2521  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2522  *     also no way to get an actual unique core id from the system. As such, we
2523  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2524  *     however, guarantee that sibling cores of a chip will have sequential
2525  *     coreids starting at a multiple of the number of cores per chip - that is
2526  *     usually the case, but if the APIC IDs have been set up in a different
2527  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2528  *
2529  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2530  *     called compute units. These compute units share the L1I cache, L2 cache,
2531  *     and the FPU. To deal with this, a new topology leaf was added in
2532  *     0x8000001e. However, parts of this leaf have different meanings
2533  *     once we get to family 0x17.
2534  */
2535 
2536 static void
2537 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2538 {
2539 	int i, first_half, coreidsz;
2540 	uint32_t nb_caps_reg;
2541 	uint_t node2_1;
2542 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2543 	struct cpuid_regs *cp;
2544 
2545 	/*
2546 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2547 	 * hasn't been stripped by virtualization). We always set the compute
2548 	 * unit id to the same value. Also, initialize the default number of
2549 	 * cores per compute unit and nodes per package. This will be
2550 	 * overwritten when we know information about a particular family.
2551 	 */
2552 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2553 	cpi->cpi_compunitid = cpi->cpi_coreid;
2554 	cpi->cpi_cores_per_compunit = 1;
2555 	cpi->cpi_procnodes_per_pkg = 1;
2556 
2557 	/*
2558 	 * To construct the logical ID, we need to determine how many APIC IDs
2559 	 * are dedicated to the cores and threads. This is provided for us in
2560 	 * 0x80000008. However, if it's not present (say due to virtualization),
2561 	 * then we assume it's one. This should be present on all 64-bit AMD
2562 	 * processors.  It was added in family 0xf (Hammer).
2563 	 */
2564 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
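		/*
		 * %ecx bits 15:12 of leaf 0x80000008 (ApicIdCoreIdSize) give
		 * the number of low-order APIC ID bits used to identify a
		 * core; a value of zero means we must fall back to deriving
		 * it from the core count.
		 */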
2565 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2566 
2567 		/*
2568 		 * In AMD parlance chip is really a node while illumos
2569 		 * uses chip as equivalent to socket/package.
2570 		 */
2571 		if (coreidsz == 0) {
2572 			/* Use legacy method */
2573 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2574 				coreidsz++;
2575 			if (coreidsz == 0)
2576 				coreidsz = 1;
2577 		}
2578 	} else {
2579 		/* Assume single-core part */
2580 		coreidsz = 1;
2581 	}
2582 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2583 
2584 	/*
2585 	 * The package core ID varies depending on the family. While it may be
2586 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2587 	 * this value is the core id in the given node. For non-virtualized
2588 	 * family 17h, we need to take the logical core id and shift off the
2589 	 * threads like we do when getting the core id.  Otherwise, we can use
2590 	 * the clogid as is. When family 17h is virtualized, the clogid is
2591 	 * sufficient on its own: if we don't have valid data in the leaf, then
2592 	 * we won't think we have SMT, in which case the clogid needs no
2593 	 * adjustment.
2594 	 */
2595 	if (cpi->cpi_family >= 0x17 &&
2596 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2597 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2598 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2599 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2600 		if (nthreads > 1) {
2601 			VERIFY3U(nthreads, ==, 2);
2602 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2603 		} else {
2604 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2605 		}
2606 	} else {
2607 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2608 	}
2609 
2610 	/*
2611 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2612 	 * (bulldozer) or newer, then we can derive all of this from leaf
2613 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2614 	 */
2615 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2616 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2617 		cp = &cpi->cpi_extd[0x1e];
2618 
2619 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2620 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2621 
2622 		/*
2623 		 * For Bulldozer-era CPUs, recalculate the compute unit
2624 		 * information.
2625 		 */
2626 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2627 			cpi->cpi_cores_per_compunit =
2628 			    BITX(cp->cp_ebx, 15, 8) + 1;
2629 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2630 			    (cpi->cpi_ncore_per_chip /
2631 			    cpi->cpi_cores_per_compunit) *
2632 			    (cpi->cpi_procnodeid /
2633 			    cpi->cpi_procnodes_per_pkg);
2634 		}
2635 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2636 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2637 	} else if (cpi->cpi_family == 0x10) {
2638 		/*
2639 		 * See if we are a multi-node processor.
2640 		 * All processors in the system have the same number of nodes.
2641 		 */
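		/*
		 * Read the Northbridge Capabilities register for node 0
		 * (bus 0, device 0x18, function 3, offset 0xe8); bit 29
		 * indicates a multi-node processor.
		 */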
2642 		nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2643 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2644 			/* Single-node */
2645 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2646 			    coreidsz);
2647 		} else {
2648 
2649 			/*
2650 			 * Multi-node revision D (2 nodes per package
2651 			 * are supported)
2652 			 */
2653 			cpi->cpi_procnodes_per_pkg = 2;
2654 
2655 			first_half = (cpi->cpi_pkgcoreid <=
2656 			    (cpi->cpi_ncore_per_chip/2 - 1));
2657 
2658 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2659 				/* We are BSP */
2660 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2661 			} else {
2662 
2663 				/* We are AP */
2664 				/* NodeId[2:1] bits to use for reading F3xe8 */
2665 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2666 
2667 				nb_caps_reg =
2668 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2669 
2670 				/*
2671 				 * Check IntNodeNum bit (31:30, but bit 31 is
2672 				 * always 0 on dual-node processors)
2673 				 */
2674 				if (BITX(nb_caps_reg, 30, 30) == 0)
2675 					cpi->cpi_procnodeid = node2_1 +
2676 					    !first_half;
2677 				else
2678 					cpi->cpi_procnodeid = node2_1 +
2679 					    first_half;
2680 			}
2681 		}
2682 	} else {
2683 		cpi->cpi_procnodeid = 0;
2684 	}
2685 
2686 	cpi->cpi_chipid =
2687 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2688 
2689 	cpi->cpi_ncore_bits = coreidsz;
2690 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2691 	    cpi->cpi_ncore_per_chip);
2692 }
2693 
2694 static void
2695 spec_uarch_flush_noop(void)
2696 {
2697 }
2698 
2699 /*
2700  * When microcode that mitigates MDS is present, this wrmsr will also flush
2701  * the MDS-related micro-architectural state that would otherwise be flushed
2702  * by calling x86_md_clear().
2703  */
2704 static void
2705 spec_uarch_flush_msr(void)
2706 {
2707 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2708 }
2709 
2710 /*
2711  * This function points to a function that will flush certain
2712  * micro-architectural state on the processor. This flush is used to mitigate
2713  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2714  * function can point to one of three functions:
2715  *
2716  * - A noop, which is used either because we are vulnerable but do not have
2717  *   microcode available to help deal with a fix, or because we aren't
2718  *   vulnerable at all.
2719  *
2720  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2721  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2722  *   however, it only flushes the MDS related micro-architectural state on the
2723  *   current hyperthread, it does not do anything for the twin.
2724  *
2725  * - x86_md_clear which will flush the MDS related state. This is done when we
2726  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2727  *   (RDCL_NO is set).
2728  */
2729 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2730 
2731 static void
2732 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2733 {
2734 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2735 
2736 	/*
2737 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2738 	 * has been fixed in hardware, it doesn't cover everything related to
2739 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2740 	 * need to mitigate this.
2741 	 */
2742 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2743 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2744 		return;
2745 	}
2746 
2747 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
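		/*
		 * The CPU supports flushing this state via verw (MD_CLEAR),
		 * so patch the entry point of x86_md_clear() with a nop to
		 * enable the flush it implements.
		 */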
2748 		const uint8_t nop = NOP_INSTR;
2749 		uint8_t *md = (uint8_t *)x86_md_clear;
2750 
2751 		*md = nop;
2752 	}
2753 
2754 	membar_producer();
2755 }
2756 
2757 static void
2758 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2759 {
2760 	boolean_t need_l1d, need_mds;
2761 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2762 
2763 	/*
2764 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2765 	 * hardware, then there's nothing left for us to do for enabling the
2766 	 * flush. We can also go ahead and say that SMT exclusion is
2767 	 * unnecessary.
2768 	 */
2769 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2770 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2771 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2772 		extern int smt_exclusion;
2773 		smt_exclusion = 0;
2774 		spec_uarch_flush = spec_uarch_flush_noop;
2775 		membar_producer();
2776 		return;
2777 	}
2778 
2779 	/*
2780 	 * The locations where we need to perform an L1D flush are required both
2781 	 * for mitigating L1TF and MDS. When verw support is present in
2782 	 * microcode, then the L1D flush will take care of doing that as well.
2783 	 * However, if we have a system where RDCL_NO is present, but we don't
2784 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2785 	 * L1D flush.
2786 	 */
2787 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2788 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2789 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2790 		need_l1d = B_TRUE;
2791 	} else {
2792 		need_l1d = B_FALSE;
2793 	}
2794 
2795 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2796 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2797 		need_mds = B_TRUE;
2798 	} else {
2799 		need_mds = B_FALSE;
2800 	}
2801 
2802 	if (need_l1d) {
2803 		spec_uarch_flush = spec_uarch_flush_msr;
2804 	} else if (need_mds) {
2805 		spec_uarch_flush = x86_md_clear;
2806 	} else {
2807 		/*
2808 		 * We have no hardware mitigations available to us.
2809 		 */
2810 		spec_uarch_flush = spec_uarch_flush_noop;
2811 	}
2812 	membar_producer();
2813 }
2814 
2815 /*
2816  * We default to enabling RSB mitigations.
2817  *
2818  * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2819  * post-barrier RSB guessing suggest we should enable RSB mitigations always
2820  * unless specifically instructed not to.
2821  */
2822 static void
2823 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2824 {
2825 	const uint8_t ret = RET_INSTR;
2826 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2827 
2828 	switch (mit) {
2829 	case X86_SPECTREV2_DISABLED:
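		/*
		 * Mitigations are disabled, so turn x86_rsb_stuff() into an
		 * immediate return by patching its first byte.
		 */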
2830 		*stuff = ret;
2831 		break;
2832 	default:
2833 		break;
2834 	}
2835 }
2836 
2837 static void
2838 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2839 {
2840 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2841 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2842 	    "_r14", "_r15" };
2843 	const uint_t nthunks = ARRAY_SIZE(thunks);
2844 	const char *type;
2845 	uint_t i;
2846 
2847 	if (mit == x86_spectrev2_mitigation)
2848 		return;
2849 
2850 	switch (mit) {
2851 	case X86_SPECTREV2_RETPOLINE:
2852 		type = "gen";
2853 		break;
2854 	case X86_SPECTREV2_ENHANCED_IBRS:
2855 	case X86_SPECTREV2_DISABLED:
2856 		type = "jmp";
2857 		break;
2858 	default:
2859 		panic("asked to updated retpoline state with unknown state!");
2860 	}
2861 
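	/*
	 * Copy the selected thunk variant over each of the generic
	 * __x86_indirect_thunk_<reg> entry points; the VERIFY3S() below
	 * ensures the destination is large enough to hold the source.
	 */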
2862 	for (i = 0; i < nthunks; i++) {
2863 		uintptr_t source, dest;
2864 		int ssize, dsize;
2865 		char sourcebuf[64], destbuf[64];
2866 
2867 		(void) snprintf(destbuf, sizeof (destbuf),
2868 		    "__x86_indirect_thunk%s", thunks[i]);
2869 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2870 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2871 
2872 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2873 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2874 		VERIFY3U(source, !=, 0);
2875 		VERIFY3U(dest, !=, 0);
2876 		VERIFY3S(dsize, >=, ssize);
2877 		bcopy((void *)source, (void *)dest, ssize);
2878 	}
2879 }
2880 
2881 static void
2882 cpuid_enable_enhanced_ibrs(void)
2883 {
2884 	uint64_t val;
2885 
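	/*
	 * With enhanced IBRS, setting IA32_SPEC_CTRL.IBRS once on each
	 * logical CPU is sufficient; it does not need to be rewritten around
	 * privilege transitions.
	 */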
2886 	val = rdmsr(MSR_IA32_SPEC_CTRL);
2887 	val |= IA32_SPEC_CTRL_IBRS;
2888 	wrmsr(MSR_IA32_SPEC_CTRL, val);
2889 }
2890 
2891 /*
2892  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2893  * we can disable TSX, we do so.
2894  *
2895  * This determination is done only on the boot CPU, potentially after loading
2896  * updated microcode.
2897  */
2898 static void
2899 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2900 {
2901 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2902 
2903 	VERIFY(cpu->cpu_id == 0);
2904 
2905 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2906 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2907 		return;
2908 	}
2909 
2910 	if (x86_disable_taa) {
2911 		x86_taa_mitigation = X86_TAA_DISABLED;
2912 		return;
2913 	}
2914 
2915 	/*
2916 	 * If we do not have the ability to disable TSX, then our only
2917 	 * mitigation options are in hardware (TAA_NO), or by using our existing
2918 	 * MDS mitigation as described above.  The latter relies upon us having
2919 	 * configured MDS mitigations correctly! This includes disabling SMT if
2920  * we want cross-CPU-thread protection.
2921 	 */
2922 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2923 		/*
2924 		 * It's not clear whether any parts will enumerate TAA_NO
2925 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
2926 		 */
2927 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2928 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2929 			return;
2930 		}
2931 
2932 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2933 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2934 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2935 		} else {
2936 			x86_taa_mitigation = X86_TAA_NOTHING;
2937 		}
2938 		return;
2939 	}
2940 
2941 	/*
2942 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2943 	 * enough in boot.
2944 	 *
2945 	 * Otherwise, we'll fall back to causing transactions to abort as our
2946 	 * mitigation. TSX-using code will always take the fallback path.
2947 	 */
2948 	if (cpi->cpi_pass < 4) {
2949 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2950 	} else {
2951 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2952 	}
2953 }
2954 
2955 /*
2956  * As mentioned, we should only touch the MSR when we've got suitable
2957  * microcode loaded on this CPU.
2958  */
2959 static void
2960 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2961 {
2962 	uint64_t val;
2963 
2964 	switch (taa) {
2965 	case X86_TAA_TSX_DISABLE:
2966 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2967 			return;
2968 		val = rdmsr(MSR_IA32_TSX_CTRL);
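		/*
		 * RTM_DISABLE forces RTM transactions to abort, while
		 * CPUID_CLEAR additionally hides the RTM and HLE CPUID bits
		 * so that software stops using TSX altogether.
		 */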
2969 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2970 		wrmsr(MSR_IA32_TSX_CTRL, val);
2971 		break;
2972 	case X86_TAA_TSX_FORCE_ABORT:
2973 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2974 			return;
2975 		val = rdmsr(MSR_IA32_TSX_CTRL);
2976 		val |= IA32_TSX_CTRL_RTM_DISABLE;
2977 		wrmsr(MSR_IA32_TSX_CTRL, val);
2978 		break;
2979 	case X86_TAA_HW_MITIGATED:
2980 	case X86_TAA_MD_CLEAR:
2981 	case X86_TAA_DISABLED:
2982 	case X86_TAA_NOTHING:
2983 		break;
2984 	}
2985 }
2986 
2987 static void
2988 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2989 {
2990 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2991 	x86_spectrev2_mitigation_t v2mit;
2992 
2993 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2994 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2995 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2996 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2997 			add_x86_feature(featureset, X86FSET_IBPB);
2998 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2999 			add_x86_feature(featureset, X86FSET_IBRS);
3000 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3001 			add_x86_feature(featureset, X86FSET_STIBP);
3002 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3003 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3004 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3005 			add_x86_feature(featureset, X86FSET_SSBD);
3006 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3007 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3008 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3009 			add_x86_feature(featureset, X86FSET_SSB_NO);
3010 		/*
3011 		 * Don't enable enhanced IBRS unless we're told that we should
3012 		 * prefer it and it has the same semantics as Intel. This is
3013 		 * split into two bits rather than a single one.
3014 		 */
3015 		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3016 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
3017 			add_x86_feature(featureset, X86FSET_IBRS_ALL);
3018 		}
3019 
3020 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3021 	    cpi->cpi_maxeax >= 7) {
3022 		struct cpuid_regs *ecp;
3023 		ecp = &cpi->cpi_std[7];
3024 
3025 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3026 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3027 		}
3028 
3029 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3030 			add_x86_feature(featureset, X86FSET_IBRS);
3031 			add_x86_feature(featureset, X86FSET_IBPB);
3032 		}
3033 
3034 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3035 			add_x86_feature(featureset, X86FSET_STIBP);
3036 		}
3037 
3038 		/*
3039 		 * Don't read the arch caps MSR on xpv where we lack the
3040 		 * on_trap().
3041 		 */
3042 #ifndef __xpv
3043 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3044 			on_trap_data_t otd;
3045 
3046 			/*
3047 			 * Be paranoid and assume we'll get a #GP.
3048 			 */
3049 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3050 				uint64_t reg;
3051 
3052 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3053 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3054 					add_x86_feature(featureset,
3055 					    X86FSET_RDCL_NO);
3056 				}
3057 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3058 					add_x86_feature(featureset,
3059 					    X86FSET_IBRS_ALL);
3060 				}
3061 				if (reg & IA32_ARCH_CAP_RSBA) {
3062 					add_x86_feature(featureset,
3063 					    X86FSET_RSBA);
3064 				}
3065 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3066 					add_x86_feature(featureset,
3067 					    X86FSET_L1D_VM_NO);
3068 				}
3069 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3070 					add_x86_feature(featureset,
3071 					    X86FSET_SSB_NO);
3072 				}
3073 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3074 					add_x86_feature(featureset,
3075 					    X86FSET_MDS_NO);
3076 				}
3077 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3078 					add_x86_feature(featureset,
3079 					    X86FSET_TSX_CTRL);
3080 				}
3081 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3082 					add_x86_feature(featureset,
3083 					    X86FSET_TAA_NO);
3084 				}
3085 			}
3086 			no_trap();
3087 		}
3088 #endif	/* !__xpv */
3089 
3090 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3091 			add_x86_feature(featureset, X86FSET_SSBD);
3092 
3093 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3094 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3095 	}
3096 
3097 	/*
3098 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3099 	 * will have already run this function and determined what we need to
3100 	 * do. This gives us a hook for per-HW thread mitigations such as
3101 	 * enhanced IBRS, or disabling TSX.
3102 	 */
3103 	if (cpu->cpu_id != 0) {
3104 		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
3105 			cpuid_enable_enhanced_ibrs();
3106 		}
3107 
3108 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3109 		return;
3110 	}
3111 
3112 	/*
3113 	 * Go through and initialize various security mechanisms that we should
3114 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3115 	 * TAA.
3116 	 */
3117 
3118 	/*
3119 	 * By default we've come in with retpolines enabled. Check whether we
3120 	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3121 	 * by default and only disabled when mitigations are off. Note, we do
3122 	 * not allow the use of AMD optimized retpolines as it was disclosed by
3123 	 * AMD in March 2022 that they were still vulnerable. Prior to that
3124 	 * point, we used them.
3125 	 */
3126 	if (x86_disable_spectrev2 != 0) {
3127 		v2mit = X86_SPECTREV2_DISABLED;
3128 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3129 		cpuid_enable_enhanced_ibrs();
3130 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3131 	} else {
3132 		v2mit = X86_SPECTREV2_RETPOLINE;
3133 	}
3134 
3135 	cpuid_patch_retpolines(v2mit);
3136 	cpuid_patch_rsb(v2mit);
3137 	x86_spectrev2_mitigation = v2mit;
3138 	membar_producer();
3139 
3140 	/*
3141 	 * We need to determine what changes are required for mitigating L1TF
3142 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3143 	 * is required.
3144 	 *
3145 	 * If any of these are present, then we need to flush u-arch state at
3146 	 * various points. For MDS, we need to do so whenever we change to a
3147 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3148 	 * flush the L1D cache at VM entry. When we have microcode that handles
3149 	 * MDS, the L1D flush also clears the other u-arch state that the
3150 	 * md_clear does.
3151 	 */
3152 
3153 	/*
3154 	 * Update whether or not we need to be taking explicit action against
3155 	 * MDS.
3156 	 */
3157 	cpuid_update_md_clear(cpu, featureset);
3158 
3159 	/*
3160 	 * Determine whether SMT exclusion is required and whether or not we
3161 	 * need to perform an l1d flush.
3162 	 */
3163 	cpuid_update_l1d_flush(cpu, featureset);
3164 
3165 	/*
3166 	 * Determine what our mitigation strategy should be for TAA and then
3167 	 * also apply TAA mitigations.
3168 	 */
3169 	cpuid_update_tsx(cpu, featureset);
3170 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3171 }
3172 
3173 /*
3174  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3175  */
3176 void
3177 setup_xfem(void)
3178 {
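	/* Bit zero of XCR0, legacy x87 state, must always be set. */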
3179 	uint64_t flags = XFEATURE_LEGACY_FP;
3180 
3181 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3182 
3183 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3184 		flags |= XFEATURE_SSE;
3185 
3186 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3187 		flags |= XFEATURE_AVX;
3188 
3189 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3190 		flags |= XFEATURE_AVX512;
3191 
3192 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3193 
3194 	xsave_bv_all = flags;
3195 }
3196 
3197 static void
3198 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3199 {
3200 	struct cpuid_info *cpi;
3201 
3202 	cpi = cpu->cpu_m.mcpu_cpi;
3203 
3204 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3205 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3206 		cpuid_gather_amd_topology_leaves(cpu);
3207 	}
3208 
3209 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3210 
3211 	/*
3212 	 * Before we can calculate the IDs that we should assign to this
3213 	 * processor, we need to understand how many cores and threads it has.
3214 	 */
3215 	switch (cpi->cpi_vendor) {
3216 	case X86_VENDOR_Intel:
3217 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3218 		    &cpi->cpi_ncore_per_chip);
3219 		break;
3220 	case X86_VENDOR_AMD:
3221 	case X86_VENDOR_HYGON:
3222 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3223 		    &cpi->cpi_ncore_per_chip);
3224 		break;
3225 	default:
3226 		/*
3227 		 * If we have some other x86-compatible chip, it's not clear how
3228 		 * it would behave. The most common case is virtualization
3229 		 * today, though there are also 64-bit VIA chips. Assume that
3230 		 * all we can get is the basic Leaf 1 HTT information.
3231 		 */
3232 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3233 			cpi->cpi_ncore_per_chip = 1;
3234 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3235 		}
3236 		break;
3237 	}
3238 
3239 	/*
3240 	 * Based on the calculated number of threads and cores, potentially
3241 	 * assign the HTT and CMT features.
3242 	 */
3243 	if (cpi->cpi_ncore_per_chip > 1) {
3244 		add_x86_feature(featureset, X86FSET_CMP);
3245 	}
3246 
3247 	if (cpi->cpi_ncpu_per_chip > 1 &&
3248 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3249 		add_x86_feature(featureset, X86FSET_HTT);
3250 	}
3251 
3252 	/*
3253 	 * Now that this has been set up, we need to go through and calculate all of
3254 	 * the rest of the parameters that exist. If we think the CPU doesn't
3255 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3256 	 * up information in some way. The most likely case for this is
3257 	 * virtualization where we have a lot of partial topology information.
3258 	 */
3259 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3260 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3261 		/*
3262 		 * This is a single core, single-threaded processor.
3263 		 */
3264 		cpi->cpi_procnodes_per_pkg = 1;
3265 		cpi->cpi_cores_per_compunit = 1;
3266 		cpi->cpi_compunitid = 0;
3267 		cpi->cpi_chipid = -1;
3268 		cpi->cpi_clogid = 0;
3269 		cpi->cpi_coreid = cpu->cpu_id;
3270 		cpi->cpi_pkgcoreid = 0;
3271 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3272 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3273 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3274 		} else {
3275 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3276 		}
3277 	} else {
3278 		switch (cpi->cpi_vendor) {
3279 		case X86_VENDOR_Intel:
3280 			cpuid_intel_getids(cpu, featureset);
3281 			break;
3282 		case X86_VENDOR_AMD:
3283 		case X86_VENDOR_HYGON:
3284 			cpuid_amd_getids(cpu, featureset);
3285 			break;
3286 		default:
3287 			/*
3288 			 * In this case, it's hard to say what we should do.
3289 			 * We're going to model them to the OS as single-threaded
3290 			 * cores. We don't have a good identifier for them, so
3291 			 * we're just going to use the cpu id and place them all
3292 			 * on a single chip.
3293 			 *
3294 			 * This case has historically been different from the
3295 			 * case above where we don't have HTT or CMP. While they
3296 			 * could be combined, we've opted to keep it separate to
3297 			 * minimize the risk of topology changes in weird cases.
3298 			 */
3299 			cpi->cpi_procnodes_per_pkg = 1;
3300 			cpi->cpi_cores_per_compunit = 1;
3301 			cpi->cpi_chipid = 0;
3302 			cpi->cpi_coreid = cpu->cpu_id;
3303 			cpi->cpi_clogid = cpu->cpu_id;
3304 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3305 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3306 			cpi->cpi_compunitid = cpi->cpi_coreid;
3307 			break;
3308 		}
3309 	}
3310 }
3311 
3312 /*
3313  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3314  * always gather leaf 6 if it's supported; however, we only look for features on
3315  * Intel systems as AMD does not currently define any of the features we look
3316  * for below.
3317  */
3318 static void
3319 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3320 {
3321 	struct cpuid_regs *cp;
3322 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3323 
3324 	if (cpi->cpi_maxeax < 6) {
3325 		return;
3326 	}
3327 
3328 	cp = &cpi->cpi_std[6];
3329 	cp->cp_eax = 6;
3330 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3331 	(void) __cpuid_insn(cp);
3332 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3333 
3334 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3335 		return;
3336 	}
3337 
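	/*
	 * Leaf 6 %eax advertises, among other things, the per-core digital
	 * thermal sensor (DTS) and package thermal management (PTM)
	 * capabilities that we expose as features below.
	 */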
3338 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3339 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3340 	}
3341 
3342 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3343 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3344 	}
3345 }
3346 
3347 /*
3348  * PPIN is the protected processor inventory number. On AMD this is an actual
3349  * feature bit. However, on Intel systems we need to read the platform
3350  * information MSR, and only specific models support it.
3351  */
3352 #if !defined(__xpv)
3353 static void
3354 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3355 {
3356 	on_trap_data_t otd;
3357 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3358 
3359 	switch (cpi->cpi_vendor) {
3360 	case X86_VENDOR_AMD:
3361 		/*
3362 		 * This leaf will have already been gathered in the topology
3363 		 * functions.
3364 		 */
3365 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3366 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3367 				add_x86_feature(featureset, X86FSET_PPIN);
3368 			}
3369 		}
3370 		break;
3371 	case X86_VENDOR_Intel:
3372 		if (cpi->cpi_family != 6)
3373 			break;
3374 		switch (cpi->cpi_model) {
3375 		case INTC_MODEL_IVYBRIDGE_XEON:
3376 		case INTC_MODEL_HASWELL_XEON:
3377 		case INTC_MODEL_BROADWELL_XEON:
3378 		case INTC_MODEL_BROADWELL_XEON_D:
3379 		case INTC_MODEL_SKYLAKE_XEON:
3380 		case INTC_MODEL_ICELAKE_XEON:
3381 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3382 				uint64_t value;
3383 
3384 				value = rdmsr(MSR_PLATFORM_INFO);
3385 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3386 					add_x86_feature(featureset,
3387 					    X86FSET_PPIN);
3388 				}
3389 			}
3390 			no_trap();
3391 			break;
3392 		default:
3393 			break;
3394 		}
3395 		break;
3396 	default:
3397 		break;
3398 	}
3399 }
3400 #endif	/* ! __xpv */
3401 
3402 static void
3403 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3404 {
3405 	uchar_t *featureset = (uchar_t *)arg;
3406 
3407 	/*
3408 	 * We don't run on any processor that doesn't have cpuid; we could not
3409 	 * possibly have arrived here without it.
3410 	 */
3411 	add_x86_feature(featureset, X86FSET_CPUID);
3412 }
3413 
3414 static void
3415 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3416 {
3417 	struct cpuid_info *cpi;
3418 	struct cpuid_regs *cp;
3419 
3420 	/*
3421 	 * We require that virtual/native detection be complete and that PCI
3422 	 * config space access has been set up; at present there is no reliable
3423 	 * way to determine the latter.
3424 	 */
3425 	ASSERT3S(platform_type, !=, -1);
3426 
3427 	cpi = cpu->cpu_m.mcpu_cpi;
3428 	ASSERT(cpi != NULL);
3429 
3430 	cp = &cpi->cpi_std[0];
3431 	cp->cp_eax = 0;
3432 	cpi->cpi_maxeax = __cpuid_insn(cp);
3433 	{
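		/*
		 * The 12-byte vendor string is returned in %ebx, %edx, %ecx
		 * order, e.g. "GenuineIntel" or "AuthenticAMD".
		 */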
3434 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3435 		*iptr++ = cp->cp_ebx;
3436 		*iptr++ = cp->cp_edx;
3437 		*iptr++ = cp->cp_ecx;
3438 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3439 	}
3440 
3441 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3442 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3443 
3444 	/*
3445 	 * Limit the range in case of weird hardware
3446 	 */
3447 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3448 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3449 	if (cpi->cpi_maxeax < 1)
3450 		return;
3451 
3452 	cp = &cpi->cpi_std[1];
3453 	cp->cp_eax = 1;
3454 	(void) __cpuid_insn(cp);
3455 
3456 	/*
3457 	 * Extract identifying constants for easy access.
3458 	 */
3459 	cpi->cpi_model = CPI_MODEL(cpi);
3460 	cpi->cpi_family = CPI_FAMILY(cpi);
3461 
3462 	if (cpi->cpi_family == 0xf)
3463 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3464 
3465 	/*
3466 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3467 	 * Intel, and presumably everyone else, uses model == 0xf, as
3468 	 * one would expect (max value means possible overflow).  Sigh.
3469 	 */
3470 
3471 	switch (cpi->cpi_vendor) {
3472 	case X86_VENDOR_Intel:
3473 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3474 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3475 		break;
3476 	case X86_VENDOR_AMD:
3477 		if (CPI_FAMILY(cpi) == 0xf)
3478 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3479 		break;
3480 	case X86_VENDOR_HYGON:
3481 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3482 		break;
3483 	default:
3484 		if (cpi->cpi_model == 0xf)
3485 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3486 		break;
3487 	}
3488 
3489 	cpi->cpi_step = CPI_STEP(cpi);
3490 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3491 
3492 	/*
3493 	 * Synthesize chip "revision" and socket type
3494 	 */
3495 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3496 	    cpi->cpi_model, cpi->cpi_step);
3497 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3498 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3499 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3500 	    cpi->cpi_model, cpi->cpi_step);
3501 }
3502 
3503 static void
3504 cpuid_pass_basic(cpu_t *cpu, void *arg)
3505 {
3506 	uchar_t *featureset = (uchar_t *)arg;
3507 	uint32_t mask_ecx, mask_edx;
3508 	struct cpuid_info *cpi;
3509 	struct cpuid_regs *cp;
3510 	int xcpuid;
3511 #if !defined(__xpv)
3512 	extern int idle_cpu_prefer_mwait;
3513 #endif
3514 
3515 	cpi = cpu->cpu_m.mcpu_cpi;
3516 	ASSERT(cpi != NULL);
3517 
3518 	if (cpi->cpi_maxeax < 1)
3519 		return;
3520 
3521 	/*
3522 	 * This was filled during the identification pass.
3523 	 */
3524 	cp = &cpi->cpi_std[1];
3525 
3526 	/*
3527 	 * *default* assumptions:
3528 	 * - believe %edx feature word
3529 	 * - ignore %ecx feature word
3530 	 * - 32-bit virtual and physical addressing
3531 	 */
3532 	mask_edx = 0xffffffff;
3533 	mask_ecx = 0;
3534 
3535 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3536 
3537 	switch (cpi->cpi_vendor) {
3538 	case X86_VENDOR_Intel:
3539 		if (cpi->cpi_family == 5)
3540 			x86_type = X86_TYPE_P5;
3541 		else if (IS_LEGACY_P6(cpi)) {
3542 			x86_type = X86_TYPE_P6;
3543 			pentiumpro_bug4046376 = 1;
3544 			/*
3545 			 * Clear the SEP bit when it was set erroneously
3546 			 */
3547 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3548 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3549 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3550 			x86_type = X86_TYPE_P4;
3551 			/*
3552 			 * We don't currently depend on any of the %ecx
3553 			 * features until Prescott, so we'll only check
3554 			 * this from P4 onwards.  We might want to revisit
3555 			 * that idea later.
3556 			 */
3557 			mask_ecx = 0xffffffff;
3558 		} else if (cpi->cpi_family > 0xf)
3559 			mask_ecx = 0xffffffff;
3560 		/*
3561 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3562 		 * to obtain the monitor linesize.
3563 		 */
3564 		if (cpi->cpi_maxeax < 5)
3565 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3566 		break;
3567 	case X86_VENDOR_IntelClone:
3568 	default:
3569 		break;
3570 	case X86_VENDOR_AMD:
3571 #if defined(OPTERON_ERRATUM_108)
3572 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3573 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3574 			cpi->cpi_model = 0xc;
3575 		} else
3576 #endif
3577 		if (cpi->cpi_family == 5) {
3578 			/*
3579 			 * AMD K5 and K6
3580 			 *
3581 			 * These CPUs have an incomplete implementation
3582 			 * of MCA/MCE which we mask away.
3583 			 */
3584 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3585 
3586 			/*
3587 			 * Model 0 uses the wrong (APIC) bit
3588 			 * to indicate PGE.  Fix it here.
3589 			 */
3590 			if (cpi->cpi_model == 0) {
3591 				if (cp->cp_edx & 0x200) {
3592 					cp->cp_edx &= ~0x200;
3593 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3594 				}
3595 			}
3596 
3597 			/*
3598 			 * Early models had problems w/ MMX; disable.
3599 			 */
3600 			if (cpi->cpi_model < 6)
3601 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3602 		}
3603 
3604 		/*
3605 		 * For newer families, SSE3 and CX16, at least, are valid;
3606 		 * enable all
3607 		 */
3608 		if (cpi->cpi_family >= 0xf)
3609 			mask_ecx = 0xffffffff;
3610 		/*
3611 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3612 		 * to obtain the monitor linesize.
3613 		 */
3614 		if (cpi->cpi_maxeax < 5)
3615 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3616 
3617 #if !defined(__xpv)
3618 		/*
3619 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3620 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3621 		 * know for certain that in at least family 17h, per AMD, mwait
3622 		 * is preferred. Families in-between are less certain.
3623 		 */
3624 		if (cpi->cpi_family < 0x17) {
3625 			idle_cpu_prefer_mwait = 0;
3626 		}
3627 #endif
3628 
3629 		break;
3630 	case X86_VENDOR_HYGON:
3631 		/* Enable all for Hygon Dhyana CPU */
3632 		mask_ecx = 0xffffffff;
3633 		break;
3634 	case X86_VENDOR_TM:
3635 		/*
3636 		 * workaround the NT workaround in CMS 4.1
3637 		 */
3638 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3639 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3640 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3641 		break;
3642 	case X86_VENDOR_Centaur:
3643 		/*
3644 		 * workaround the NT workarounds again
3645 		 */
3646 		if (cpi->cpi_family == 6)
3647 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3648 		break;
3649 	case X86_VENDOR_Cyrix:
3650 		/*
3651 		 * We rely heavily on the probing in locore
3652 		 * to actually figure out what parts, if any,
3653 		 * of the Cyrix cpuid instruction to believe.
3654 		 */
3655 		switch (x86_type) {
3656 		case X86_TYPE_CYRIX_486:
3657 			mask_edx = 0;
3658 			break;
3659 		case X86_TYPE_CYRIX_6x86:
3660 			mask_edx = 0;
3661 			break;
3662 		case X86_TYPE_CYRIX_6x86L:
3663 			mask_edx =
3664 			    CPUID_INTC_EDX_DE |
3665 			    CPUID_INTC_EDX_CX8;
3666 			break;
3667 		case X86_TYPE_CYRIX_6x86MX:
3668 			mask_edx =
3669 			    CPUID_INTC_EDX_DE |
3670 			    CPUID_INTC_EDX_MSR |
3671 			    CPUID_INTC_EDX_CX8 |
3672 			    CPUID_INTC_EDX_PGE |
3673 			    CPUID_INTC_EDX_CMOV |
3674 			    CPUID_INTC_EDX_MMX;
3675 			break;
3676 		case X86_TYPE_CYRIX_GXm:
3677 			mask_edx =
3678 			    CPUID_INTC_EDX_MSR |
3679 			    CPUID_INTC_EDX_CX8 |
3680 			    CPUID_INTC_EDX_CMOV |
3681 			    CPUID_INTC_EDX_MMX;
3682 			break;
3683 		case X86_TYPE_CYRIX_MediaGX:
3684 			break;
3685 		case X86_TYPE_CYRIX_MII:
3686 		case X86_TYPE_VIA_CYRIX_III:
3687 			mask_edx =
3688 			    CPUID_INTC_EDX_DE |
3689 			    CPUID_INTC_EDX_TSC |
3690 			    CPUID_INTC_EDX_MSR |
3691 			    CPUID_INTC_EDX_CX8 |
3692 			    CPUID_INTC_EDX_PGE |
3693 			    CPUID_INTC_EDX_CMOV |
3694 			    CPUID_INTC_EDX_MMX;
3695 			break;
3696 		default:
3697 			break;
3698 		}
3699 		break;
3700 	}
3701 
3702 #if defined(__xpv)
3703 	/*
3704 	 * Do not support MONITOR/MWAIT under a hypervisor
3705 	 */
3706 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3707 	/*
3708 	 * Do not support XSAVE under a hypervisor for now
3709 	 */
3710 	xsave_force_disable = B_TRUE;
3711 
3712 #endif	/* __xpv */
3713 
3714 	if (xsave_force_disable) {
3715 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3716 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3717 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3718 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3719 	}
3720 
3721 	/*
3722 	 * Now we've figured out the masks that determine
3723 	 * which bits we choose to believe, apply the masks
3724 	 * to the feature words, then map the kernel's view
3725 	 * of these feature words into its feature word.
3726 	 */
3727 	cp->cp_edx &= mask_edx;
3728 	cp->cp_ecx &= mask_ecx;
3729 
3730 	/*
3731 	 * apply any platform restrictions (we don't call this
3732 	 * immediately after __cpuid_insn here, because we need the
3733 	 * workarounds applied above first)
3734 	 */
3735 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3736 
3737 	/*
3738 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3739 	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3740 	 */
3741 	if (cpi->cpi_maxeax >= 7) {
3742 		struct cpuid_regs *ecp;
3743 		ecp = &cpi->cpi_std[7];
3744 		ecp->cp_eax = 7;
3745 		ecp->cp_ecx = 0;
3746 		(void) __cpuid_insn(ecp);
3747 
3748 		/*
3749 		 * If XSAVE has been disabled, just ignore all of the
3750 		 * extended-save-area dependent flags here.
3751 		 */
3752 		if (xsave_force_disable) {
3753 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3754 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3755 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3756 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3757 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3758 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3759 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3760 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3761 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3762 		}
3763 
3764 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3765 			add_x86_feature(featureset, X86FSET_SMEP);
3766 
3767 		/*
3768 		 * We check disable_smap here in addition to in startup_smap()
3769 		 * to ensure CPUs that aren't the boot CPU don't accidentally
3770 		 * include it in the feature set and thus generate a mismatched
3771 		 * x86 feature set across CPUs.
3772 		 */
3773 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3774 		    disable_smap == 0)
3775 			add_x86_feature(featureset, X86FSET_SMAP);
3776 
3777 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3778 			add_x86_feature(featureset, X86FSET_RDSEED);
3779 
3780 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3781 			add_x86_feature(featureset, X86FSET_ADX);
3782 
3783 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3784 			add_x86_feature(featureset, X86FSET_FSGSBASE);
3785 
3786 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3787 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3788 
3789 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3790 			add_x86_feature(featureset, X86FSET_INVPCID);
3791 
3792 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3793 			add_x86_feature(featureset, X86FSET_UMIP);
3794 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3795 			add_x86_feature(featureset, X86FSET_PKU);
3796 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3797 			add_x86_feature(featureset, X86FSET_OSPKE);
3798 
3799 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3800 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3801 				add_x86_feature(featureset, X86FSET_MPX);
3802 
3803 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3804 				add_x86_feature(featureset, X86FSET_CLWB);
3805 		}
3806 	}
3807 
3808 	/*
3809 	 * fold in overrides from the "eeprom" mechanism
3810 	 */
3811 	cp->cp_edx |= cpuid_feature_edx_include;
3812 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3813 
3814 	cp->cp_ecx |= cpuid_feature_ecx_include;
3815 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3816 
3817 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3818 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3819 	}
3820 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3821 		add_x86_feature(featureset, X86FSET_TSC);
3822 	}
3823 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3824 		add_x86_feature(featureset, X86FSET_MSR);
3825 	}
3826 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3827 		add_x86_feature(featureset, X86FSET_MTRR);
3828 	}
3829 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3830 		add_x86_feature(featureset, X86FSET_PGE);
3831 	}
3832 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3833 		add_x86_feature(featureset, X86FSET_CMOV);
3834 	}
3835 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3836 		add_x86_feature(featureset, X86FSET_MMX);
3837 	}
3838 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3839 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3840 		add_x86_feature(featureset, X86FSET_MCA);
3841 	}
3842 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3843 		add_x86_feature(featureset, X86FSET_PAE);
3844 	}
3845 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3846 		add_x86_feature(featureset, X86FSET_CX8);
3847 	}
3848 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3849 		add_x86_feature(featureset, X86FSET_CX16);
3850 	}
3851 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3852 		add_x86_feature(featureset, X86FSET_PAT);
3853 	}
3854 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3855 		add_x86_feature(featureset, X86FSET_SEP);
3856 	}
3857 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3858 		/*
3859 		 * In our implementation, fxsave/fxrstor
3860 		 * are prerequisites before we'll even
3861 		 * try to do SSE things.
3862 		 */
3863 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3864 			add_x86_feature(featureset, X86FSET_SSE);
3865 		}
3866 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3867 			add_x86_feature(featureset, X86FSET_SSE2);
3868 		}
3869 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3870 			add_x86_feature(featureset, X86FSET_SSE3);
3871 		}
3872 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3873 			add_x86_feature(featureset, X86FSET_SSSE3);
3874 		}
3875 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3876 			add_x86_feature(featureset, X86FSET_SSE4_1);
3877 		}
3878 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3879 			add_x86_feature(featureset, X86FSET_SSE4_2);
3880 		}
3881 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3882 			add_x86_feature(featureset, X86FSET_AES);
3883 		}
3884 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3885 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3886 		}
3887 
3888 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3889 			add_x86_feature(featureset, X86FSET_SHA);
3890 
3891 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3892 			add_x86_feature(featureset, X86FSET_XSAVE);
3893 
3894 			/* We only test AVX & AVX512 when there is XSAVE */
3895 
3896 			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3897 				add_x86_feature(featureset,
3898 				    X86FSET_AVX);
3899 
3900 				/*
3901 				 * Intel says we can't check these without also
3902 				 * checking AVX.
3903 				 */
3904 				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3905 					add_x86_feature(featureset,
3906 					    X86FSET_F16C);
3907 
3908 				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3909 					add_x86_feature(featureset,
3910 					    X86FSET_FMA);
3911 
3912 				if (cpi->cpi_std[7].cp_ebx &
3913 				    CPUID_INTC_EBX_7_0_BMI1)
3914 					add_x86_feature(featureset,
3915 					    X86FSET_BMI1);
3916 
3917 				if (cpi->cpi_std[7].cp_ebx &
3918 				    CPUID_INTC_EBX_7_0_BMI2)
3919 					add_x86_feature(featureset,
3920 					    X86FSET_BMI2);
3921 
3922 				if (cpi->cpi_std[7].cp_ebx &
3923 				    CPUID_INTC_EBX_7_0_AVX2)
3924 					add_x86_feature(featureset,
3925 					    X86FSET_AVX2);
3926 
3927 				if (cpi->cpi_std[7].cp_ecx &
3928 				    CPUID_INTC_ECX_7_0_VAES)
3929 					add_x86_feature(featureset,
3930 					    X86FSET_VAES);
3931 
3932 				if (cpi->cpi_std[7].cp_ecx &
3933 				    CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3934 					add_x86_feature(featureset,
3935 					    X86FSET_VPCLMULQDQ);
3936 			}
3937 
3938 			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3939 			    (cpi->cpi_std[7].cp_ebx &
3940 			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3941 				add_x86_feature(featureset, X86FSET_AVX512F);
3942 
3943 				if (cpi->cpi_std[7].cp_ebx &
3944 				    CPUID_INTC_EBX_7_0_AVX512DQ)
3945 					add_x86_feature(featureset,
3946 					    X86FSET_AVX512DQ);
3947 				if (cpi->cpi_std[7].cp_ebx &
3948 				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3949 					add_x86_feature(featureset,
3950 					    X86FSET_AVX512FMA);
3951 				if (cpi->cpi_std[7].cp_ebx &
3952 				    CPUID_INTC_EBX_7_0_AVX512PF)
3953 					add_x86_feature(featureset,
3954 					    X86FSET_AVX512PF);
3955 				if (cpi->cpi_std[7].cp_ebx &
3956 				    CPUID_INTC_EBX_7_0_AVX512ER)
3957 					add_x86_feature(featureset,
3958 					    X86FSET_AVX512ER);
3959 				if (cpi->cpi_std[7].cp_ebx &
3960 				    CPUID_INTC_EBX_7_0_AVX512CD)
3961 					add_x86_feature(featureset,
3962 					    X86FSET_AVX512CD);
3963 				if (cpi->cpi_std[7].cp_ebx &
3964 				    CPUID_INTC_EBX_7_0_AVX512BW)
3965 					add_x86_feature(featureset,
3966 					    X86FSET_AVX512BW);
3967 				if (cpi->cpi_std[7].cp_ebx &
3968 				    CPUID_INTC_EBX_7_0_AVX512VL)
3969 					add_x86_feature(featureset,
3970 					    X86FSET_AVX512VL);
3971 
3972 				if (cpi->cpi_std[7].cp_ecx &
3973 				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3974 					add_x86_feature(featureset,
3975 					    X86FSET_AVX512VBMI);
3976 				if (cpi->cpi_std[7].cp_ecx &
3977 				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3978 					add_x86_feature(featureset,
3979 					    X86FSET_AVX512VNNI);
3980 				if (cpi->cpi_std[7].cp_ecx &
3981 				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3982 					add_x86_feature(featureset,
3983 					    X86FSET_AVX512VPOPCDQ);
3984 
3985 				if (cpi->cpi_std[7].cp_edx &
3986 				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3987 					add_x86_feature(featureset,
3988 					    X86FSET_AVX512NNIW);
3989 				if (cpi->cpi_std[7].cp_edx &
3990 				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3991 					add_x86_feature(featureset,
3992 					    X86FSET_AVX512FMAPS);
3993 			}
3994 		}
3995 	}
3996 
3997 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3998 		add_x86_feature(featureset, X86FSET_PCID);
3999 	}
4000 
4001 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4002 		add_x86_feature(featureset, X86FSET_X2APIC);
4003 	}
4004 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4005 		add_x86_feature(featureset, X86FSET_DE);
4006 	}
4007 #if !defined(__xpv)
4008 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4009 
4010 		/*
4011 		 * We require the CLFLUSH instruction for the erratum workaround
4012 		 * needed to use MONITOR/MWAIT.
4013 		 */
4014 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4015 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4016 			add_x86_feature(featureset, X86FSET_MWAIT);
4017 		} else {
4018 			extern int idle_cpu_assert_cflush_monitor;
4019 
4020 			/*
4021 			 * All processors we are aware of which have
4022 			 * MONITOR/MWAIT also have CLFLUSH.
4023 			 */
4024 			if (idle_cpu_assert_cflush_monitor) {
4025 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4026 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4027 			}
4028 		}
4029 	}
4030 #endif	/* __xpv */
4031 
4032 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4033 		add_x86_feature(featureset, X86FSET_VMX);
4034 	}
4035 
4036 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4037 		add_x86_feature(featureset, X86FSET_RDRAND);
4038 
4039 	/*
4040 	 * We only need this the first time; the rest of the CPUs are expected
4041 	 * to match, so effectively we capture it for the boot CPU.
4042 	 */
4043 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4044 		add_x86_feature(featureset, X86FSET_CLFSH);
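		/*
		 * Leaf 1 %ebx bits 15:8 report the CLFLUSH line size in
		 * 8-byte quantities, hence the multiplication below.
		 */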
4045 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4046 	}
4047 	if (is_x86_feature(featureset, X86FSET_PAE))
4048 		cpi->cpi_pabits = 36;
4049 
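	/*
	 * Sub-leaf 1 of leaf 0xD enumerates the extended forms of XSAVE
	 * (XSAVEOPT, XSAVEC, and XSAVES).
	 */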
4050 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4051 		struct cpuid_regs r, *ecp;
4052 
4053 		ecp = &r;
4054 		ecp->cp_eax = 0xD;
4055 		ecp->cp_ecx = 1;
4056 		ecp->cp_edx = ecp->cp_ebx = 0;
4057 		(void) __cpuid_insn(ecp);
4058 
4059 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4060 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4061 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4062 			add_x86_feature(featureset, X86FSET_XSAVEC);
4063 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4064 			add_x86_feature(featureset, X86FSET_XSAVES);
4065 	}
4066 
4067 	/*
4068 	 * Work on the "extended" feature information, doing
4069 	 * some basic initialization to be used in the extended pass.
4070 	 */
4071 	xcpuid = 0;
4072 	switch (cpi->cpi_vendor) {
4073 	case X86_VENDOR_Intel:
4074 		/*
4075 		 * On KVM we know we will have proper support for extended
4076 		 * cpuid.
4077 		 */
4078 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4079 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4080 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4081 			xcpuid++;
4082 		break;
4083 	case X86_VENDOR_AMD:
4084 		if (cpi->cpi_family > 5 ||
4085 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4086 			xcpuid++;
4087 		break;
4088 	case X86_VENDOR_Cyrix:
4089 		/*
4090 		 * Only these Cyrix CPUs are -known- to support
4091 		 * extended cpuid operations.
4092 		 */
4093 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4094 		    x86_type == X86_TYPE_CYRIX_GXm)
4095 			xcpuid++;
4096 		break;
4097 	case X86_VENDOR_HYGON:
4098 	case X86_VENDOR_Centaur:
4099 	case X86_VENDOR_TM:
4100 	default:
4101 		xcpuid++;
4102 		break;
4103 	}
4104 
4105 	if (xcpuid) {
4106 		cp = &cpi->cpi_extd[0];
4107 		cp->cp_eax = CPUID_LEAF_EXT_0;
4108 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4109 	}
4110 
4111 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4112 
4113 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4114 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4115 
4116 		switch (cpi->cpi_vendor) {
4117 		case X86_VENDOR_Intel:
4118 		case X86_VENDOR_AMD:
4119 		case X86_VENDOR_HYGON:
4120 			if (cpi->cpi_xmaxeax < 0x80000001)
4121 				break;
4122 			cp = &cpi->cpi_extd[1];
4123 			cp->cp_eax = 0x80000001;
4124 			(void) __cpuid_insn(cp);
4125 
4126 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4127 			    cpi->cpi_family == 5 &&
4128 			    cpi->cpi_model == 6 &&
4129 			    cpi->cpi_step == 6) {
4130 				/*
4131 				 * K6 model 6 uses bit 10 to indicate SYSC.
4132 				 * Later models use bit 11. Fix it here.
4133 				 */
4134 				if (cp->cp_edx & 0x400) {
4135 					cp->cp_edx &= ~0x400;
4136 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4137 				}
4138 			}
4139 
4140 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4141 
4142 			/*
4143 			 * Compute the additions to the kernel's feature word.
4144 			 */
4145 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4146 				add_x86_feature(featureset, X86FSET_NX);
4147 			}
4148 
4149 			/*
4150 			 * Regardless of whether or not we boot 64-bit,
4151 			 * we should have a way to identify whether
4152 			 * the CPU is capable of running 64-bit.
4153 			 */
4154 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4155 				add_x86_feature(featureset, X86FSET_64);
4156 			}
4157 
4158 			/* 1 GB large page - enable only for 64 bit kernel */
4159 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4160 				add_x86_feature(featureset, X86FSET_1GPG);
4161 			}
4162 
4163 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4164 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4165 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4166 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4167 				add_x86_feature(featureset, X86FSET_SSE4A);
4168 			}
4169 
4170 			/*
4171 			 * It's really tricky to support syscall/sysret in
4172 			 * the i386 kernel; we rely on sysenter/sysexit
4173 			 * instead.  In the amd64 kernel, things are -way-
4174 			 * better.
4175 			 */
4176 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4177 				add_x86_feature(featureset, X86FSET_ASYSC);
4178 			}
4179 
4180 			/*
4181 			 * While we're thinking about system calls, note
4182 			 * that AMD processors don't support sysenter
4183 			 * in long mode at all, so don't try to program them.
4184 			 */
4185 			if (x86_vendor == X86_VENDOR_AMD ||
4186 			    x86_vendor == X86_VENDOR_HYGON) {
4187 				remove_x86_feature(featureset, X86FSET_SEP);
4188 			}
4189 
4190 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4191 				add_x86_feature(featureset, X86FSET_TSCP);
4192 			}
4193 
4194 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4195 				add_x86_feature(featureset, X86FSET_SVM);
4196 			}
4197 
4198 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4199 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4200 			}
4201 
4202 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4203 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4204 			}
4205 
4206 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4207 				add_x86_feature(featureset, X86FSET_XOP);
4208 			}
4209 
4210 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4211 				add_x86_feature(featureset, X86FSET_FMA4);
4212 			}
4213 
4214 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4215 				add_x86_feature(featureset, X86FSET_TBM);
4216 			}
4217 
4218 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4219 				add_x86_feature(featureset, X86FSET_MONITORX);
4220 			}
4221 			break;
4222 		default:
4223 			break;
4224 		}
4225 
4226 		/*
4227 		 * Get CPUID data about processor cores and hyperthreads.
4228 		 */
4229 		switch (cpi->cpi_vendor) {
4230 		case X86_VENDOR_Intel:
4231 			if (cpi->cpi_maxeax >= 4) {
4232 				cp = &cpi->cpi_std[4];
4233 				cp->cp_eax = 4;
4234 				cp->cp_ecx = 0;
4235 				(void) __cpuid_insn(cp);
4236 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4237 			}
4238 			/*FALLTHROUGH*/
4239 		case X86_VENDOR_AMD:
4240 		case X86_VENDOR_HYGON:
4241 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4242 				break;
4243 			cp = &cpi->cpi_extd[8];
4244 			cp->cp_eax = CPUID_LEAF_EXT_8;
4245 			(void) __cpuid_insn(cp);
4246 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4247 			    cp);
4248 
4249 			/*
4250 			 * AMD uses ebx for some extended functions.
4251 			 */
4252 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4253 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4254 				/*
4255 				 * While we're here, check for the AMD "Error
4256 				 * Pointer Zero/Restore" feature. This can be
4257 				 * used to set up the FP save handlers
4258 				 * appropriately.
4259 				 */
4260 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4261 					cpi->cpi_fp_amd_save = 0;
4262 				} else {
4263 					cpi->cpi_fp_amd_save = 1;
4264 				}
4265 
4266 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4267 					add_x86_feature(featureset,
4268 					    X86FSET_CLZERO);
4269 				}
4270 			}
4271 
4272 			/*
4273 			 * Virtual and physical address limits from
4274 			 * cpuid override previously guessed values.
4275 			 */
4276 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4277 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4278 			break;
4279 		default:
4280 			break;
4281 		}
4282 
4283 		/*
4284 		 * Get CPUID data about TSC Invariance in Deep C-State.
4285 		 */
4286 		switch (cpi->cpi_vendor) {
4287 		case X86_VENDOR_Intel:
4288 		case X86_VENDOR_AMD:
4289 		case X86_VENDOR_HYGON:
4290 			if (cpi->cpi_maxeax >= 7) {
4291 				cp = &cpi->cpi_extd[7];
4292 				cp->cp_eax = 0x80000007;
4293 				cp->cp_ecx = 0;
4294 				(void) __cpuid_insn(cp);
4295 			}
4296 			break;
4297 		default:
4298 			break;
4299 		}
4300 	}
4301 
4302 	/*
4303 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4304 	 * run and thus gathered some of its dependent leaves.
4305 	 */
4306 	cpuid_basic_topology(cpu, featureset);
4307 	cpuid_basic_thermal(cpu, featureset);
4308 #if !defined(__xpv)
4309 	cpuid_basic_ppin(cpu, featureset);
4310 #endif
4311 
4312 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4313 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4314 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4315 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4316 			/* Special handling for AMD FP not necessary. */
4317 			cpi->cpi_fp_amd_save = 0;
4318 		} else {
4319 			cpi->cpi_fp_amd_save = 1;
4320 		}
4321 	}
4322 
4323 	/*
4324 	 * Check (and potentially set) whether lfence is serializing.
4325 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4326 	 */
4327 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4328 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4329 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4330 		/*
4331 		 * The AMD white paper Software Techniques For Managing
4332 		 * Speculation on AMD Processors details the circumstances under
4333 		 * which lfence instructions are serializing.
4334 		 *
4335 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4336 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4337 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4338 		 * committed to supporting that MSR on all later CPUs.
4339 		 */
4340 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4341 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4342 		} else if (cpi->cpi_family >= 0x10) {
4343 #if !defined(__xpv)
4344 			uint64_t val;
4345 
4346 			/*
4347 			 * Be careful when attempting to enable the bit, and
4348 			 * verify that it was actually set in case we are
4349 			 * running in a hypervisor which is less than faithful
4350 			 * about its emulation of this feature.
4351 			 */
4352 			on_trap_data_t otd;
4353 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4354 				val = rdmsr(MSR_AMD_DE_CFG);
4355 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4356 				wrmsr(MSR_AMD_DE_CFG, val);
4357 				val = rdmsr(MSR_AMD_DE_CFG);
4358 			} else {
4359 				val = 0;
4360 			}
4361 			no_trap();
4362 
4363 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4364 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4365 			}
4366 #endif
4367 		}
4368 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4369 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4370 		/*
4371 		 * Documentation and other OSes indicate that lfence is always
4372 		 * serializing on Intel CPUs.
4373 		 */
4374 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4375 	}
4376 
4377 
4378 	/*
4379 	 * Check the processor leaves that are used for security features.
4380 	 */
4381 	cpuid_scan_security(cpu, featureset);
4382 }
4383 
4384 /*
4385  * Make copies of the cpuid table entries we depend on, in
4386  * part for ease of parsing now, in part so that we have only
4387  * one place to correct any of it, in part for ease of
4388  * later export to userland, and in part so we can look at
4389  * this stuff in a crash dump.
4390  */
4391 
4392 static void
4393 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4394 {
4395 	uint_t n, nmax;
4396 	int i;
4397 	struct cpuid_regs *cp;
4398 	uint8_t *dp;
4399 	uint32_t *iptr;
4400 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4401 
4402 	if (cpi->cpi_maxeax < 1)
4403 		return;
4404 
4405 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4406 		nmax = NMAX_CPI_STD;
4407 	/*
4408 	 * (We already handled n == 0 and n == 1 in the basic pass)
4409 	 */
4410 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4411 		/*
4412 		 * leaves 6 and 7 were handled in the basic pass
4413 		 */
4414 		if (n == 6 || n == 7)
4415 			continue;
4416 
4417 		cp->cp_eax = n;
4418 
4419 		/*
4420 		 * CPUID function 4 expects %ecx to be initialized
4421 		 * with an index which indicates which cache to return
4422 		 * information about. The OS is expected to call function 4
4423 		 * with %ecx set to 0, 1, 2, ... until it returns with
4424 		 * EAX[4:0] set to 0, which indicates there are no more
4425 		 * caches.
4426 		 *
4427 		 * Here, populate cpi_std[4] with the information returned by
4428 		 * function 4 when %ecx == 0, and do the rest in a later pass
4429 		 * when dynamic memory allocation becomes available.
4430 		 *
4431 		 * Note: we need to explicitly initialize %ecx here, since
4432 		 * function 4 may have been previously invoked.
4433 		 */
4434 		if (n == 4)
4435 			cp->cp_ecx = 0;
4436 
4437 		(void) __cpuid_insn(cp);
4438 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4439 		switch (n) {
4440 		case 2:
4441 			/*
4442 			 * "the lower 8 bits of the %eax register
4443 			 * contain a value that identifies the number
4444 			 * of times the cpuid [instruction] has to be
4445 			 * executed to obtain a complete image of the
4446 			 * processor's caching systems."
4447 			 *
4448 			 * How *do* they make this stuff up?
4449 			 */
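			/*
			 * sizeof (*cp) is 16 (the four 32-bit registers), so
			 * this is the number of descriptor bytes implied by
			 * the iteration count in %al; the count byte itself
			 * is skipped just below.
			 */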
4450 			cpi->cpi_ncache = sizeof (*cp) *
4451 			    BITX(cp->cp_eax, 7, 0);
4452 			if (cpi->cpi_ncache == 0)
4453 				break;
4454 			cpi->cpi_ncache--;	/* skip count byte */
4455 
4456 			/*
4457 			 * Well, for now, rather than attempt to implement
4458 			 * this slightly dubious algorithm, we just look
4459 			 * at the first 15 ..
4460 			 */
4461 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4462 				cpi->cpi_ncache = sizeof (*cp) - 1;
4463 
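			/*
			 * For each register, a clear bit 31 indicates that it
			 * holds valid one-byte cache descriptors; harvest the
			 * non-zero descriptor bytes.
			 */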
4464 			dp = cpi->cpi_cacheinfo;
4465 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4466 				uint8_t *p = (void *)&cp->cp_eax;
4467 				for (i = 1; i < 4; i++)
4468 					if (p[i] != 0)
4469 						*dp++ = p[i];
4470 			}
4471 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4472 				uint8_t *p = (void *)&cp->cp_ebx;
4473 				for (i = 0; i < 4; i++)
4474 					if (p[i] != 0)
4475 						*dp++ = p[i];
4476 			}
4477 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4478 				uint8_t *p = (void *)&cp->cp_ecx;
4479 				for (i = 0; i < 4; i++)
4480 					if (p[i] != 0)
4481 						*dp++ = p[i];
4482 			}
4483 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4484 				uint8_t *p = (void *)&cp->cp_edx;
4485 				for (i = 0; i < 4; i++)
4486 					if (p[i] != 0)
4487 						*dp++ = p[i];
4488 			}
4489 			break;
4490 
4491 		case 3:	/* Processor serial number, if PSN supported */
4492 			break;
4493 
4494 		case 4:	/* Deterministic cache parameters */
4495 			break;
4496 
4497 		case 5:	/* Monitor/Mwait parameters */
4498 		{
4499 			size_t mwait_size;
4500 
4501 			/*
4502 			 * Check cpi_mwait.support, which was set in
4503 			 * cpuid_pass_basic()
4504 			 */
4505 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4506 				break;
4507 
4508 			/*
4509 			 * Protect ourselves from an insane mwait line size.
4510 			 * Workaround for incomplete hardware emulator(s).
4511 			 */
4512 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4513 			if (mwait_size < sizeof (uint32_t) ||
4514 			    !ISP2(mwait_size)) {
4515 #if DEBUG
4516 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4517 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4518 #endif
4519 				break;
4520 			}
4521 
4522 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4523 			cpi->cpi_mwait.mon_max = mwait_size;
4524 			if (MWAIT_EXTENSION(cpi)) {
4525 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4526 				if (MWAIT_INT_ENABLE(cpi))
4527 					cpi->cpi_mwait.support |=
4528 					    MWAIT_ECX_INT_ENABLE;
4529 			}
4530 			break;
4531 		}
4532 		default:
4533 			break;
4534 		}
4535 	}
4536 
4537 	/*
4538 	 * XSAVE enumeration
4539 	 */
4540 	if (cpi->cpi_maxeax >= 0xD) {
4541 		struct cpuid_regs regs;
4542 		boolean_t cpuid_d_valid = B_TRUE;
4543 
4544 		cp = &regs;
4545 		cp->cp_eax = 0xD;
4546 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4547 
4548 		(void) __cpuid_insn(cp);
4549 
4550 		/*
4551 		 * Sanity checks for debug
4552 		 */
4553 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4554 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4555 			cpuid_d_valid = B_FALSE;
4556 		}
4557 
4558 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4559 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4560 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4561 
4562 		/*
4563 		 * If the hw supports AVX, get the size and offset in the save
4564 		 * area for the ymm state.
4565 		 */
4566 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4567 			cp->cp_eax = 0xD;
4568 			cp->cp_ecx = 2;
4569 			cp->cp_edx = cp->cp_ebx = 0;
4570 
4571 			(void) __cpuid_insn(cp);
4572 
4573 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4574 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4575 				cpuid_d_valid = B_FALSE;
4576 			}
4577 
4578 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4579 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4580 		}
4581 
4582 		/*
4583 		 * If the hw supports MPX, get the size and offset in the
4584 		 * save area for BNDREGS and BNDCSR.
4585 		 */
4586 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4587 			cp->cp_eax = 0xD;
4588 			cp->cp_ecx = 3;
4589 			cp->cp_edx = cp->cp_ebx = 0;
4590 
4591 			(void) __cpuid_insn(cp);
4592 
4593 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4594 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4595 
4596 			cp->cp_eax = 0xD;
4597 			cp->cp_ecx = 4;
4598 			cp->cp_edx = cp->cp_ebx = 0;
4599 
4600 			(void) __cpuid_insn(cp);
4601 
4602 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4603 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4604 		}
4605 
4606 		/*
4607 		 * If the hw supports AVX512, get the size and offset in the
4608 		 * save area for the opmask registers and zmm state.
4609 		 */
4610 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4611 			cp->cp_eax = 0xD;
4612 			cp->cp_ecx = 5;
4613 			cp->cp_edx = cp->cp_ebx = 0;
4614 
4615 			(void) __cpuid_insn(cp);
4616 
4617 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4618 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4619 
4620 			cp->cp_eax = 0xD;
4621 			cp->cp_ecx = 6;
4622 			cp->cp_edx = cp->cp_ebx = 0;
4623 
4624 			(void) __cpuid_insn(cp);
4625 
4626 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4627 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4628 
4629 			cp->cp_eax = 0xD;
4630 			cp->cp_ecx = 7;
4631 			cp->cp_edx = cp->cp_ebx = 0;
4632 
4633 			(void) __cpuid_insn(cp);
4634 
4635 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4636 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4637 		}
4638 
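		/*
		 * Settle the XSAVE area size: zero when the kernel is not
		 * using XSAVE, the CPUID-reported maximum when leaf 0xD looks
		 * sane, and otherwise fall into the broken-CPUID handling
		 * below.
		 */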
4639 		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4640 			xsave_state_size = 0;
4641 		} else if (cpuid_d_valid) {
4642 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4643 		} else {
4644 			/* Broken CPUID 0xD, probably in HVM */
4645 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4646 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4647 			    ", ymm_size = %d, ymm_offset = %d\n",
4648 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4649 			    cpi->cpi_xsave.xsav_hw_features_high,
4650 			    (int)cpi->cpi_xsave.xsav_max_size,
4651 			    (int)cpi->cpi_xsave.ymm_size,
4652 			    (int)cpi->cpi_xsave.ymm_offset);
4653 
4654 			if (xsave_state_size != 0) {
4655 				/*
4656 				 * This must be a non-boot CPU. We cannot
4657 				 * continue, because boot cpu has already
4658 				 * enabled XSAVE.
4659 				 */
4660 				ASSERT(cpu->cpu_id != 0);
4661 				cmn_err(CE_PANIC, "cpu%d: we have already "
4662 				    "enabled XSAVE on boot cpu, cannot "
4663 				    "continue.", cpu->cpu_id);
4664 			} else {
4665 				/*
4666 				 * If we reached here on the boot CPU, it's also
4667 				 * almost certain that we'll reach here on the
4668 				 * non-boot CPUs. When we're here on a boot CPU
4669 				 * we should disable the feature, on a non-boot
4670 				 * CPU we need to confirm that we have.
4671 				 */
4672 				if (cpu->cpu_id == 0) {
4673 					remove_x86_feature(x86_featureset,
4674 					    X86FSET_XSAVE);
4675 					remove_x86_feature(x86_featureset,
4676 					    X86FSET_AVX);
4677 					remove_x86_feature(x86_featureset,
4678 					    X86FSET_F16C);
4679 					remove_x86_feature(x86_featureset,
4680 					    X86FSET_BMI1);
4681 					remove_x86_feature(x86_featureset,
4682 					    X86FSET_BMI2);
4683 					remove_x86_feature(x86_featureset,
4684 					    X86FSET_FMA);
4685 					remove_x86_feature(x86_featureset,
4686 					    X86FSET_AVX2);
4687 					remove_x86_feature(x86_featureset,
4688 					    X86FSET_MPX);
4689 					remove_x86_feature(x86_featureset,
4690 					    X86FSET_AVX512F);
4691 					remove_x86_feature(x86_featureset,
4692 					    X86FSET_AVX512DQ);
4693 					remove_x86_feature(x86_featureset,
4694 					    X86FSET_AVX512PF);
4695 					remove_x86_feature(x86_featureset,
4696 					    X86FSET_AVX512ER);
4697 					remove_x86_feature(x86_featureset,
4698 					    X86FSET_AVX512CD);
4699 					remove_x86_feature(x86_featureset,
4700 					    X86FSET_AVX512BW);
4701 					remove_x86_feature(x86_featureset,
4702 					    X86FSET_AVX512VL);
4703 					remove_x86_feature(x86_featureset,
4704 					    X86FSET_AVX512FMA);
4705 					remove_x86_feature(x86_featureset,
4706 					    X86FSET_AVX512VBMI);
4707 					remove_x86_feature(x86_featureset,
4708 					    X86FSET_AVX512VNNI);
4709 					remove_x86_feature(x86_featureset,
4710 					    X86FSET_AVX512VPOPCDQ);
4711 					remove_x86_feature(x86_featureset,
4712 					    X86FSET_AVX512NNIW);
4713 					remove_x86_feature(x86_featureset,
4714 					    X86FSET_AVX512FMAPS);
4715 					remove_x86_feature(x86_featureset,
4716 					    X86FSET_VAES);
4717 					remove_x86_feature(x86_featureset,
4718 					    X86FSET_VPCLMULQDQ);
4719 
4720 					CPI_FEATURES_ECX(cpi) &=
4721 					    ~CPUID_INTC_ECX_XSAVE;
4722 					CPI_FEATURES_ECX(cpi) &=
4723 					    ~CPUID_INTC_ECX_AVX;
4724 					CPI_FEATURES_ECX(cpi) &=
4725 					    ~CPUID_INTC_ECX_F16C;
4726 					CPI_FEATURES_ECX(cpi) &=
4727 					    ~CPUID_INTC_ECX_FMA;
4728 					CPI_FEATURES_7_0_EBX(cpi) &=
4729 					    ~CPUID_INTC_EBX_7_0_BMI1;
4730 					CPI_FEATURES_7_0_EBX(cpi) &=
4731 					    ~CPUID_INTC_EBX_7_0_BMI2;
4732 					CPI_FEATURES_7_0_EBX(cpi) &=
4733 					    ~CPUID_INTC_EBX_7_0_AVX2;
4734 					CPI_FEATURES_7_0_EBX(cpi) &=
4735 					    ~CPUID_INTC_EBX_7_0_MPX;
4736 					CPI_FEATURES_7_0_EBX(cpi) &=
4737 					    ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4738 
4739 					CPI_FEATURES_7_0_ECX(cpi) &=
4740 					    ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4741 
4742 					CPI_FEATURES_7_0_ECX(cpi) &=
4743 					    ~CPUID_INTC_ECX_7_0_VAES;
4744 					CPI_FEATURES_7_0_ECX(cpi) &=
4745 					    ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4746 
4747 					CPI_FEATURES_7_0_EDX(cpi) &=
4748 					    ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4749 
4750 					xsave_force_disable = B_TRUE;
4751 				} else {
4752 					VERIFY(is_x86_feature(x86_featureset,
4753 					    X86FSET_XSAVE) == B_FALSE);
4754 				}
4755 			}
4756 		}
4757 	}
4758 
4759 
4760 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4761 		return;
4762 
4763 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4764 		nmax = NMAX_CPI_EXTD;
4765 	/*
4766 	 * Copy the extended properties, fixing them as we go.
4767 	 * (We already handled n == 0 and n == 1 in the basic pass)
4768 	 */
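	/*
	 * Leaves 0x80000002 through 0x80000004 each return 16 bytes of the
	 * brand string in %eax, %ebx, %ecx and %edx; iptr walks cpi_brandstr
	 * as those leaves are copied below.
	 */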
4769 	iptr = (void *)cpi->cpi_brandstr;
4770 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4771 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4772 		(void) __cpuid_insn(cp);
4773 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4774 		    cp);
4775 		switch (n) {
4776 		case 2:
4777 		case 3:
4778 		case 4:
4779 			/*
4780 			 * Extract the brand string
4781 			 */
4782 			*iptr++ = cp->cp_eax;
4783 			*iptr++ = cp->cp_ebx;
4784 			*iptr++ = cp->cp_ecx;
4785 			*iptr++ = cp->cp_edx;
4786 			break;
4787 		case 5:
4788 			switch (cpi->cpi_vendor) {
4789 			case X86_VENDOR_AMD:
4790 				/*
4791 				 * The Athlon and Duron were the first
4792 				 * parts to report the sizes of the
4793 				 * TLB for large pages. Before then,
4794 				 * we don't trust the data.
4795 				 */
4796 				if (cpi->cpi_family < 6 ||
4797 				    (cpi->cpi_family == 6 &&
4798 				    cpi->cpi_model < 1))
4799 					cp->cp_eax = 0;
4800 				break;
4801 			default:
4802 				break;
4803 			}
4804 			break;
4805 		case 6:
4806 			switch (cpi->cpi_vendor) {
4807 			case X86_VENDOR_AMD:
4808 				/*
4809 				 * The Athlon and Duron were the first
4810 				 * AMD parts with L2 TLB's.
4811 				 * Before then, don't trust the data.
4812 				 */
4813 				if (cpi->cpi_family < 6 ||
4814 				    (cpi->cpi_family == 6 &&
4815 				    cpi->cpi_model < 1))
4816 					cp->cp_eax = cp->cp_ebx = 0;
4817 				/*
4818 				 * AMD Duron rev A0 reports L2
4819 				 * cache size incorrectly as 1K
4820 				 * when it is really 64K
4821 				 */
4822 				if (cpi->cpi_family == 6 &&
4823 				    cpi->cpi_model == 3 &&
4824 				    cpi->cpi_step == 0) {
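					/*
					 * Force the L2 size field in
					 * %ecx[31:16] to 64 (KB).
					 */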
4825 					cp->cp_ecx &= 0xffff;
4826 					cp->cp_ecx |= 0x400000;
4827 				}
4828 				break;
4829 			case X86_VENDOR_Cyrix:	/* VIA C3 */
4830 				/*
4831 				 * VIA C3 processors are a bit messed
4832 				 * up w.r.t. encoding cache sizes in %ecx
4833 				 */
4834 				if (cpi->cpi_family != 6)
4835 					break;
4836 				/*
4837 				 * model 7 and 8 were incorrectly encoded
4838 				 *
4839 				 * xxx is model 8 really broken?
4840 				 */
4841 				if (cpi->cpi_model == 7 ||
4842 				    cpi->cpi_model == 8)
4843 					cp->cp_ecx =
4844 					    BITX(cp->cp_ecx, 31, 24) << 16 |
4845 					    BITX(cp->cp_ecx, 23, 16) << 12 |
4846 					    BITX(cp->cp_ecx, 15, 8) << 8 |
4847 					    BITX(cp->cp_ecx, 7, 0);
4848 				/*
4849 				 * model 9 stepping 1 has wrong associativity
4850 				 */
4851 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4852 					cp->cp_ecx |= 8 << 12;
4853 				break;
4854 			case X86_VENDOR_Intel:
4855 				/*
4856 				 * Extended L2 Cache features function.
4857 				 * First appeared on Prescott.
4858 				 */
4859 			default:
4860 				break;
4861 			}
4862 			break;
4863 		default:
4864 			break;
4865 		}
4866 	}
4867 }
4868 
4869 static const char *
4870 intel_cpubrand(const struct cpuid_info *cpi)
4871 {
4872 	int i;
4873 
4874 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
4875 
4876 	switch (cpi->cpi_family) {
4877 	case 5:
4878 		return ("Intel Pentium(r)");
4879 	case 6:
4880 		switch (cpi->cpi_model) {
4881 			uint_t celeron, xeon;
4882 			const struct cpuid_regs *cp;
4883 		case 0:
4884 		case 1:
4885 		case 2:
4886 			return ("Intel Pentium(r) Pro");
4887 		case 3:
4888 		case 4:
4889 			return ("Intel Pentium(r) II");
4890 		case 6:
4891 			return ("Intel Celeron(r)");
4892 		case 5:
4893 		case 7:
4894 			celeron = xeon = 0;
4895 			cp = &cpi->cpi_std[2];	/* cache info */
4896 
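			/*
			 * Scan the leaf 2 cache descriptors: 0x40 (no L2
			 * cache) suggests a Celeron, while 0x44/0x45 (1MB or
			 * 2MB L2) suggest a Xeon-class part.
			 */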
4897 			for (i = 1; i < 4; i++) {
4898 				uint_t tmp;
4899 
4900 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4901 				if (tmp == 0x40)
4902 					celeron++;
4903 				if (tmp >= 0x44 && tmp <= 0x45)
4904 					xeon++;
4905 			}
4906 
4907 			for (i = 0; i < 2; i++) {
4908 				uint_t tmp;
4909 
4910 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4911 				if (tmp == 0x40)
4912 					celeron++;
4913 				else if (tmp >= 0x44 && tmp <= 0x45)
4914 					xeon++;
4915 			}
4916 
4917 			for (i = 0; i < 4; i++) {
4918 				uint_t tmp;
4919 
4920 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4921 				if (tmp == 0x40)
4922 					celeron++;
4923 				else if (tmp >= 0x44 && tmp <= 0x45)
4924 					xeon++;
4925 			}
4926 
4927 			for (i = 0; i < 4; i++) {
4928 				uint_t tmp;
4929 
4930 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4931 				if (tmp == 0x40)
4932 					celeron++;
4933 				else if (tmp >= 0x44 && tmp <= 0x45)
4934 					xeon++;
4935 			}
4936 
4937 			if (celeron)
4938 				return ("Intel Celeron(r)");
4939 			if (xeon)
4940 				return (cpi->cpi_model == 5 ?
4941 				    "Intel Pentium(r) II Xeon(tm)" :
4942 				    "Intel Pentium(r) III Xeon(tm)");
4943 			return (cpi->cpi_model == 5 ?
4944 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4945 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4946 		default:
4947 			break;
4948 		}
4949 	default:
4950 		break;
4951 	}
4952 
4953 	/* BrandID is present if the field is nonzero */
4954 	if (cpi->cpi_brandid != 0) {
4955 		static const struct {
4956 			uint_t bt_bid;
4957 			const char *bt_str;
4958 		} brand_tbl[] = {
4959 			{ 0x1,	"Intel(r) Celeron(r)" },
4960 			{ 0x2,	"Intel(r) Pentium(r) III" },
4961 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
4962 			{ 0x4,	"Intel(r) Pentium(r) III" },
4963 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
4964 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
4965 			{ 0x8,	"Intel(r) Pentium(r) 4" },
4966 			{ 0x9,	"Intel(r) Pentium(r) 4" },
4967 			{ 0xa,	"Intel(r) Celeron(r)" },
4968 			{ 0xb,	"Intel(r) Xeon(tm)" },
4969 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
4970 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
4971 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
4972 			{ 0x11, "Mobile Genuine Intel(r)" },
4973 			{ 0x12, "Intel(r) Celeron(r) M" },
4974 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
4975 			{ 0x14, "Intel(r) Celeron(r)" },
4976 			{ 0x15, "Mobile Genuine Intel(r)" },
4977 			{ 0x16,	"Intel(r) Pentium(r) M" },
4978 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
4979 		};
4980 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4981 		uint_t sgn;
4982 
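		/*
		 * Pack family/model/stepping into a single signature so the
		 * special cases below can be keyed off known steppings.
		 */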
4983 		sgn = (cpi->cpi_family << 8) |
4984 		    (cpi->cpi_model << 4) | cpi->cpi_step;
4985 
4986 		for (i = 0; i < btblmax; i++)
4987 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4988 				break;
4989 		if (i < btblmax) {
4990 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4991 				return ("Intel(r) Celeron(r)");
4992 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4993 				return ("Intel(r) Xeon(tm) MP");
4994 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4995 				return ("Intel(r) Xeon(tm)");
4996 			return (brand_tbl[i].bt_str);
4997 		}
4998 	}
4999 
5000 	return (NULL);
5001 }
5002 
5003 static const char *
5004 amd_cpubrand(const struct cpuid_info *cpi)
5005 {
5006 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5007 
5008 	switch (cpi->cpi_family) {
5009 	case 5:
5010 		switch (cpi->cpi_model) {
5011 		case 0:
5012 		case 1:
5013 		case 2:
5014 		case 3:
5015 		case 4:
5016 		case 5:
5017 			return ("AMD-K5(r)");
5018 		case 6:
5019 		case 7:
5020 			return ("AMD-K6(r)");
5021 		case 8:
5022 			return ("AMD-K6(r)-2");
5023 		case 9:
5024 			return ("AMD-K6(r)-III");
5025 		default:
5026 			return ("AMD (family 5)");
5027 		}
5028 	case 6:
5029 		switch (cpi->cpi_model) {
5030 		case 1:
5031 			return ("AMD-K7(tm)");
5032 		case 0:
5033 		case 2:
5034 		case 4:
5035 			return ("AMD Athlon(tm)");
5036 		case 3:
5037 		case 7:
5038 			return ("AMD Duron(tm)");
5039 		case 6:
5040 		case 8:
5041 		case 10:
5042 			/*
5043 			 * Use the L2 cache size to distinguish
5044 			 */
5045 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5046 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5047 		default:
5048 			return ("AMD (family 6)");
5049 		}
5050 	default:
5051 		break;
5052 	}
5053 
5054 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5055 	    cpi->cpi_brandid != 0) {
5056 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5057 		case 3:
5058 			return ("AMD Opteron(tm) UP 1xx");
5059 		case 4:
5060 			return ("AMD Opteron(tm) DP 2xx");
5061 		case 5:
5062 			return ("AMD Opteron(tm) MP 8xx");
5063 		default:
5064 			return ("AMD Opteron(tm)");
5065 		}
5066 	}
5067 
5068 	return (NULL);
5069 }
5070 
5071 static const char *
5072 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5073 {
5074 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5075 
5076 	switch (type) {
5077 	case X86_TYPE_CYRIX_6x86:
5078 		return ("Cyrix 6x86");
5079 	case X86_TYPE_CYRIX_6x86L:
5080 		return ("Cyrix 6x86L");
5081 	case X86_TYPE_CYRIX_6x86MX:
5082 		return ("Cyrix 6x86MX");
5083 	case X86_TYPE_CYRIX_GXm:
5084 		return ("Cyrix GXm");
5085 	case X86_TYPE_CYRIX_MediaGX:
5086 		return ("Cyrix MediaGX");
5087 	case X86_TYPE_CYRIX_MII:
5088 		return ("Cyrix M2");
5089 	case X86_TYPE_VIA_CYRIX_III:
5090 		return ("VIA Cyrix M3");
5091 	default:
5092 		/*
5093 		 * Have another wild guess ..
5094 		 */
5095 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5096 			return ("Cyrix 5x86");
5097 		else if (cpi->cpi_family == 5) {
5098 			switch (cpi->cpi_model) {
5099 			case 2:
5100 				return ("Cyrix 6x86");	/* Cyrix M1 */
5101 			case 4:
5102 				return ("Cyrix MediaGX");
5103 			default:
5104 				break;
5105 			}
5106 		} else if (cpi->cpi_family == 6) {
5107 			switch (cpi->cpi_model) {
5108 			case 0:
5109 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5110 			case 5:
5111 			case 6:
5112 			case 7:
5113 			case 8:
5114 			case 9:
5115 				return ("VIA C3");
5116 			default:
5117 				break;
5118 			}
5119 		}
5120 		break;
5121 	}
5122 	return (NULL);
5123 }
5124 
5125 /*
5126  * This only gets called in the case that the CPU extended
5127  * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
5128  * aren't available, or contain null bytes for some reason.
5129  */
5130 static void
5131 fabricate_brandstr(struct cpuid_info *cpi)
5132 {
5133 	const char *brand = NULL;
5134 
5135 	switch (cpi->cpi_vendor) {
5136 	case X86_VENDOR_Intel:
5137 		brand = intel_cpubrand(cpi);
5138 		break;
5139 	case X86_VENDOR_AMD:
5140 		brand = amd_cpubrand(cpi);
5141 		break;
5142 	case X86_VENDOR_Cyrix:
5143 		brand = cyrix_cpubrand(cpi, x86_type);
5144 		break;
5145 	case X86_VENDOR_NexGen:
5146 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5147 			brand = "NexGen Nx586";
5148 		break;
5149 	case X86_VENDOR_Centaur:
5150 		if (cpi->cpi_family == 5)
5151 			switch (cpi->cpi_model) {
5152 			case 4:
5153 				brand = "Centaur C6";
5154 				break;
5155 			case 8:
5156 				brand = "Centaur C2";
5157 				break;
5158 			case 9:
5159 				brand = "Centaur C3";
5160 				break;
5161 			default:
5162 				break;
5163 			}
5164 		break;
5165 	case X86_VENDOR_Rise:
5166 		if (cpi->cpi_family == 5 &&
5167 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5168 			brand = "Rise mP6";
5169 		break;
5170 	case X86_VENDOR_SiS:
5171 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5172 			brand = "SiS 55x";
5173 		break;
5174 	case X86_VENDOR_TM:
5175 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5176 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5177 		break;
5178 	case X86_VENDOR_NSC:
5179 	case X86_VENDOR_UMC:
5180 	default:
5181 		break;
5182 	}
5183 	if (brand) {
5184 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5185 		return;
5186 	}
5187 
5188 	/*
5189 	 * If all else fails ...
5190 	 */
5191 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5192 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5193 	    cpi->cpi_model, cpi->cpi_step);
5194 }
5195 
5196 /*
5197  * This routine is called just after kernel memory allocation
5198  * becomes available on cpu0, and as part of mp_startup() on
5199  * the other cpus.
5200  *
5201  * Fixup the brand string, and collect any information from cpuid
5202  * that requires dynamically allocated storage to represent.
5203  */
5204 
5205 static void
5206 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5207 {
5208 	int	i, max, shft, level, size;
5209 	struct cpuid_regs regs;
5210 	struct cpuid_regs *cp;
5211 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5212 
5213 	/*
5214 	 * Deterministic cache parameters
5215 	 *
5216 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5217 	 * values that are present are currently defined to be the same. This
5218 	 * means we can use the same logic to parse it as long as we use the
5219 	 * appropriate leaf to get the data. If you're updating this, make sure
5220 	 * you're careful about which vendor supports which aspect.
5221 	 *
5222 	 * Take this opportunity to detect the number of threads sharing the
5223 	 * last level cache, and construct a corresponding cache id. The
5224 	 * respective cpuid_info members are initialized to the default case of
5225 	 * "no last level cache sharing".
5226 	 */
5227 	cpi->cpi_ncpu_shr_last_cache = 1;
5228 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5229 
5230 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5231 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5232 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5233 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5234 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5235 		uint32_t leaf;
5236 
5237 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5238 			leaf = 4;
5239 		} else {
5240 			leaf = CPUID_LEAF_EXT_1d;
5241 		}
5242 
5243 		/*
5244 		 * Find the # of elements (size) returned by the leaf and along
5245 		 * the way detect last level cache sharing details.
5246 		 */
5247 		bzero(&regs, sizeof (regs));
5248 		cp = &regs;
5249 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5250 			cp->cp_eax = leaf;
5251 			cp->cp_ecx = i;
5252 
5253 			(void) __cpuid_insn(cp);
5254 
5255 			if (CPI_CACHE_TYPE(cp) == 0)
5256 				break;
5257 			level = CPI_CACHE_LVL(cp);
5258 			if (level > max) {
5259 				max = level;
5260 				cpi->cpi_ncpu_shr_last_cache =
5261 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5262 			}
5263 		}
5264 		cpi->cpi_cache_leaf_size = size = i;
5265 
5266 		/*
5267 		 * Allocate the cpi_cache_leaves array. The first element
5268 		 * references the regs for the corresponding leaf with %ecx set
5269 		 * to 0. This was gathered in cpuid_pass_extended().
5270 		 */
5271 		if (size > 0) {
5272 			cpi->cpi_cache_leaves =
5273 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5274 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5275 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5276 			} else {
5277 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5278 			}
5279 
5280 			/*
5281 			 * Allocate storage to hold the additional regs
5282 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5283 			 *
5284 			 * The regs for the leaf with %ecx == 0 have already
5285 			 * been allocated, as indicated above.
5286 			 */
5287 			for (i = 1; i < size; i++) {
5288 				cp = cpi->cpi_cache_leaves[i] =
5289 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5290 				cp->cp_eax = leaf;
5291 				cp->cp_ecx = i;
5292 
5293 				(void) __cpuid_insn(cp);
5294 			}
5295 		}
5296 		/*
5297 		 * Determine the number of bits needed to represent
5298 		 * the number of CPUs sharing the last level cache.
5299 		 *
5300 		 * Shift off that number of bits from the APIC id to
5301 		 * derive the cache id.
5302 		 */
5303 		shft = 0;
5304 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5305 			shft++;
5306 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5307 	}
5308 
5309 	/*
5310 	 * Now fixup the brand string
5311 	 */
5312 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5313 		fabricate_brandstr(cpi);
5314 	} else {
5315 
5316 		/*
5317 		 * If we successfully extracted a brand string from the cpuid
5318 		 * instruction, clean it up by removing leading spaces and
5319 		 * similar junk.
5320 		 */
5321 		if (cpi->cpi_brandstr[0]) {
5322 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5323 			char *src, *dst;
5324 
5325 			dst = src = (char *)cpi->cpi_brandstr;
5326 			src[maxlen - 1] = '\0';
5327 			/*
5328 			 * strip leading spaces
5329 			 */
5330 			while (*src == ' ')
5331 				src++;
5332 			/*
5333 			 * Remove any "Genuine" or "Authentic" prefixes
5334 			 */
5335 			if (strncmp(src, "Genuine ", 8) == 0)
5336 				src += 8;
5337 			if (strncmp(src, "Authentic ", 10) == 0)
5338 				src += 10;
5339 
5340 			/*
5341 			 * Now do an in-place copy.
5342 			 * Map (R) to (r) and (TM) to (tm).
5343 			 * The era of teletypes is long gone, and there's
5344 			 * -really- no need to shout.
5345 			 */
5346 			while (*src != '\0') {
5347 				if (src[0] == '(') {
5348 					if (strncmp(src + 1, "R)", 2) == 0) {
5349 						(void) strncpy(dst, "(r)", 3);
5350 						src += 3;
5351 						dst += 3;
5352 						continue;
5353 					}
5354 					if (strncmp(src + 1, "TM)", 3) == 0) {
5355 						(void) strncpy(dst, "(tm)", 4);
5356 						src += 4;
5357 						dst += 4;
5358 						continue;
5359 					}
5360 				}
5361 				*dst++ = *src++;
5362 			}
5363 			*dst = '\0';
5364 
5365 			/*
5366 			 * Finally, remove any trailing spaces
5367 			 */
5368 			while (--dst > cpi->cpi_brandstr)
5369 				if (*dst == ' ')
5370 					*dst = '\0';
5371 				else
5372 					break;
5373 		} else
5374 			fabricate_brandstr(cpi);
5375 	}
5376 }
5377 
5378 /*
5379  * This routine is called out of bind_hwcap() much later in the life
5380  * of the kernel (post_startup()).  The job of this routine is to resolve
5381  * the hardware feature support and kernel support for those features into
5382  * what we're actually going to tell applications via the aux vector.
5383  */
5384 
5385 static void
5386 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5387 {
5388 	uint_t *hwcap_out = (uint_t *)arg;
5389 	struct cpuid_info *cpi;
5390 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5391 
5392 	cpi = cpu->cpu_m.mcpu_cpi;
5393 
5394 	if (cpi->cpi_maxeax >= 1) {
5395 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5396 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5397 		uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5398 
5399 		*edx = CPI_FEATURES_EDX(cpi);
5400 		*ecx = CPI_FEATURES_ECX(cpi);
5401 		*ebx = CPI_FEATURES_7_0_EBX(cpi);
5402 
5403 		/*
5404 		 * [these require explicit kernel support]
5405 		 */
5406 		if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5407 			*edx &= ~CPUID_INTC_EDX_SEP;
5408 
5409 		if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5410 			*edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5411 		if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5412 			*edx &= ~CPUID_INTC_EDX_SSE2;
5413 
5414 		if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5415 			*edx &= ~CPUID_INTC_EDX_HTT;
5416 
5417 		if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5418 			*ecx &= ~CPUID_INTC_ECX_SSE3;
5419 
5420 		if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5421 			*ecx &= ~CPUID_INTC_ECX_SSSE3;
5422 		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5423 			*ecx &= ~CPUID_INTC_ECX_SSE4_1;
5424 		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5425 			*ecx &= ~CPUID_INTC_ECX_SSE4_2;
5426 		if (!is_x86_feature(x86_featureset, X86FSET_AES))
5427 			*ecx &= ~CPUID_INTC_ECX_AES;
5428 		if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5429 			*ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5430 		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5431 			*ecx &= ~(CPUID_INTC_ECX_XSAVE |
5432 			    CPUID_INTC_ECX_OSXSAVE);
5433 		if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5434 			*ecx &= ~CPUID_INTC_ECX_AVX;
5435 		if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5436 			*ecx &= ~CPUID_INTC_ECX_F16C;
5437 		if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5438 			*ecx &= ~CPUID_INTC_ECX_FMA;
5439 		if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5440 			*ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5441 		if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5442 			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5443 		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5444 			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5445 		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5446 			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5447 		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5448 			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5449 
5450 		/*
5451 		 * [no explicit support required beyond x87 fp context]
5452 		 */
5453 		if (!fpu_exists)
5454 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5455 
5456 		/*
5457 		 * Now map the supported feature vector to things that we
5458 		 * think userland will care about.
5459 		 */
5460 		if (*edx & CPUID_INTC_EDX_SEP)
5461 			hwcap_flags |= AV_386_SEP;
5462 		if (*edx & CPUID_INTC_EDX_SSE)
5463 			hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5464 		if (*edx & CPUID_INTC_EDX_SSE2)
5465 			hwcap_flags |= AV_386_SSE2;
5466 		if (*ecx & CPUID_INTC_ECX_SSE3)
5467 			hwcap_flags |= AV_386_SSE3;
5468 		if (*ecx & CPUID_INTC_ECX_SSSE3)
5469 			hwcap_flags |= AV_386_SSSE3;
5470 		if (*ecx & CPUID_INTC_ECX_SSE4_1)
5471 			hwcap_flags |= AV_386_SSE4_1;
5472 		if (*ecx & CPUID_INTC_ECX_SSE4_2)
5473 			hwcap_flags |= AV_386_SSE4_2;
5474 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5475 			hwcap_flags |= AV_386_MOVBE;
5476 		if (*ecx & CPUID_INTC_ECX_AES)
5477 			hwcap_flags |= AV_386_AES;
5478 		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5479 			hwcap_flags |= AV_386_PCLMULQDQ;
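		/*
		 * Only advertise XSAVE-dependent features when both the
		 * XSAVE and OSXSAVE bits are set, i.e. the kernel has
		 * actually enabled XSAVE state management.
		 */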
5480 		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5481 		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5482 			hwcap_flags |= AV_386_XSAVE;
5483 
5484 			if (*ecx & CPUID_INTC_ECX_AVX) {
5485 				uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5486 				uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5487 
5488 				hwcap_flags |= AV_386_AVX;
5489 				if (*ecx & CPUID_INTC_ECX_F16C)
5490 					hwcap_flags_2 |= AV_386_2_F16C;
5491 				if (*ecx & CPUID_INTC_ECX_FMA)
5492 					hwcap_flags_2 |= AV_386_2_FMA;
5493 
5494 				if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5495 					hwcap_flags_2 |= AV_386_2_BMI1;
5496 				if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5497 					hwcap_flags_2 |= AV_386_2_BMI2;
5498 				if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5499 					hwcap_flags_2 |= AV_386_2_AVX2;
5500 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5501 					hwcap_flags_2 |= AV_386_2_AVX512F;
5502 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5503 					hwcap_flags_2 |= AV_386_2_AVX512DQ;
5504 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5505 					hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5506 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5507 					hwcap_flags_2 |= AV_386_2_AVX512PF;
5508 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5509 					hwcap_flags_2 |= AV_386_2_AVX512ER;
5510 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5511 					hwcap_flags_2 |= AV_386_2_AVX512CD;
5512 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5513 					hwcap_flags_2 |= AV_386_2_AVX512BW;
5514 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5515 					hwcap_flags_2 |= AV_386_2_AVX512VL;
5516 
5517 				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5518 					hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5519 				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5520 					hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5521 				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5522 					hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5523 				if (*ecx_7 & CPUID_INTC_ECX_7_0_VAES)
5524 					hwcap_flags_2 |= AV_386_2_VAES;
5525 				if (*ecx_7 & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
5526 					hwcap_flags_2 |= AV_386_2_VPCLMULQDQ;
5527 
5528 				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5529 					hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5530 				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5531 					hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5532 			}
5533 		}
5534 		if (*ecx & CPUID_INTC_ECX_VMX)
5535 			hwcap_flags |= AV_386_VMX;
5536 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5537 			hwcap_flags |= AV_386_POPCNT;
5538 		if (*edx & CPUID_INTC_EDX_FPU)
5539 			hwcap_flags |= AV_386_FPU;
5540 		if (*edx & CPUID_INTC_EDX_MMX)
5541 			hwcap_flags |= AV_386_MMX;
5542 
5543 		if (*edx & CPUID_INTC_EDX_TSC)
5544 			hwcap_flags |= AV_386_TSC;
5545 		if (*edx & CPUID_INTC_EDX_CX8)
5546 			hwcap_flags |= AV_386_CX8;
5547 		if (*edx & CPUID_INTC_EDX_CMOV)
5548 			hwcap_flags |= AV_386_CMOV;
5549 		if (*ecx & CPUID_INTC_ECX_CX16)
5550 			hwcap_flags |= AV_386_CX16;
5551 
5552 		if (*ecx & CPUID_INTC_ECX_RDRAND)
5553 			hwcap_flags_2 |= AV_386_2_RDRAND;
5554 		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5555 			hwcap_flags_2 |= AV_386_2_ADX;
5556 		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5557 			hwcap_flags_2 |= AV_386_2_RDSEED;
5558 		if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5559 			hwcap_flags_2 |= AV_386_2_SHA;
5560 		if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5561 			hwcap_flags_2 |= AV_386_2_FSGSBASE;
5562 		if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5563 			hwcap_flags_2 |= AV_386_2_CLWB;
5564 		if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5565 			hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5566 
5567 	}
5568 	/*
5569 	 * Check a few miscellaneous features.
5570 	 */
5571 	if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5572 		hwcap_flags_2 |= AV_386_2_CLZERO;
5573 
5574 	if (cpi->cpi_xmaxeax < 0x80000001)
5575 		goto resolve_done;
5576 
5577 	switch (cpi->cpi_vendor) {
5578 		struct cpuid_regs cp;
5579 		uint32_t *edx, *ecx;
5580 
5581 	case X86_VENDOR_Intel:
5582 		/*
5583 		 * Seems like Intel duplicated what was necessary
5584 		 * here to make the initial crop of 64-bit OS's work.
5585 		 * Hopefully, those are the only "extended" bits
5586 		 * they'll add.
5587 		 */
5588 		/*FALLTHROUGH*/
5589 
5590 	case X86_VENDOR_AMD:
5591 	case X86_VENDOR_HYGON:
5592 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5593 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5594 
5595 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5596 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5597 
5598 		/*
5599 		 * [these features require explicit kernel support]
5600 		 */
5601 		switch (cpi->cpi_vendor) {
5602 		case X86_VENDOR_Intel:
5603 			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5604 				*edx &= ~CPUID_AMD_EDX_TSCP;
5605 			break;
5606 
5607 		case X86_VENDOR_AMD:
5608 		case X86_VENDOR_HYGON:
5609 			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5610 				*edx &= ~CPUID_AMD_EDX_TSCP;
5611 			if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5612 				*ecx &= ~CPUID_AMD_ECX_SSE4A;
5613 			break;
5614 
5615 		default:
5616 			break;
5617 		}
5618 
5619 		/*
5620 		 * [no explicit support required beyond
5621 		 * x87 fp context and exception handlers]
5622 		 */
5623 		if (!fpu_exists)
5624 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5625 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5626 
5627 		if (!is_x86_feature(x86_featureset, X86FSET_NX))
5628 			*edx &= ~CPUID_AMD_EDX_NX;
5629 		/*
5630 		 * Now map the supported feature vector to
5631 		 * things that we think userland will care about.
5632 		 */
5633 		if (*edx & CPUID_AMD_EDX_SYSC)
5634 			hwcap_flags |= AV_386_AMD_SYSC;
5635 		if (*edx & CPUID_AMD_EDX_MMXamd)
5636 			hwcap_flags |= AV_386_AMD_MMX;
5637 		if (*edx & CPUID_AMD_EDX_3DNow)
5638 			hwcap_flags |= AV_386_AMD_3DNow;
5639 		if (*edx & CPUID_AMD_EDX_3DNowx)
5640 			hwcap_flags |= AV_386_AMD_3DNowx;
5641 		if (*ecx & CPUID_AMD_ECX_SVM)
5642 			hwcap_flags |= AV_386_AMD_SVM;
5643 
5644 		switch (cpi->cpi_vendor) {
5645 		case X86_VENDOR_AMD:
5646 		case X86_VENDOR_HYGON:
5647 			if (*edx & CPUID_AMD_EDX_TSCP)
5648 				hwcap_flags |= AV_386_TSCP;
5649 			if (*ecx & CPUID_AMD_ECX_AHF64)
5650 				hwcap_flags |= AV_386_AHF;
5651 			if (*ecx & CPUID_AMD_ECX_SSE4A)
5652 				hwcap_flags |= AV_386_AMD_SSE4A;
5653 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5654 				hwcap_flags |= AV_386_AMD_LZCNT;
5655 			if (*ecx & CPUID_AMD_ECX_MONITORX)
5656 				hwcap_flags_2 |= AV_386_2_MONITORX;
5657 			break;
5658 
5659 		case X86_VENDOR_Intel:
5660 			if (*edx & CPUID_AMD_EDX_TSCP)
5661 				hwcap_flags |= AV_386_TSCP;
5662 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5663 				hwcap_flags |= AV_386_AMD_LZCNT;
5664 			/*
5665 			 * Aarrgh.
5666 			 * Intel uses a different bit in the same word.
5667 			 */
5668 			if (*ecx & CPUID_INTC_ECX_AHF64)
5669 				hwcap_flags |= AV_386_AHF;
5670 			break;
5671 
5672 		default:
5673 			break;
5674 		}
5675 		break;
5676 
5677 	case X86_VENDOR_TM:
5678 		cp.cp_eax = 0x80860001;
5679 		(void) __cpuid_insn(&cp);
5680 		cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5681 		break;
5682 
5683 	default:
5684 		break;
5685 	}
5686 
5687 resolve_done:
5688 	if (hwcap_out != NULL) {
5689 		hwcap_out[0] = hwcap_flags;
5690 		hwcap_out[1] = hwcap_flags_2;
5691 	}
5692 }
5693 
5694 
5695 /*
5696  * Simulate the cpuid instruction using the data we previously
5697  * captured about this CPU.  We try our best to return the truth
5698  * about the hardware, independently of kernel support.
5699  */
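/*
 * For example, a caller interested in the cached deterministic cache
 * parameters might do something like the following sketch (illustrative
 * only, not taken from any in-tree caller):
 *
 *	struct cpuid_regs r = { 0 };
 *	r.cp_eax = 4;			(leaf 4, %ecx left at 0)
 *	(void) cpuid_insn(NULL, &r);	(NULL selects the current CPU)
 */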
5700 uint32_t
5701 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5702 {
5703 	struct cpuid_info *cpi;
5704 	struct cpuid_regs *xcp;
5705 
5706 	if (cpu == NULL)
5707 		cpu = CPU;
5708 	cpi = cpu->cpu_m.mcpu_cpi;
5709 
5710 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5711 
5712 	/*
5713 	 * CPUID data is cached in two separate places: cpi_std for standard
5714 	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5715 	 */
5716 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5717 		xcp = &cpi->cpi_std[cp->cp_eax];
5718 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5719 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5720 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5721 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5722 	} else {
5723 		/*
5724 		 * The caller is asking for data from an input parameter which
5725 		 * the kernel has not cached.  In this case we go fetch from
5726 		 * the hardware and return the data directly to the user.
5727 		 */
5728 		return (__cpuid_insn(cp));
5729 	}
5730 
5731 	cp->cp_eax = xcp->cp_eax;
5732 	cp->cp_ebx = xcp->cp_ebx;
5733 	cp->cp_ecx = xcp->cp_ecx;
5734 	cp->cp_edx = xcp->cp_edx;
5735 	return (cp->cp_eax);
5736 }
5737 
5738 boolean_t
5739 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5740 {
5741 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5742 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5743 }
5744 
5745 int
5746 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5747 {
5748 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5749 
5750 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5751 }
5752 
5753 int
5754 cpuid_is_cmt(cpu_t *cpu)
5755 {
5756 	if (cpu == NULL)
5757 		cpu = CPU;
5758 
5759 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5760 
5761 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5762 }
5763 
5764 /*
5765  * AMD and Intel both implement the 64-bit variant of the syscall
5766  * instruction (syscallq), so if there's -any- support for syscall,
5767  * cpuid currently says "yes, we support this".
5768  *
5769  * However, Intel decided to -not- implement the 32-bit variant of the
5770  * syscall instruction, so we provide a predicate to allow our caller
5771  * to test that subtlety here.
5772  *
5773  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5774  *	even in the case where the hardware would in fact support it.
5775  */
5776 /*ARGSUSED*/
5777 int
5778 cpuid_syscall32_insn(cpu_t *cpu)
5779 {
5780 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5781 
5782 #if !defined(__xpv)
5783 	if (cpu == NULL)
5784 		cpu = CPU;
5785 
5786 	/*CSTYLED*/
5787 	{
5788 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5789 
5790 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5791 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5792 		    cpi->cpi_xmaxeax >= 0x80000001 &&
5793 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5794 			return (1);
5795 	}
5796 #endif
5797 	return (0);
5798 }
5799 
5800 int
5801 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5802 {
5803 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5804 
5805 	static const char fmt[] =
5806 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5807 	static const char fmt_ht[] =
5808 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5809 
5810 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5811 
5812 	if (cpuid_is_cmt(cpu))
5813 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5814 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5815 		    cpi->cpi_family, cpi->cpi_model,
5816 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5817 	return (snprintf(s, n, fmt,
5818 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5819 	    cpi->cpi_family, cpi->cpi_model,
5820 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5821 }
5822 
5823 const char *
5824 cpuid_getvendorstr(cpu_t *cpu)
5825 {
5826 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5827 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5828 }
5829 
5830 uint_t
5831 cpuid_getvendor(cpu_t *cpu)
5832 {
5833 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5834 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5835 }
5836 
5837 uint_t
5838 cpuid_getfamily(cpu_t *cpu)
5839 {
5840 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5841 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5842 }
5843 
5844 uint_t
5845 cpuid_getmodel(cpu_t *cpu)
5846 {
5847 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5848 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5849 }
5850 
5851 uint_t
5852 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5853 {
5854 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5855 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5856 }
5857 
5858 uint_t
5859 cpuid_get_ncore_per_chip(cpu_t *cpu)
5860 {
5861 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5862 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5863 }
5864 
5865 uint_t
5866 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5867 {
5868 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5869 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5870 }
5871 
5872 id_t
5873 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5874 {
5875 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5876 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5877 }
5878 
5879 uint_t
5880 cpuid_getstep(cpu_t *cpu)
5881 {
5882 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5883 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5884 }
5885 
5886 uint_t
5887 cpuid_getsig(struct cpu *cpu)
5888 {
5889 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5890 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5891 }
5892 
5893 uint32_t
5894 cpuid_getchiprev(struct cpu *cpu)
5895 {
5896 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5897 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5898 }
5899 
5900 const char *
5901 cpuid_getchiprevstr(struct cpu *cpu)
5902 {
5903 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5904 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5905 }
5906 
5907 uint32_t
5908 cpuid_getsockettype(struct cpu *cpu)
5909 {
5910 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5911 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5912 }
5913 
5914 const char *
5915 cpuid_getsocketstr(cpu_t *cpu)
5916 {
5917 	static const char *socketstr = NULL;
5918 	struct cpuid_info *cpi;
5919 
5920 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5921 	cpi = cpu->cpu_m.mcpu_cpi;
5922 
5923 	/* Assume that socket types are the same across the system */
5924 	if (socketstr == NULL)
5925 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5926 		    cpi->cpi_model, cpi->cpi_step);
5927 
5928 
5929 	return (socketstr);
5930 }
5931 
5932 int
5933 cpuid_get_chipid(cpu_t *cpu)
5934 {
5935 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5936 
5937 	if (cpuid_is_cmt(cpu))
5938 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5939 	return (cpu->cpu_id);
5940 }
5941 
5942 id_t
5943 cpuid_get_coreid(cpu_t *cpu)
5944 {
5945 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5946 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5947 }
5948 
5949 int
5950 cpuid_get_pkgcoreid(cpu_t *cpu)
5951 {
5952 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5953 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5954 }
5955 
5956 int
5957 cpuid_get_clogid(cpu_t *cpu)
5958 {
5959 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5960 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5961 }
5962 
5963 int
5964 cpuid_get_cacheid(cpu_t *cpu)
5965 {
5966 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5967 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5968 }
5969 
5970 uint_t
5971 cpuid_get_procnodeid(cpu_t *cpu)
5972 {
5973 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5974 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5975 }
5976 
5977 uint_t
5978 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5979 {
5980 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5981 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5982 }
5983 
5984 uint_t
5985 cpuid_get_compunitid(cpu_t *cpu)
5986 {
5987 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5988 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5989 }
5990 
5991 uint_t
5992 cpuid_get_cores_per_compunit(cpu_t *cpu)
5993 {
5994 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5995 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5996 }
5997 
5998 uint32_t
5999 cpuid_get_apicid(cpu_t *cpu)
6000 {
6001 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6002 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6003 		return (UINT32_MAX);
6004 	} else {
6005 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6006 	}
6007 }
6008 
6009 void
6010 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6011 {
6012 	struct cpuid_info *cpi;
6013 
6014 	if (cpu == NULL)
6015 		cpu = CPU;
6016 	cpi = cpu->cpu_m.mcpu_cpi;
6017 
6018 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6019 
6020 	if (pabits)
6021 		*pabits = cpi->cpi_pabits;
6022 	if (vabits)
6023 		*vabits = cpi->cpi_vabits;
6024 }
6025 
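/*
 * Return the size of the XSAVE area required on this system: the maximum
 * save area size recorded for the boot CPU, but never less than the size of
 * the base struct xsave_state.
 */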
6026 size_t
6027 cpuid_get_xsave_size()
6028 {
6029 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6030 	    sizeof (struct xsave_state)));
6031 }
6032 
6033 /*
6034  * Return true if the CPUs on this system require 'pointer clearing' for the
6035  * floating point error pointer exception handling. In the past, this has been
6036  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6037  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6038  * feature bit and is reflected in the cpi_fp_amd_save member.
6039  */
6040 boolean_t
6041 cpuid_need_fp_excp_handling()
6042 {
6043 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6044 	    cpuid_info0.cpi_fp_amd_save != 0);
6045 }
6046 
6047 /*
6048  * Returns the number of data TLB entries for a corresponding
6049  * pagesize.  If it can't be computed, or isn't known, the
6050  * routine returns zero.  If you ask about an architecturally
6051  * impossible pagesize, the routine will panic (so that the
6052  * hat implementor knows that things are inconsistent.)
6053  */
6054 uint_t
6055 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6056 {
6057 	struct cpuid_info *cpi;
6058 	uint_t dtlb_nent = 0;
6059 
6060 	if (cpu == NULL)
6061 		cpu = CPU;
6062 	cpi = cpu->cpu_m.mcpu_cpi;
6063 
6064 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6065 
6066 	/*
6067 	 * Check the L2 TLB info
6068 	 */
6069 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6070 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6071 
6072 		switch (pagesize) {
6073 
6074 		case 4 * 1024:
6075 			/*
6076 			 * All zero in the top 16 bits of the register
6077 			 * indicates a unified TLB. Size is in low 16 bits.
6078 			 */
6079 			if ((cp->cp_ebx & 0xffff0000) == 0)
6080 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6081 			else
6082 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6083 			break;
6084 
6085 		case 2 * 1024 * 1024:
6086 			if ((cp->cp_eax & 0xffff0000) == 0)
6087 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6088 			else
6089 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6090 			break;
6091 
6092 		default:
6093 			panic("unknown L2 pagesize");
6094 			/*NOTREACHED*/
6095 		}
6096 	}
6097 
6098 	if (dtlb_nent != 0)
6099 		return (dtlb_nent);
6100 
6101 	/*
6102 	 * No L2 TLB support for this size, try L1.
6103 	 */
6104 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6105 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6106 
6107 		switch (pagesize) {
6108 		case 4 * 1024:
6109 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6110 			break;
6111 		case 2 * 1024 * 1024:
6112 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6113 			break;
6114 		default:
6115 			panic("unknown L1 d-TLB pagesize");
6116 			/*NOTREACHED*/
6117 		}
6118 	}
6119 
6120 	return (dtlb_nent);
6121 }
6122 
6123 /*
6124  * Return 0 if the erratum is not present or not applicable, positive
6125  * if it is, and negative if the status of the erratum is unknown.
6126  *
6127  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6128  * Processors" #25759, Rev 3.57, August 2005
6129  */
6130 int
6131 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6132 {
6133 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6134 	uint_t eax;
6135 
6136 	/*
6137 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6138 	 * a legacy (32-bit) AMD CPU.
6139 	 */
6140 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6141 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6142 	    cpi->cpi_family == 6) {
6143 		return (0);
6144 	}
6145 
6146 	eax = cpi->cpi_std[1].cp_eax;
6147 
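/*
 * Each macro below matches the cpuid function 1 %eax signature of one or
 * more specific silicon revisions (e.g. SH-B0, DH-CG, DR-B2), so that the
 * erratum tests that follow can be written in terms of revision names.
 */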
6148 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6149 #define	SH_B3(eax)	(eax == 0xf51)
6150 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6151 
6152 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6153 
6154 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6155 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6156 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6157 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6158 
6159 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6160 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6161 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6162 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6163 
6164 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6165 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6166 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6167 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6168 #define	BH_E4(eax)	(eax == 0x20fb1)
6169 #define	SH_E5(eax)	(eax == 0x20f42)
6170 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6171 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6172 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6173 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6174 			    DH_E6(eax) || JH_E6(eax))
6175 
6176 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6177 #define	DR_B0(eax)	(eax == 0x100f20)
6178 #define	DR_B1(eax)	(eax == 0x100f21)
6179 #define	DR_BA(eax)	(eax == 0x100f2a)
6180 #define	DR_B2(eax)	(eax == 0x100f22)
6181 #define	DR_B3(eax)	(eax == 0x100f23)
6182 #define	RB_C0(eax)	(eax == 0x100f40)
6183 
6184 	switch (erratum) {
6185 	case 1:
6186 		return (cpi->cpi_family < 0x10);
6187 	case 51:	/* what does the asterisk mean? */
6188 		return (B(eax) || SH_C0(eax) || CG(eax));
6189 	case 52:
6190 		return (B(eax));
6191 	case 57:
6192 		return (cpi->cpi_family <= 0x11);
6193 	case 58:
6194 		return (B(eax));
6195 	case 60:
6196 		return (cpi->cpi_family <= 0x11);
6197 	case 61:
6198 	case 62:
6199 	case 63:
6200 	case 64:
6201 	case 65:
6202 	case 66:
6203 	case 68:
6204 	case 69:
6205 	case 70:
6206 	case 71:
6207 		return (B(eax));
6208 	case 72:
6209 		return (SH_B0(eax));
6210 	case 74:
6211 		return (B(eax));
6212 	case 75:
6213 		return (cpi->cpi_family < 0x10);
6214 	case 76:
6215 		return (B(eax));
6216 	case 77:
6217 		return (cpi->cpi_family <= 0x11);
6218 	case 78:
6219 		return (B(eax) || SH_C0(eax));
6220 	case 79:
6221 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6222 	case 80:
6223 	case 81:
6224 	case 82:
6225 		return (B(eax));
6226 	case 83:
6227 		return (B(eax) || SH_C0(eax) || CG(eax));
6228 	case 85:
6229 		return (cpi->cpi_family < 0x10);
6230 	case 86:
6231 		return (SH_C0(eax) || CG(eax));
6232 	case 88:
6233 		return (B(eax) || SH_C0(eax));
6234 	case 89:
6235 		return (cpi->cpi_family < 0x10);
6236 	case 90:
6237 		return (B(eax) || SH_C0(eax) || CG(eax));
6238 	case 91:
6239 	case 92:
6240 		return (B(eax) || SH_C0(eax));
6241 	case 93:
6242 		return (SH_C0(eax));
6243 	case 94:
6244 		return (B(eax) || SH_C0(eax) || CG(eax));
6245 	case 95:
6246 		return (B(eax) || SH_C0(eax));
6247 	case 96:
6248 		return (B(eax) || SH_C0(eax) || CG(eax));
6249 	case 97:
6250 	case 98:
6251 		return (SH_C0(eax) || CG(eax));
6252 	case 99:
6253 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6254 	case 100:
6255 		return (B(eax) || SH_C0(eax));
6256 	case 101:
6257 	case 103:
6258 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6259 	case 104:
6260 		return (SH_C0(eax) || CG(eax) || D0(eax));
6261 	case 105:
6262 	case 106:
6263 	case 107:
6264 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6265 	case 108:
6266 		return (DH_CG(eax));
6267 	case 109:
6268 		return (SH_C0(eax) || CG(eax) || D0(eax));
6269 	case 110:
6270 		return (D0(eax) || EX(eax));
6271 	case 111:
6272 		return (CG(eax));
6273 	case 112:
6274 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6275 	case 113:
6276 		return (eax == 0x20fc0);
6277 	case 114:
6278 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6279 	case 115:
6280 		return (SH_E0(eax) || JH_E1(eax));
6281 	case 116:
6282 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6283 	case 117:
6284 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6285 	case 118:
6286 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6287 		    JH_E6(eax));
6288 	case 121:
6289 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6290 	case 122:
6291 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6292 	case 123:
6293 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6294 	case 131:
6295 		return (cpi->cpi_family < 0x10);
6296 	case 6336786:
6297 
6298 		/*
6299 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6300 		 * if this is a K8 family or newer processor. We're testing for
6301 		 * this 'erratum' to determine whether or not we have a constant
6302 		 * TSC.
6303 		 *
6304 		 * Our current fix for this is to disable the C1-Clock ramping.
6305 		 * However, this doesn't work on newer processor families nor
6306 		 * does it work when virtualized as those devices don't exist.
6307 		 */
6308 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6309 			return (0);
6310 		}
6311 
6312 		if (CPI_FAMILY(cpi) == 0xf) {
6313 			struct cpuid_regs regs;
6314 			regs.cp_eax = 0x80000007;
6315 			(void) __cpuid_insn(&regs);
6316 			return (!(regs.cp_edx & 0x100));
6317 		}
6318 		return (0);
6319 	case 147:
6320 		/*
6321 		 * This erratum (K8 #147) is not present on family 10 and newer.
6322 		 */
6323 		if (cpi->cpi_family >= 0x10) {
6324 			return (0);
6325 		}
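		/*
		 * Reassemble the effective family (base + extended) in bits
		 * 15:8 and the effective model (extended and base nibbles)
		 * in bits 7:0 of a single value, then flag parts below
		 * 0xf40, i.e. family 0xf with model < 0x40.
		 */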
6326 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6327 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6328 
6329 	case 6671130:
6330 		/*
6331 		 * check for processors (pre-Shanghai) that do not provide
6332 		 * optimal management of 1gb ptes in their tlb.
6333 		 */
6334 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6335 
6336 	case 298:
6337 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6338 		    DR_B2(eax) || RB_C0(eax));
6339 
6340 	case 721:
6341 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6342 
6343 	default:
6344 		return (-1);
6345 
6346 	}
6347 }
6348 
6349 /*
6350  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6351  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6352  */
6353 int
6354 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6355 {
6356 	struct cpuid_info	*cpi;
6357 	uint_t			osvwid;
6358 	static int		osvwfeature = -1;
6359 	uint64_t		osvwlength;
6360 
6361 
6362 	cpi = cpu->cpu_m.mcpu_cpi;
6363 
6364 	/* confirm OSVW supported */
6365 	if (osvwfeature == -1) {
6366 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6367 	} else {
6368 		/* assert that osvw feature setting is consistent on all cpus */
6369 		ASSERT(osvwfeature ==
6370 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6371 	}
6372 	if (!osvwfeature)
6373 		return (-1);
6374 
6375 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6376 
6377 	switch (erratum) {
6378 	case 298:	/* osvwid is 0 */
6379 		osvwid = 0;
6380 		if (osvwlength <= (uint64_t)osvwid) {
6381 			/* osvwid 0 is unknown */
6382 			return (-1);
6383 		}
6384 
6385 		/*
6386 		 * Check the OSVW STATUS MSR to determine the state
6387 		 * of the erratum where:
6388 		 *   0 - fixed by HW
6389 		 *   1 - BIOS has applied the workaround when BIOS
6390 		 *   workaround is available. (Or for other errata,
6391 		 *   OS workaround is required.)
6392 		 * For a value of 1, caller will confirm that the
6393 		 * erratum 298 workaround has indeed been applied by BIOS.
6394 		 *
6395 		 * A 1 may be set in cpus that have a HW fix
6396 		 * in a mixed cpu system. Regarding erratum 298:
6397 		 *   In a multiprocessor platform, the workaround above
6398 		 *   should be applied to all processors regardless of
6399 		 *   silicon revision when an affected processor is
6400 		 *   present.
6401 		 */
6402 
6403 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6404 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6405 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6406 
6407 	default:
6408 		return (-1);
6409 	}
6410 }
6411 
6412 static const char assoc_str[] = "associativity";
6413 static const char line_str[] = "line-size";
6414 static const char size_str[] = "size";
6415 
6416 static void
6417 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6418     uint32_t val)
6419 {
6420 	char buf[128];
6421 
6422 	/*
6423 	 * ndi_prop_update_int() is used because it is desirable for
6424 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6425 	 */
6426 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6427 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6428 }
6429 
6430 /*
6431  * Intel-style cache/tlb description
6432  *
6433  * Standard cpuid level 2 gives a randomly ordered
6434  * selection of tags that index into a table that describes
6435  * cache and tlb properties.
6436  */
6437 
6438 static const char l1_icache_str[] = "l1-icache";
6439 static const char l1_dcache_str[] = "l1-dcache";
6440 static const char l2_cache_str[] = "l2-cache";
6441 static const char l3_cache_str[] = "l3-cache";
6442 static const char itlb4k_str[] = "itlb-4K";
6443 static const char dtlb4k_str[] = "dtlb-4K";
6444 static const char itlb2M_str[] = "itlb-2M";
6445 static const char itlb4M_str[] = "itlb-4M";
6446 static const char dtlb4M_str[] = "dtlb-4M";
6447 static const char dtlb24_str[] = "dtlb0-2M-4M";
6448 static const char itlb424_str[] = "itlb-4K-2M-4M";
6449 static const char itlb24_str[] = "itlb-2M-4M";
6450 static const char dtlb44_str[] = "dtlb-4K-4M";
6451 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6452 static const char sl2_cache_str[] = "sectored-l2-cache";
6453 static const char itrace_str[] = "itrace-cache";
6454 static const char sl3_cache_str[] = "sectored-l3-cache";
6455 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6456 
6457 static const struct cachetab {
6458 	uint8_t		ct_code;
6459 	uint8_t		ct_assoc;
6460 	uint16_t	ct_line_size;
6461 	size_t		ct_size;
6462 	const char	*ct_label;
6463 } intel_ctab[] = {
6464 	/*
6465 	 * maintain descending order!
6466 	 *
6467 	 * Codes ignored - Reason
6468 	 * ----------------------
6469 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6470 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6471 	 */
6472 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6473 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6474 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6475 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6476 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6477 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6478 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6479 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6480 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6481 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6482 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6483 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6484 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6485 	{ 0xc0, 4, 0, 8, dtlb44_str },
6486 	{ 0xba, 4, 0, 64, dtlb4k_str },
6487 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6488 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6489 	{ 0xb2, 4, 0, 64, itlb4k_str },
6490 	{ 0xb0, 4, 0, 128, itlb4k_str },
6491 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6492 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6493 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6494 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6495 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6496 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6497 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6498 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6499 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6500 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6501 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6502 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6503 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6504 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6505 	{ 0x73, 8, 0, 64*1024, itrace_str},
6506 	{ 0x72, 8, 0, 32*1024, itrace_str},
6507 	{ 0x71, 8, 0, 16*1024, itrace_str},
6508 	{ 0x70, 8, 0, 12*1024, itrace_str},
6509 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6510 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6511 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6512 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6513 	{ 0x5d, 0, 0, 256, dtlb44_str},
6514 	{ 0x5c, 0, 0, 128, dtlb44_str},
6515 	{ 0x5b, 0, 0, 64, dtlb44_str},
6516 	{ 0x5a, 4, 0, 32, dtlb24_str},
6517 	{ 0x59, 0, 0, 16, dtlb4k_str},
6518 	{ 0x57, 4, 0, 16, dtlb4k_str},
6519 	{ 0x56, 4, 0, 16, dtlb4M_str},
6520 	{ 0x55, 0, 0, 7, itlb24_str},
6521 	{ 0x52, 0, 0, 256, itlb424_str},
6522 	{ 0x51, 0, 0, 128, itlb424_str},
6523 	{ 0x50, 0, 0, 64, itlb424_str},
6524 	{ 0x4f, 0, 0, 32, itlb4k_str},
6525 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6526 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6527 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6528 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6529 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6530 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6531 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6532 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6533 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6534 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6535 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6536 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6537 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6538 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6539 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6540 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6541 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6542 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6543 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6544 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6545 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6546 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6547 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6548 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6549 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6550 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6551 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6552 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6553 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6554 	{ 0x0b, 4, 0, 4, itlb4M_str},
6555 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6556 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6557 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6558 	{ 0x05, 4, 0, 32, dtlb4M_str},
6559 	{ 0x04, 4, 0, 8, dtlb4M_str},
6560 	{ 0x03, 4, 0, 64, dtlb4k_str},
6561 	{ 0x02, 4, 0, 2, itlb4M_str},
6562 	{ 0x01, 4, 0, 32, itlb4k_str},
6563 	{ 0 }
6564 };
6565 
6566 static const struct cachetab cyrix_ctab[] = {
6567 	{ 0x70, 4, 0, 32, "tlb-4K" },
6568 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6569 	{ 0 }
6570 };
6571 
6572 /*
6573  * Search a cache table, sorted in descending ct_code order, for an exact match
6574  */
6575 static const struct cachetab *
6576 find_cacheent(const struct cachetab *ct, uint_t code)
6577 {
6578 	if (code != 0) {
6579 		for (; ct->ct_code != 0; ct++)
6580 			if (ct->ct_code <= code)
6581 				break;
6582 		if (ct->ct_code == code)
6583 			return (ct);
6584 	}
6585 	return (NULL);
6586 }
6587 
6588 /*
6589  * Populate cachetab entry with L2 or L3 cache-information using
6590  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6591  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6592  * information is found.
6593  */
6594 static int
6595 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6596 {
6597 	uint32_t level, i;
6598 	int ret = 0;
6599 
6600 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6601 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6602 
6603 		if (level == 2 || level == 3) {
6604 			ct->ct_assoc =
6605 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6606 			ct->ct_line_size =
6607 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
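			/*
			 * Leaf 4 cache size in bytes is:
			 * ways * partitions * line size * sets (%ecx + 1).
			 */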
6608 			ct->ct_size = ct->ct_assoc *
6609 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6610 			    ct->ct_line_size *
6611 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6612 
6613 			if (level == 2) {
6614 				ct->ct_label = l2_cache_str;
6615 			} else if (level == 3) {
6616 				ct->ct_label = l3_cache_str;
6617 			}
6618 			ret = 1;
6619 		}
6620 	}
6621 
6622 	return (ret);
6623 }
6624 
6625 /*
6626  * Walk the cacheinfo descriptors, applying 'func' to every valid element.
6627  * The walk is terminated if the walker returns non-zero.
6628  */
6629 static void
6630 intel_walk_cacheinfo(struct cpuid_info *cpi,
6631     void *arg, int (*func)(void *, const struct cachetab *))
6632 {
6633 	const struct cachetab *ct;
6634 	struct cachetab des_49_ct, des_b1_ct;
6635 	uint8_t *dp;
6636 	int i;
6637 
6638 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6639 		return;
6640 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6641 		/*
6642 		 * For overloaded descriptor 0x49 we use cpuid function 4
6643 		 * if supported by the current processor, to create
6644 		 * cache information.
6645 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6646 		 * to disambiguate the cache information.
6647 		 */
6648 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6649 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6650 			ct = &des_49_ct;
6651 		} else if (*dp == 0xb1) {
6652 			des_b1_ct.ct_code = 0xb1;
6653 			des_b1_ct.ct_assoc = 4;
6654 			des_b1_ct.ct_line_size = 0;
6655 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6656 				des_b1_ct.ct_size = 8;
6657 				des_b1_ct.ct_label = itlb2M_str;
6658 			} else {
6659 				des_b1_ct.ct_size = 4;
6660 				des_b1_ct.ct_label = itlb4M_str;
6661 			}
6662 			ct = &des_b1_ct;
6663 		} else {
6664 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6665 				continue;
6666 			}
6667 		}
6668 
6669 		if (func(arg, ct) != 0) {
6670 			break;
6671 		}
6672 	}
6673 }
6674 
6675 /*
6676  * (Like the Intel one, except for Cyrix CPUs)
6677  */
6678 static void
6679 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6680     void *arg, int (*func)(void *, const struct cachetab *))
6681 {
6682 	const struct cachetab *ct;
6683 	uint8_t *dp;
6684 	int i;
6685 
6686 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6687 		return;
6688 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6689 		/*
6690 		 * Search Cyrix-specific descriptor table first ..
6691 		 */
6692 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6693 			if (func(arg, ct) != 0)
6694 				break;
6695 			continue;
6696 		}
6697 		/*
6698 		 * .. else fall back to the Intel one
6699 		 */
6700 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6701 			if (func(arg, ct) != 0)
6702 				break;
6703 			continue;
6704 		}
6705 	}
6706 }
6707 
6708 /*
6709  * A cacheinfo walker that adds associativity, line-size, and size properties
6710  * to the devinfo node it is passed as an argument.
6711  */
6712 static int
6713 add_cacheent_props(void *arg, const struct cachetab *ct)
6714 {
6715 	dev_info_t *devi = arg;
6716 
6717 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6718 	if (ct->ct_line_size != 0)
6719 		add_cache_prop(devi, ct->ct_label, line_str,
6720 		    ct->ct_line_size);
6721 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6722 	return (0);
6723 }
6724 
6725 
6726 static const char fully_assoc[] = "fully-associative?";
6727 
6728 /*
6729  * AMD style cache/tlb description
6730  *
6731  * Extended functions 5 and 6 directly describe properties of
6732  * tlbs and various cache levels.
6733  */
6734 static void
6735 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6736 {
6737 	switch (assoc) {
6738 	case 0:	/* reserved; ignore */
6739 		break;
6740 	default:
6741 		add_cache_prop(devi, label, assoc_str, assoc);
6742 		break;
6743 	case 0xff:
6744 		add_cache_prop(devi, label, fully_assoc, 1);
6745 		break;
6746 	}
6747 }
6748 
6749 static void
6750 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6751 {
6752 	if (size == 0)
6753 		return;
6754 	add_cache_prop(devi, label, size_str, size);
6755 	add_amd_assoc(devi, label, assoc);
6756 }
6757 
6758 static void
6759 add_amd_cache(dev_info_t *devi, const char *label,
6760     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6761 {
6762 	if (size == 0 || line_size == 0)
6763 		return;
6764 	add_amd_assoc(devi, label, assoc);
6765 	/*
6766 	 * Most AMD parts have a sectored cache. Multiple cache lines are
6767 	 * associated with each tag. A sector consists of all cache lines
6768 	 * associated with a tag. For example, the AMD K6-III has a sector
6769 	 * size of 2 cache lines per tag.
6770 	 */
6771 	if (lines_per_tag != 0)
6772 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6773 	add_cache_prop(devi, label, line_str, line_size);
6774 	add_cache_prop(devi, label, size_str, size * 1024);
6775 }
6776 
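/*
 * Translate the 4-bit AMD L2/L3 associativity encoding into an associativity
 * property.  Encodings not listed here are treated as reserved and ignored.
 */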
6777 static void
6778 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6779 {
6780 	switch (assoc) {
6781 	case 0:	/* off */
6782 		break;
6783 	case 1:
6784 	case 2:
6785 	case 4:
6786 		add_cache_prop(devi, label, assoc_str, assoc);
6787 		break;
6788 	case 6:
6789 		add_cache_prop(devi, label, assoc_str, 8);
6790 		break;
6791 	case 8:
6792 		add_cache_prop(devi, label, assoc_str, 16);
6793 		break;
6794 	case 0xf:
6795 		add_cache_prop(devi, label, fully_assoc, 1);
6796 		break;
6797 	default: /* reserved; ignore */
6798 		break;
6799 	}
6800 }
6801 
6802 static void
6803 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6804 {
6805 	if (size == 0 || assoc == 0)
6806 		return;
6807 	add_amd_l2_assoc(devi, label, assoc);
6808 	add_cache_prop(devi, label, size_str, size);
6809 }
6810 
6811 static void
6812 add_amd_l2_cache(dev_info_t *devi, const char *label,
6813     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6814 {
6815 	if (size == 0 || assoc == 0 || line_size == 0)
6816 		return;
6817 	add_amd_l2_assoc(devi, label, assoc);
6818 	if (lines_per_tag != 0)
6819 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6820 	add_cache_prop(devi, label, line_str, line_size);
6821 	add_cache_prop(devi, label, size_str, size * 1024);
6822 }
6823 
6824 static void
6825 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6826 {
6827 	struct cpuid_regs *cp;
6828 
6829 	if (cpi->cpi_xmaxeax < 0x80000005)
6830 		return;
6831 	cp = &cpi->cpi_extd[5];
6832 
6833 	/*
6834 	 * 4M/2M L1 TLB configuration
6835 	 *
6836 	 * We report the size for 2M pages because AMD uses two
6837 	 * TLB entries for one 4M page.
6838 	 */
6839 	add_amd_tlb(devi, "dtlb-2M",
6840 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6841 	add_amd_tlb(devi, "itlb-2M",
6842 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6843 
6844 	/*
6845 	 * 4K L1 TLB configuration
6846 	 */
6847 
6848 	switch (cpi->cpi_vendor) {
6849 		uint_t nentries;
6850 	case X86_VENDOR_TM:
6851 		if (cpi->cpi_family >= 5) {
6852 			/*
6853 			 * Crusoe processors have 256 TLB entries, but
6854 			 * cpuid data format constrains them to only
6855 			 * reporting 255 of them.
6856 			 */
6857 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6858 				nentries = 256;
6859 			/*
6860 			 * Crusoe processors also have a unified TLB
6861 			 */
6862 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6863 			    nentries);
6864 			break;
6865 		}
6866 		/*FALLTHROUGH*/
6867 	default:
6868 		add_amd_tlb(devi, itlb4k_str,
6869 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6870 		add_amd_tlb(devi, dtlb4k_str,
6871 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6872 		break;
6873 	}
6874 
6875 	/*
6876 	 * data L1 cache configuration
6877 	 */
6878 
6879 	add_amd_cache(devi, l1_dcache_str,
6880 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6881 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6882 
6883 	/*
6884 	 * code L1 cache configuration
6885 	 */
6886 
6887 	add_amd_cache(devi, l1_icache_str,
6888 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6889 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6890 
6891 	if (cpi->cpi_xmaxeax < 0x80000006)
6892 		return;
6893 	cp = &cpi->cpi_extd[6];
6894 
6895 	/* Check for a unified L2 TLB for large pages */
6896 
6897 	if (BITX(cp->cp_eax, 31, 16) == 0)
6898 		add_amd_l2_tlb(devi, "l2-tlb-2M",
6899 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6900 	else {
6901 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
6902 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6903 		add_amd_l2_tlb(devi, "l2-itlb-2M",
6904 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6905 	}
6906 
6907 	/* Check for a unified L2 TLB for 4K pages */
6908 
6909 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
6910 		add_amd_l2_tlb(devi, "l2-tlb-4K",
6911 		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6912 	} else {
6913 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
6914 		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
6915 		add_amd_l2_tlb(devi, "l2-itlb-4K",
6916 		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6917 	}
6918 
6919 	add_amd_l2_cache(devi, l2_cache_str,
6920 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6921 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6922 }
6923 
6924 /*
6925  * There are two basic ways that the x86 world describes its cache
6926  * and tlb architecture - Intel's way and AMD's way.
6927  *
6928  * Return which flavor of cache architecture we should use
6929  */
6930 static int
6931 x86_which_cacheinfo(struct cpuid_info *cpi)
6932 {
6933 	switch (cpi->cpi_vendor) {
6934 	case X86_VENDOR_Intel:
6935 		if (cpi->cpi_maxeax >= 2)
6936 			return (X86_VENDOR_Intel);
6937 		break;
6938 	case X86_VENDOR_AMD:
6939 		/*
6940 		 * The K5 model 1 was the first part from AMD that reported
6941 		 * cache sizes via extended cpuid functions.
6942 		 */
6943 		if (cpi->cpi_family > 5 ||
6944 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6945 			return (X86_VENDOR_AMD);
6946 		break;
6947 	case X86_VENDOR_HYGON:
6948 		return (X86_VENDOR_AMD);
6949 	case X86_VENDOR_TM:
6950 		if (cpi->cpi_family >= 5)
6951 			return (X86_VENDOR_AMD);
6952 		/*FALLTHROUGH*/
6953 	default:
6954 		/*
6955 		 * If they have extended CPU data for 0x80000005
6956 		 * then we assume they have AMD-format cache
6957 		 * information.
6958 		 *
6959 		 * If not, and the vendor happens to be Cyrix,
6960 		 * then try our Cyrix-specific handler.
6961 		 *
6962 		 * If we're not Cyrix, then assume we're using Intel's
6963 		 * table-driven format instead.
6964 		 */
6965 		if (cpi->cpi_xmaxeax >= 0x80000005)
6966 			return (X86_VENDOR_AMD);
6967 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6968 			return (X86_VENDOR_Cyrix);
6969 		else if (cpi->cpi_maxeax >= 2)
6970 			return (X86_VENDOR_Intel);
6971 		break;
6972 	}
6973 	return (-1);
6974 }
6975 
6976 void
6977 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6978     struct cpuid_info *cpi)
6979 {
6980 	dev_info_t *cpu_devi;
6981 	int create;
6982 
6983 	cpu_devi = (dev_info_t *)dip;
6984 
6985 	/* device_type */
6986 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6987 	    "device_type", "cpu");
6988 
6989 	/* reg */
6990 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6991 	    "reg", cpu_id);
6992 
6993 	/* cpu-mhz, and clock-frequency */
6994 	if (cpu_freq > 0) {
6995 		long long mul;
6996 
6997 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6998 		    "cpu-mhz", cpu_freq);
6999 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7000 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7001 			    "clock-frequency", (int)mul);
7002 	}
7003 
7004 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7005 
7006 	/* vendor-id */
7007 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7008 	    "vendor-id", cpi->cpi_vendorstr);
7009 
7010 	if (cpi->cpi_maxeax == 0) {
7011 		return;
7012 	}
7013 
7014 	/*
7015 	 * family, model, and step
7016 	 */
7017 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7018 	    "family", CPI_FAMILY(cpi));
7019 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7020 	    "cpu-model", CPI_MODEL(cpi));
7021 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7022 	    "stepping-id", CPI_STEP(cpi));
7023 
7024 	/* type */
7025 	switch (cpi->cpi_vendor) {
7026 	case X86_VENDOR_Intel:
7027 		create = 1;
7028 		break;
7029 	default:
7030 		create = 0;
7031 		break;
7032 	}
7033 	if (create)
7034 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7035 		    "type", CPI_TYPE(cpi));
7036 
7037 	/* ext-family */
7038 	switch (cpi->cpi_vendor) {
7039 	case X86_VENDOR_Intel:
7040 	case X86_VENDOR_AMD:
7041 		create = cpi->cpi_family >= 0xf;
7042 		break;
7043 	case X86_VENDOR_HYGON:
7044 		create = 1;
7045 		break;
7046 	default:
7047 		create = 0;
7048 		break;
7049 	}
7050 	if (create)
7051 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7052 		    "ext-family", CPI_FAMILY_XTD(cpi));
7053 
7054 	/* ext-model */
7055 	switch (cpi->cpi_vendor) {
7056 	case X86_VENDOR_Intel:
7057 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7058 		break;
7059 	case X86_VENDOR_AMD:
7060 		create = CPI_FAMILY(cpi) == 0xf;
7061 		break;
7062 	case X86_VENDOR_HYGON:
7063 		create = 1;
7064 		break;
7065 	default:
7066 		create = 0;
7067 		break;
7068 	}
7069 	if (create)
7070 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7071 		    "ext-model", CPI_MODEL_XTD(cpi));
7072 
7073 	/* generation */
7074 	switch (cpi->cpi_vendor) {
7075 	case X86_VENDOR_AMD:
7076 	case X86_VENDOR_HYGON:
7077 		/*
7078 		 * AMD K5 model 1 was the first part to support this
7079 		 */
7080 		create = cpi->cpi_xmaxeax >= 0x80000001;
7081 		break;
7082 	default:
7083 		create = 0;
7084 		break;
7085 	}
7086 	if (create)
7087 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7088 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7089 
7090 	/* brand-id */
7091 	switch (cpi->cpi_vendor) {
7092 	case X86_VENDOR_Intel:
7093 		/*
7094 		 * brand id first appeared on Pentium III Xeon model 8 and
7095 		 * Celeron model 8 processors, and on Opteron
7096 		 */
7097 		create = cpi->cpi_family > 6 ||
7098 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7099 		break;
7100 	case X86_VENDOR_AMD:
7101 		create = cpi->cpi_family >= 0xf;
7102 		break;
7103 	case X86_VENDOR_HYGON:
7104 		create = 1;
7105 		break;
7106 	default:
7107 		create = 0;
7108 		break;
7109 	}
7110 	if (create && cpi->cpi_brandid != 0) {
7111 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7112 		    "brand-id", cpi->cpi_brandid);
7113 	}
7114 
7115 	/* chunks, and apic-id */
7116 	switch (cpi->cpi_vendor) {
7117 		/*
7118 		 * first available on Pentium IV and Opteron (K8)
7119 		 */
7120 	case X86_VENDOR_Intel:
7121 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7122 		break;
7123 	case X86_VENDOR_AMD:
7124 		create = cpi->cpi_family >= 0xf;
7125 		break;
7126 	case X86_VENDOR_HYGON:
7127 		create = 1;
7128 		break;
7129 	default:
7130 		create = 0;
7131 		break;
7132 	}
7133 	if (create) {
7134 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7135 		    "chunks", CPI_CHUNKS(cpi));
7136 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7137 		    "apic-id", cpi->cpi_apicid);
7138 		if (cpi->cpi_chipid >= 0) {
7139 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7140 			    "chip#", cpi->cpi_chipid);
7141 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7142 			    "clog#", cpi->cpi_clogid);
7143 		}
7144 	}
7145 
7146 	/* cpuid-features */
7147 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7148 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7149 
7150 
7151 	/* cpuid-features-ecx */
7152 	switch (cpi->cpi_vendor) {
7153 	case X86_VENDOR_Intel:
7154 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7155 		break;
7156 	case X86_VENDOR_AMD:
7157 		create = cpi->cpi_family >= 0xf;
7158 		break;
7159 	case X86_VENDOR_HYGON:
7160 		create = 1;
7161 		break;
7162 	default:
7163 		create = 0;
7164 		break;
7165 	}
7166 	if (create)
7167 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7168 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7169 
7170 	/* ext-cpuid-features */
7171 	switch (cpi->cpi_vendor) {
7172 	case X86_VENDOR_Intel:
7173 	case X86_VENDOR_AMD:
7174 	case X86_VENDOR_HYGON:
7175 	case X86_VENDOR_Cyrix:
7176 	case X86_VENDOR_TM:
7177 	case X86_VENDOR_Centaur:
7178 		create = cpi->cpi_xmaxeax >= 0x80000001;
7179 		break;
7180 	default:
7181 		create = 0;
7182 		break;
7183 	}
7184 	if (create) {
7185 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7186 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7187 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7188 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7189 	}
7190 
7191 	/*
7192 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7193 	 * model 1, and Cyrix GXm.  On earlier models we try and
7194 	 * simulate something similar .. so this string should always
7195 	 * say -something- about the processor, however lame.
7196 	 */
7197 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7198 	    "brand-string", cpi->cpi_brandstr);
7199 
7200 	/*
7201 	 * Finally, cache and tlb information
7202 	 */
7203 	switch (x86_which_cacheinfo(cpi)) {
7204 	case X86_VENDOR_Intel:
7205 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7206 		break;
7207 	case X86_VENDOR_Cyrix:
7208 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7209 		break;
7210 	case X86_VENDOR_AMD:
7211 		amd_cache_info(cpi, cpu_devi);
7212 		break;
7213 	default:
7214 		break;
7215 	}
7216 }
7217 
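/*
 * Scratch state passed to the cacheinfo walkers by getl2cacheinfo() to
 * collect L2 cache geometry.
 */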
7218 struct l2info {
7219 	int *l2i_csz;
7220 	int *l2i_lsz;
7221 	int *l2i_assoc;
7222 	int l2i_ret;
7223 };
7224 
7225 /*
7226  * A cacheinfo walker that fetches the size, line-size and associativity
7227  * of the L2 cache
7228  */
7229 static int
7230 intel_l2cinfo(void *arg, const struct cachetab *ct)
7231 {
7232 	struct l2info *l2i = arg;
7233 	int *ip;
7234 
7235 	if (ct->ct_label != l2_cache_str &&
7236 	    ct->ct_label != sl2_cache_str)
7237 		return (0);	/* not an L2 -- keep walking */
7238 
7239 	if ((ip = l2i->l2i_csz) != NULL)
7240 		*ip = ct->ct_size;
7241 	if ((ip = l2i->l2i_lsz) != NULL)
7242 		*ip = ct->ct_line_size;
7243 	if ((ip = l2i->l2i_assoc) != NULL)
7244 		*ip = ct->ct_assoc;
7245 	l2i->l2i_ret = ct->ct_size;
7246 	return (1);		/* was an L2 -- terminate walk */
7247 }
7248 
7249 /*
7250  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7251  *
7252  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7253  *	value is the associativity, the associativity for the L2 cache and
7254  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7255  *	an index into the amd_afd[] array to determine the associativity.
7256  *	-1 is undefined. 0 is fully associative.
7257  */
7258 
7259 static int amd_afd[] =
7260 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7261 
7262 static void
7263 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7264 {
7265 	struct cpuid_regs *cp;
7266 	uint_t size, assoc;
7267 	int i;
7268 	int *ip;
7269 
7270 	if (cpi->cpi_xmaxeax < 0x80000006)
7271 		return;
7272 	cp = &cpi->cpi_extd[6];
7273 
7274 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7275 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7276 		uint_t cachesz = size * 1024;
7277 		assoc = amd_afd[i];
7278 
7279 		ASSERT(assoc != -1);
7280 
7281 		if ((ip = l2i->l2i_csz) != NULL)
7282 			*ip = cachesz;
7283 		if ((ip = l2i->l2i_lsz) != NULL)
7284 			*ip = BITX(cp->cp_ecx, 7, 0);
7285 		if ((ip = l2i->l2i_assoc) != NULL)
7286 			*ip = assoc;
7287 		l2i->l2i_ret = cachesz;
7288 	}
7289 }
7290 
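/*
 * Fill in *csz, *lsz and *assoc (when non-NULL) with the L2 cache size, line
 * size and associativity for 'cpu', using whichever cache description flavor
 * the processor supports.  Returns the L2 cache size in bytes, or -1 if it
 * could not be determined.
 */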
7291 int
7292 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7293 {
7294 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7295 	struct l2info __l2info, *l2i = &__l2info;
7296 
7297 	l2i->l2i_csz = csz;
7298 	l2i->l2i_lsz = lsz;
7299 	l2i->l2i_assoc = assoc;
7300 	l2i->l2i_ret = -1;
7301 
7302 	switch (x86_which_cacheinfo(cpi)) {
7303 	case X86_VENDOR_Intel:
7304 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7305 		break;
7306 	case X86_VENDOR_Cyrix:
7307 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7308 		break;
7309 	case X86_VENDOR_AMD:
7310 		amd_l2cacheinfo(cpi, l2i);
7311 		break;
7312 	default:
7313 		break;
7314 	}
7315 	return (l2i->l2i_ret);
7316 }
7317 
7318 #if !defined(__xpv)
7319 
7320 uint32_t *
7321 cpuid_mwait_alloc(cpu_t *cpu)
7322 {
7323 	uint32_t	*ret;
7324 	size_t		mwait_size;
7325 
7326 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7327 
7328 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7329 	if (mwait_size == 0)
7330 		return (NULL);
7331 
7332 	/*
7333 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7334 	 * allocations.  mwait_size is currently cache line sized.  Neither
7335 	 * of these implementation details is guaranteed to be true in the
7336 	 * future.
7337 	 *
7338 	 * First try allocating mwait_size as kmem_alloc() currently returns
7339 	 * correctly aligned memory.  If kmem_alloc() does not return
7340 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7341 	 *
7342 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7343 	 * decide to free this memory.
7344 	 */
7345 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7346 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7347 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7348 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7349 		*ret = MWAIT_RUNNING;
7350 		return (ret);
7351 	} else {
7352 		kmem_free(ret, mwait_size);
7353 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7354 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7355 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7356 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7357 		*ret = MWAIT_RUNNING;
7358 		return (ret);
7359 	}
7360 }
7361 
7362 void
7363 cpuid_mwait_free(cpu_t *cpu)
7364 {
7365 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7366 		return;
7367 	}
7368 
7369 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7370 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7371 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7372 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7373 	}
7374 
7375 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7376 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7377 }
7378 
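/*
 * Overwrite the body of tsc_read() with the pre-built variant selected by
 * 'flag' -- a no-rdtsc stub, an lfence;rdtsc sequence, or rdtscp -- and
 * record the selection in tsc_type.
 */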
7379 void
7380 patch_tsc_read(int flag)
7381 {
7382 	size_t cnt;
7383 
7384 	switch (flag) {
7385 	case TSC_NONE:
7386 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7387 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7388 		break;
7389 	case TSC_RDTSC_LFENCE:
7390 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7391 		(void) memcpy((void *)tsc_read,
7392 		    (void *)&_tsc_lfence_start, cnt);
7393 		break;
7394 	case TSC_TSCP:
7395 		cnt = &_tscp_end - &_tscp_start;
7396 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7397 		break;
7398 	default:
7399 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7400 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7401 		break;
7402 	}
7403 	tsc_type = flag;
7404 }
7405 
7406 int
7407 cpuid_deep_cstates_supported(void)
7408 {
7409 	struct cpuid_info *cpi;
7410 	struct cpuid_regs regs;
7411 
7412 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7413 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7414 
7415 	cpi = CPU->cpu_m.mcpu_cpi;
7416 
7417 	switch (cpi->cpi_vendor) {
7418 	case X86_VENDOR_Intel:
7419 		if (cpi->cpi_xmaxeax < 0x80000007)
7420 			return (0);
7421 
7422 		/*
7423 		 * Does TSC run at a constant rate in all C-states?
7424 		 */
7425 		regs.cp_eax = 0x80000007;
7426 		(void) __cpuid_insn(&regs);
7427 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7428 
7429 	default:
7430 		return (0);
7431 	}
7432 }
7433 
7434 #endif	/* !__xpv */
7435 
7436 void
7437 post_startup_cpu_fixups(void)
7438 {
7439 #ifndef __xpv
7440 	/*
7441 	 * Some AMD processors support C1E state. Entering this state will
7442 	 * cause the local APIC timer to stop, which we can't deal with at
7443 	 * this time.
7444 	 */
7445 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7446 		on_trap_data_t otd;
7447 		uint64_t reg;
7448 
7449 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7450 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7451 			/* Disable C1E state if it is enabled by BIOS */
7452 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7453 			    AMD_ACTONCMPHALT_MASK) {
7454 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7455 				    AMD_ACTONCMPHALT_SHIFT);
7456 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7457 			}
7458 		}
7459 		no_trap();
7460 	}
7461 #endif	/* !__xpv */
7462 }
7463 
7464 void
7465 enable_pcid(void)
7466 {
7467 	if (x86_use_pcid == -1)
7468 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7469 
7470 	if (x86_use_invpcid == -1) {
7471 		x86_use_invpcid = is_x86_feature(x86_featureset,
7472 		    X86FSET_INVPCID);
7473 	}
7474 
7475 	if (!x86_use_pcid)
7476 		return;
7477 
7478 	/*
7479 	 * Intel says that on setting PCIDE, it immediately starts using the PCID
7480 	 * bits; better make sure there's nothing there.
7481 	 */
7482 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7483 
7484 	setcr4(getcr4() | CR4_PCIDE);
7485 }
7486 
7487 /*
7488  * Setup necessary registers to enable XSAVE feature on this processor.
7489  * This function needs to be called early enough, so that no xsave/xrstor
7490  * ops will execute on the processor before the MSRs are properly set up.
7491  *
7492  * Current implementation has the following assumption:
7493  * - cpuid_pass_basic() is done, so that X86 features are known.
7494  * - fpu_probe() is done, so that fp_save_mech is chosen.
7495  */
7496 void
7497 xsave_setup_msr(cpu_t *cpu)
7498 {
7499 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7500 	ASSERT(fp_save_mech == FP_XSAVE);
7501 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7502 
7503 	/* Enable OSXSAVE in CR4. */
7504 	setcr4(getcr4() | CR4_OSXSAVE);
7505 	/*
7506 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7507 	 * correct value.
7508 	 */
7509 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7510 	setup_xfem();
7511 }
7512 
7513 /*
7514  * Starting with the Westmere processor the local
7515  * APIC timer will continue running in all C-states,
7516  * including the deepest C-states.
7517  */
7518 int
7519 cpuid_arat_supported(void)
7520 {
7521 	struct cpuid_info *cpi;
7522 	struct cpuid_regs regs;
7523 
7524 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7525 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7526 
7527 	cpi = CPU->cpu_m.mcpu_cpi;
7528 
7529 	switch (cpi->cpi_vendor) {
7530 	case X86_VENDOR_Intel:
7531 		/*
7532 		 * Always-running Local APIC Timer is
7533 		 * indicated by CPUID.6.EAX[2].
7534 		 */
7535 		if (cpi->cpi_maxeax >= 6) {
7536 			regs.cp_eax = 6;
7537 			(void) cpuid_insn(NULL, &regs);
7538 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7539 		} else {
7540 			return (0);
7541 		}
7542 	default:
7543 		return (0);
7544 	}
7545 }
7546 
7547 /*
7548  * Check support for Intel ENERGY_PERF_BIAS feature
7549  */
7550 int
7551 cpuid_iepb_supported(struct cpu *cp)
7552 {
7553 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7554 	struct cpuid_regs regs;
7555 
7556 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7557 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7558 
7559 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7560 		return (0);
7561 	}
7562 
7563 	/*
7564 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7565 	 * capability bit CPUID.6.ECX.3
7566 	 */
7567 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7568 		return (0);
7569 
7570 	regs.cp_eax = 0x6;
7571 	(void) cpuid_insn(NULL, &regs);
7572 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7573 }
7574 
7575 /*
7576  * Check support for TSC deadline timer
7577  *
7578  * TSC deadline timer provides a superior software programming
7579  * model over local APIC timer that eliminates "time drifts".
7580  * Instead of specifying a relative time, software specifies an
7581  * absolute time as the target at which the processor should
7582  * generate a timer event.
7583  */
7584 int
7585 cpuid_deadline_tsc_supported(void)
7586 {
7587 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7588 	struct cpuid_regs regs;
7589 
7590 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7591 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7592 
7593 	switch (cpi->cpi_vendor) {
7594 	case X86_VENDOR_Intel:
7595 		if (cpi->cpi_maxeax >= 1) {
7596 			regs.cp_eax = 1;
7597 			(void) cpuid_insn(NULL, &regs);
7598 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7599 		} else {
7600 			return (0);
7601 		}
7602 	default:
7603 		return (0);
7604 	}
7605 }
7606 
7607 #if !defined(__xpv)
7608 /*
7609  * Patch in versions of bcopy for high performance Intel Nhm processors
7610  * and later...
7611  */
7612 void
7613 patch_memops(uint_t vendor)
7614 {
7615 	size_t cnt, i;
7616 	caddr_t to, from;
7617 
7618 	if ((vendor == X86_VENDOR_Intel) &&
7619 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7620 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7621 		to = &bcopy_ck_size;
7622 		from = &bcopy_patch_start;
7623 		for (i = 0; i < cnt; i++) {
7624 			*to++ = *from++;
7625 		}
7626 	}
7627 }
7628 #endif  /*  !__xpv */
7629 
7630 /*
7631  * We're being asked to tell the system how many bits are required to represent
7632  * the various thread and strand IDs. While it's tempting to derive this based
7633  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7634  * correct. Instead, this needs to be based on the number of bits that the APIC
7635  * allows for these different configurations. We only update these to a larger
7636  * value if we find one.
7637  */
7638 void
7639 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7640 {
7641 	struct cpuid_info *cpi;
7642 
7643 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7644 	cpi = cpu->cpu_m.mcpu_cpi;
7645 
7646 	if (cpi->cpi_ncore_bits > *core_nbits) {
7647 		*core_nbits = cpi->cpi_ncore_bits;
7648 	}
7649 
7650 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7651 		*strand_nbits = cpi->cpi_nthread_bits;
7652 	}
7653 }
7654 
7655 void
7656 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7657 {
7658 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7659 	struct cpuid_regs cp;
7660 
7661 	/*
7662 	 * Reread the CPUID portions that we need for various security
7663 	 * information.
7664 	 */
7665 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7666 		/*
7667 		 * Check if we now have leaf 7 available to us.
7668 		 */
7669 		if (cpi->cpi_maxeax < 7) {
7670 			bzero(&cp, sizeof (cp));
7671 			cp.cp_eax = 0;
7672 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7673 			if (cpi->cpi_maxeax < 7)
7674 				return;
7675 		}
7676 
7677 		bzero(&cp, sizeof (cp));
7678 		cp.cp_eax = 7;
7679 		cp.cp_ecx = 0;
7680 		(void) __cpuid_insn(&cp);
7681 		cpi->cpi_std[7] = cp;
7682 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7683 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7684 		/* No xcpuid support */
7685 		if (cpi->cpi_family < 5 ||
7686 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7687 			return;
7688 
7689 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7690 			bzero(&cp, sizeof (cp));
7691 			cp.cp_eax = CPUID_LEAF_EXT_0;
7692 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7693 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7694 				return;
7695 			}
7696 		}
7697 
7698 		bzero(&cp, sizeof (cp));
7699 		cp.cp_eax = CPUID_LEAF_EXT_8;
7700 		(void) __cpuid_insn(&cp);
7701 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7702 		cpi->cpi_extd[8] = cp;
7703 	} else {
7704 		/*
7705 		 * Nothing to do here. Return an empty set which has already
7706 		 * been zeroed for us.
7707 		 */
7708 		return;
7709 	}
7710 	cpuid_scan_security(cpu, fset);
7711 }
7712 
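/*
 * Cross-call handler used by cpuid_post_ucodeadm(): rescan this CPU's
 * security-relevant cpuid state into its slot of the shared feature-set
 * array.  The first pass runs only on CPU 0; the second pass runs only on
 * the other CPUs.
 */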
7713 /* ARGSUSED */
7714 static int
7715 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7716 {
7717 	uchar_t *fset;
7718 	boolean_t first_pass = (boolean_t)arg1;
7719 
7720 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7721 	if (first_pass && CPU->cpu_id != 0)
7722 		return (0);
7723 	if (!first_pass && CPU->cpu_id == 0)
7724 		return (0);
7725 	cpuid_pass_ucode(CPU, fset);
7726 
7727 	return (0);
7728 }
7729 
7730 /*
7731  * After a microcode update where the version has changed, we need to rescan
7732  * CPUID. To do this we check every CPU to make sure that they have the
7733  * same microcode. Then we perform a cross call to all such CPUs. It's the
7734  * caller's job to make sure that no one else can end up doing an update while
7735  * this is going on.
7736  *
7737  * We assume that the system is microcode capable if we're called.
7738  */
7739 void
7740 cpuid_post_ucodeadm(void)
7741 {
7742 	uint32_t rev;
7743 	int i;
7744 	struct cpu *cpu;
7745 	cpuset_t cpuset;
7746 	void *argdata;
7747 	uchar_t *f0;
7748 
7749 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7750 
7751 	mutex_enter(&cpu_lock);
7752 	cpu = cpu_get(0);
7753 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7754 	CPUSET_ONLY(cpuset, 0);
7755 	for (i = 1; i < max_ncpus; i++) {
7756 		if ((cpu = cpu_get(i)) == NULL)
7757 			continue;
7758 
7759 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7760 			panic("post microcode update CPU %d has differing "
7761 			    "microcode revision (%u) from CPU 0 (%u)",
7762 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7763 		}
7764 		CPUSET_ADD(cpuset, i);
7765 	}
7766 
7767 	/*
7768 	 * We do the cross calls in two passes. The first pass is only for the
7769 	 * boot CPU. The second pass is for all of the other CPUs. This allows
7770 	 * the boot CPU to go through and change behavior related to patching or
7771 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
7772 	 * other CPUs to follow suit.
7773 	 */
7774 	kpreempt_disable();
7775 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7776 	    cpuid_post_ucodeadm_xc);
7777 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7778 	    cpuid_post_ucodeadm_xc);
7779 	kpreempt_enable();
7780 
7781 	/*
7782 	 * OK, now look at each CPU and see if their feature sets are equal.
7783 	 */
7784 	f0 = argdata;
7785 	for (i = 1; i < max_ncpus; i++) {
7786 		uchar_t *fset;
7787 		if (!CPU_IN_SET(cpuset, i))
7788 			continue;
7789 
7790 		fset = (uchar_t *)((uintptr_t)argdata +
7791 		    sizeof (x86_featureset) * i);
7792 
7793 		if (!compare_x86_featureset(f0, fset)) {
7794 			panic("Post microcode update CPU %d has "
7795 			    "differing security feature (%p) set from CPU 0 "
7796 			    "(%p), not appending to feature set", i,
7797 			    (void *)fset, (void *)f0);
7798 		}
7799 	}
7800 
7801 	mutex_exit(&cpu_lock);
7802 
7803 	for (i = 0; i < NUM_X86_FEATURES; i++) {
7804 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7805 		    x86_feature_names[i]);
7806 		if (is_x86_feature(f0, i)) {
7807 			add_x86_feature(x86_featureset, i);
7808 		}
7809 	}
7810 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7811 }
7812 
7813 typedef void (*cpuid_pass_f)(cpu_t *, void *);
7814 
7815 typedef struct cpuid_pass_def {
7816 	cpuid_pass_t cpd_pass;
7817 	cpuid_pass_f cpd_func;
7818 } cpuid_pass_def_t;
7819 
7820 /*
7821  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
7822  * normal sense and should not appear here.
7823  */
7824 static const cpuid_pass_def_t cpuid_pass_defs[] = {
7825 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
7826 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
7827 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
7828 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
7829 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
7830 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
7831 };
7832 
7833 void
7834 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
7835 {
7836 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
7837 
7838 	if (cp == NULL)
7839 		cp = CPU;
7840 
7841 	/*
7842 	 * Space statically allocated for BSP, ensure pointer is set
7843 	 */
7844 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
7845 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
7846 
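	/*
	 * Passes must be executed in order; the preceding pass must already
	 * have been run on this CPU.
	 */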
7847 	ASSERT(cpuid_checkpass(cp, pass - 1));
7848 
7849 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
7850 		if (cpuid_pass_defs[i].cpd_pass == pass) {
7851 			cpuid_pass_defs[i].cpd_func(cp, arg);
7852 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
7853 			return;
7854 		}
7855 	}
7856 
7857 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
7858 	    pass, cp->cpu_id);
7859 }
7860