xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 627ade2ad3d27999387b31183c830a2d42c63ef5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2023 Oxide Computer Company
28  * Copyright 2024 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports simultaneous
54  *    multi-threading (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to provide, in a
62  * programmatic fashion, information about the CPU that previously had to be
63  * guessed at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology is found in this
107  *					region.
108  *
109  * To query a leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
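 *
 * As a hedged illustration (using the struct cpuid_regs and __cpuid_insn()
 * helpers that the rest of this file relies on; the local variable names are
 * purely illustrative), discovering the maximum basic and extended leaves
 * looks roughly like:
 *
 *	struct cpuid_regs cp = { 0 };
 *	uint32_t max_basic, max_extd;
 *
 *	cp.cp_eax = 0;
 *	(void) __cpuid_insn(&cp);
 *	max_basic = cp.cp_eax;
 *
 *	bzero(&cp, sizeof (cp));
 *	cp.cp_eax = 0x80000000;
 *	(void) __cpuid_insn(&cp);
 *	max_extd = cp.cp_eax;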
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology you are getting
127  * information about.
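 *
 * As a hedged sketch of walking sub-leaves, here iterating the levels of
 * topology leaf 0xb (a level type of zero in bits 15:8 of %ecx indicates
 * there are no further levels; BITX() is the bit-extraction macro used
 * throughout this file):
 *
 *	struct cpuid_regs cp;
 *	uint_t subleaf;
 *
 *	for (subleaf = 0; ; subleaf++) {
 *		bzero(&cp, sizeof (cp));
 *		cp.cp_eax = 0xb;
 *		cp.cp_ecx = subleaf;
 *		(void) __cpuid_insn(&cp);
 *		if (BITX(cp.cp_ecx, 15, 8) == 0)
 *			break;
 *	}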
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %edx, and %ecx (in that order). On
141  * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
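 *
 * A hedged sketch of assembling the vendor string (this mirrors how the file
 * later fills in its vendor string, cpi_vendorstr, though the local names
 * here are illustrative):
 *
 *	struct cpuid_regs cp = { 0 };
 *	char vendor[13];
 *
 *	cp.cp_eax = 0;
 *	(void) __cpuid_insn(&cp);
 *	bcopy(&cp.cp_ebx, &vendor[0], 4);
 *	bcopy(&cp.cp_edx, &vendor[4], 4);
 *	bcopy(&cp.cp_ecx, &vendor[8], 4);
 *	vendor[12] = '\0';
 *
 * The result then compares equal to "GenuineIntel" or "AuthenticAMD" with
 * strcmp().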
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161  * the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the base family is 0xf,
179  * then one must consult the extended family and extended model, which occupy
180  * previously reserved bits: the extended family is added to the base family of
181  * 0xf, and the extended model supplies the upper four bits of the model.
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
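 *
 * A hedged sketch of that computation, assuming cp already holds the result
 * of querying leaf 1 (its %eax holds the stepping in bits 3:0, the model in
 * bits 7:4, the family in bits 11:8, the extended model in bits 19:16, and
 * the extended family in bits 27:20; Intel additionally uses the extended
 * model when the base family is 0x6):
 *
 *	uint_t family = BITX(cp.cp_eax, 11, 8);
 *	uint_t model = BITX(cp.cp_eax, 7, 4);
 *	uint_t step = BITX(cp.cp_eax, 3, 0);
 *
 *	if (family == 0xf) {
 *		family += BITX(cp.cp_eax, 27, 20);
 *		model += BITX(cp.cp_eax, 19, 16) << 4;
 *	}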
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up is when we have to workaround errata in
198  * the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
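 *
 * As a hedged sketch of such a comparison (the revision constant and
 * apply_workaround() below are hypothetical placeholders; see the
 * chiprev_xxx() functions later in this file and the declarations in
 * x86_archext.h for the actual interfaces), gating an erratum workaround
 * might look like:
 *
 *	x86_chiprev_t rev = cpuid_getchiprev(CPU);
 *
 *	if (chiprev_at_least(rev, X86_CHIPREV_AMD_EXAMPLE_B1))
 *		apply_workaround();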
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-dependent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determine the
292  *			appropriate set of workarounds.
293  *
294  *			This pass on the boot CPU occurs before KMDB is started.
295  *
296  *	EXTENDED	The second pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		The third pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The fourth and final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
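 *
 * A hedged usage sketch for some cpu_t *cp (the constants below assume the
 * CPUID_PASS_* values defined alongside cpuid_pass_t in x86_archext.h;
 * consult the actual prototypes before relying on this):
 *
 *	cpuid_execpass(cp, CPUID_PASS_IDENT, NULL);
 *
 * and later, before consuming identity data such as the chiprev:
 *
 *	ASSERT(cpuid_checkpass(cp, CPUID_PASS_IDENT));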
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because that puts us in a situation that
334  * we really can't deal with. As such, the only things we rescan today are
335  * security related features. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * it is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with different, let alone
346  * disjoint, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, is used throughout this file, and is fairly generic across multiple
369  * vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphical processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to a unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Oftentimes, the
396  *	term socket is used interchangeably with package and refers to the
397  *	electrical component that has been plugged in, not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resource and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors, consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last level cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA, or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, when using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem (as of this writing the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore', which used to be a separate physical chip that was not a part
543  * of the package, but is now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective, is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
576  * additional cpuid topology leaf 0xB was added.
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622  *	is called Zeppelin. These modules are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both cores are
686  *  connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. In this
718  *  example, each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin die carried its own I/O logic, in Zen 2 that has been
728  *	moved out into a separate, central I/O die. The actual core complex looks
729  *	pretty similar, but now the die actually looks much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and Threadripper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one to two. The more notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is possible to still treat everything in four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the system. Because families before Zen didn't have
853  * SMT, this was always the number of cores that were in the system. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884  * cache layout to determine which logical CPUs are sharing which caches.
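 *
 * A hedged sketch of consuming leaf 0x8000001E on a Zen-era part (the bit
 * positions follow AMD's documentation; the local names are illustrative):
 *
 *	struct cpuid_regs cp = { 0 };
 *
 *	cp.cp_eax = 0x8000001e;
 *	(void) __cpuid_insn(&cp);
 *
 *	uint32_t apicid = cp.cp_eax;
 *	uint_t coreid = BITX(cp.cp_ebx, 7, 0);
 *	uint_t threads_per_core = BITX(cp.cp_ebx, 15, 8) + 1;
 *	uint_t nodeid = BITX(cp.cp_ecx, 7, 0);
 *	uint_t nodes_per_pkg = BITX(cp.cp_ecx, 10, 8) + 1;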
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that serve as identifiers, with the exception of
900  * cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's apic ID. The
937  *	broader system expects that logical CPUs that are part of the same
938  *	core have contiguous numbers. For example, if there were two threads per
939  *	core, then the two siblings' IDs divided by two should be equal, with the
940  *	first ID even and the second odd. For example, IDs 4 and 5
941  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
942  *	6 represent two logical CPUs that are part of different cores.
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
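 *
 *	A hedged illustration of the contiguity expectation above for the
 *	two-threads-per-core case (purely illustrative arithmetic, not an
 *	interface provided by this file):
 *
 *		boolean_t
 *		clogid_siblings(uint_t a, uint_t b)
 *		{
 *			return ((a >> 1) == (b >> 1));
 *		}
 *
 *	With IDs 4 and 5 this returns B_TRUE; with 5 and 6 it returns
 *	B_FALSE, matching the examples above.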
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	then we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When processors are actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double-edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see various
1049  * checks scattered about fields being non-zero before we assume we can use
1050  * them.
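 *
 * As one example of that pattern, cpuid_gather_apicid() later in this file
 * only trusts leaf B when %ebx comes back non-zero:
 *
 *	cp->cp_eax = 0xB;
 *	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
 *	(void) __cpuid_insn(cp);
 *	if (cp->cp_ebx != 0)
 *		return (cp->cp_edx);
 *
 * If %ebx is zero, the leaf is treated as unimplemented and we fall back to
 * other sources of the APIC ID.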
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when they don't
1057  * use an actual vendor, then that usually turns into multiple one-core CPUs
1058  * that we enumerate that are often on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General-purpose or
1074  * architecture-independent drivers should never call this function.
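 *
 * A short usage sketch, using a feature bit that this file itself consults
 * (any other X86FSET_* constant is queried the same way):
 *
 *	boolean_t topoext = is_x86_feature(x86_featureset, X86FSET_TOPOEXT);
 *
 * Callers branch on results like this rather than re-executing the cpuid
 * instruction themselves.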
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
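 *
 * From a user program's perspective this surfaces through interfaces such as
 * getisax(3C). A hedged sketch of that consumer side (AV_386_AVX is one of
 * the AV_386_* bits from sys/auxv_386.h; treat the details as illustrative):
 *
 *	#include <sys/auxv.h>
 *
 *	uint32_t av[2] = { 0 };
 *	int has_avx;
 *
 *	(void) getisax(av, 2);
 *	has_avx = (av[0] & AV_386_AVX) != 0;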
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
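 *
 * As a hedged illustration of this third form (the names below reflect how
 * these accessors are conventionally used; consult their definitions for the
 * authoritative signatures):
 *
 *	uint_t vendor = cpuid_getvendor(CPU);
 *	uint_t family = cpuid_getfamily(CPU);
 *
 * Both would ASSERT if the required cpuid pass had not yet completed on the
 * current CPU.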
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following list covers some of the common, though not
1104  * exhaustive, issues that we know about and for which we have done some work,
1105  * or still need to do more, in the system to mitigate against:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *   - Meltdown (Spectre v3)
1111  *   - Rogue Register Read (Spectre v3a)
1112  *   - Speculative Store Bypass (Spectre v4)
1113  *   - ret2spec, SpectreRSB
1114  *   - L1 Terminal Fault (L1TF)
1115  *   - Microarchitectural Data Sampling (MDS)
1116  *   - Register File Data Sampling (RFDS)
1117  *
1118  * Each of these requires different sets of mitigations and has different attack
1119  * surfaces. For the most part, this discussion is about protecting the kernel
1120  * from non-kernel executing environments such as user processes and hardware
1121  * virtual machines. Unfortunately, there are a number of user vs. user
1122  * scenarios that exist with these. The rest of this section will describe the
1123  * overall approach that the system has taken to address these as well as their
1124  * shortcomings. Unfortunately, not all of the above have been handled today.
1125  *
1126  * SPECTRE v2, ret2spec, SpectreRSB
1127  *
1128  * The second variant of the spectre attack focuses on performing branch target
1129  * injection. This generally impacts indirect call instructions in the system.
1130  * There are four different ways to mitigate this issue that are commonly
1131  * described today:
1132  *
1133  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1134  *  2. Using Retpolines and RSB Stuffing
1135  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1136  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1137  *
1138  * IBRS uses a feature added to microcode to restrict speculation, among other
1139  * things. This form of mitigation has not been used as it has been generally
1140  * seen as too expensive and requires reactivation upon various transitions in
1141  * the system.
1142  *
1143  * As a less impactful alternative to IBRS, retpolines were developed by
1144  * Google. These basically require one to replace indirect calls with a specific
1145  * trampoline that will cause speculation to fail and break the attack.
1146  * Retpolines require compiler support. We always build with retpolines in the
1147  * external thunk mode. This means that a traditional indirect call is replaced
1148  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1149  * of this is that all indirect function calls are performed through a register.
1150  *
1151  * We have to use a common external location of the thunk and not inline it into
1152  * the callsite so that way we can have a single place to patch these functions.
1153  * As it turns out, we currently have two different forms of retpolines that
1154  * exist in the system:
1155  *
1156  *  1. A full retpoline
1157  *  2. A no-op version
1158  *
1159  * The first one is used in the general case. Historically, there was an
1160  * AMD-specific optimized retpoline variant that was based around using a
1161  * serializing lfence instruction; however, in March 2022 it was announced that
1162  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1163  * use it and it is no longer available in the system.
1164  *
1165  * The third form listed above, eIBRS, is the most curious. It turns out that
1166  * the way retpolines are implemented is that they rely on how speculation is
1167  * performed on a 'ret' instruction. Intel has continued to optimize this
1168  * process (which is partly why we need to have return stack buffer stuffing,
1169  * but more on that in a bit) and in processors starting with Cascade Lake
1170  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1171  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1172  *
1173  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1174  * physical core. However, if this is the case, we don't want to use retpolines
1175  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1176  * function (called a thunk) into a jmp instruction. This means that we're still
1177  * paying the cost of an extra jump to the external thunk, but it gives us
1178  * flexibility and the ability to have a single kernel image that works across a
1179  * wide variety of systems and hardware features.
1180  *
1181  * Unfortunately, this alone is insufficient. First, Skylake systems have
1182  * additional speculation for the Return Stack Buffer (RSB) which is used to
1183  * return from call instructions which retpolines take advantage of. However,
1184  * this problem is not just limited to Skylake and is actually more pernicious.
1185  * The SpectreRSB paper introduces several more problems that can arise with
1186  * dealing with this. The RSB can be poisoned just like the indirect branch
1187  * predictor. This means that one needs to clear the RSB when transitioning
1188  * between two different privilege domains. Some examples include:
1189  *
1190  *  - Switching between two different user processes
1191  *  - Going between user land and the kernel
1192  *  - Returning to the kernel from a hardware virtual machine
1193  *
1194  * Mitigating this involves combining a couple of different things. The first is
1195  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1196  * Bridge. When an RSB entry refers to a user address and we're executing in the
1197  * kernel, speculation through it will be stopped when SMEP is enabled. This
1198  * protects against a number of the different cases that we would normally be
1199  * worried about such as when we enter the kernel from user land.
1200  *
1201  * To prevent against additional manipulation of the RSB from other contexts
1202  * such as a non-root VMX context attacking the kernel we first look to
1203  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1204  * nothing else that we need to do to protect the kernel at this time.
1205  *
1206  * Unfortunately, not all eIBRS implementations are sufficient to guard
1207  * against RSB manipulations, so we still need to manually overwrite the
1208  * contents of the return stack buffer unless the hardware specifies we are
1209  * covered. We do this through the x86_rsb_stuff() function.  Currently this
1210  * is employed on context switch and vmx_exit. The x86_rsb_stuff() function is
1211  * disabled only when mitigations in general are, or if we have hardware
1212  * indicating no need for post-barrier RSB protections, either in one place
1213  * (old hardware), or on both (newer hardware).
1214  *
1215  * If SMEP is not present, then we would have to stuff the RSB every time we
1216  * transitioned from user mode to the kernel, which isn't very practical right
1217  * now.
1218  *
1219  * To fully protect user to user and vmx to vmx attacks from these classes of
1220  * issues, we would also need to allow them to opt into performing an Indirect
1221  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1222  *
1223  * The fourth form of mitigation here is specific to AMD and is called Automated
1224  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1225  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1226  * (extended feature enable register) MSR. This bit basically says that IBRS
1227  * acts as though it is always active when executing at CPL0 and when executing
1228  * in the 'host' context when SEV-SNP is enabled.
1229  *
1230  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1231  * therefore stuffing it is unnecessary. While this handles RSB attacks from SVM
1232  * to the kernel, we must still consider the remaining cases that exist, just
1233  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1234  * traditional technique to work, this is not true on all CPUs. While a write to
1235  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1236  * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1237  * guard page is present between user and kernel address spaces and SMEP is
1238  * enabled, then there is no need to clear the RSB at all.
1239  *
1240  * By default, the system will enable RSB stuffing and the required variant of
1241  * retpolines and store that information in the x86_spectrev2_mitigation value.
1242  * This will be evaluated after a microcode update as well, though it is
1243  * expected that microcode updates will not take away features. This may mean
1244  * that late-loaded microcode may not end up in the optimal configuration
1245  * (though this should be rare).
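 *
 * A hedged sketch of how that selection can be expressed (the enum constants
 * are defined later in this file; the X86FSET_* spellings mirror the
 * "auto_ibrs" and "ibrs_all" entries in x86_feature_names and are shown for
 * illustration rather than as the literal decision code):
 *
 *	x86_spectrev2_mitigation_t v2mit;
 *
 *	if (is_x86_feature(featureset, X86FSET_AUTO_IBRS))
 *		v2mit = X86_SPECTREV2_AUTO_IBRS;
 *	else if (is_x86_feature(featureset, X86FSET_IBRS_ALL))
 *		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
 *	else
 *		v2mit = X86_SPECTREV2_RETPOLINE;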
1246  *
1247  * Currently we do not build kmdb with retpolines or perform any additional side
1248  * channel security mitigations for it. One complication with kmdb is that it
1249  * requires its own retpoline thunks and it would need to adjust itself based on
1250  * what the kernel does. The threat model of kmdb is more limited and therefore
1251  * it may make more sense to investigate using prediction barriers as the whole
1252  * system is only executing a single instruction at a time while in kmdb.
1253  *
1254  * SPECTRE v1, v4
1255  *
1256  * The v1 and v4 variants of spectre are not currently mitigated in the
1257  * system and require other classes of changes to occur in the code.
1258  *
1259  * SPECTRE v1 (SWAPGS VARIANT)
1260  *
1261  * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1262  * can generally affect any branch-dependent code. The swapgs issue is one
1263  * variant of this. If we are coming in from userspace, we can have code like
1264  * this:
1265  *
1266  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1267  *	je	1f
1268  *	movq	$0, REGOFF_SAVFP(%rsp)
1269  *	swapgs
1270  *	1:
1271  *	movq	%gs:CPU_THREAD, %rax
1272  *
1273  * If an attacker can cause a mis-speculation of the branch here, we could skip
1274  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1275  * load. If subsequent code can act as the usual Spectre cache gadget, this
1276  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1277  * any use of the %gs override.
1278  *
1279  * The other case is also an issue: if we're coming into a trap from kernel
1280  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1281  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1282  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1283  * case, and the fix is the same in both cases (an lfence at the branch target
1284  * 1: in this example), we'll just do it unconditionally.
1285  *
1286  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1287  * harder for user-space to actually set a useful %gsbase value: although it's
1288  * not clear, it might still be feasible via lwp_setprivate(), so we
1289  * mitigate anyway.
1290  *
1291  * MELTDOWN
1292  *
1293  * Meltdown, or spectre v3, allowed a user process to read any data in their
1294  * address space regardless of whether or not the page tables in question
1295  * allowed the user to read them. The solution to meltdown
1296  * is kernel page table isolation. In this world, there are two page tables that
1297  * are used for a process, one in user land and one in the kernel. To implement
1298  * this we use per-CPU page tables and switch between the user and kernel
1299  * variants when entering and exiting the kernel.  For more information about
1300  * this process and how the trampolines work, please see the big theory
1301  * statements and additional comments in:
1302  *
1303  *  - uts/i86pc/ml/kpti_trampolines.s
1304  *  - uts/i86pc/vm/hat_i86.c
1305  *
1306  * While Meltdown only impacted Intel systems and there are also Intel systems
1307  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1308  * kernel page table isolation enabled. While this may at first seem weird, an
1309  * important thing to remember is that you can't speculatively read an address
1310  * if it's never in your page table at all. Having user processes without kernel
1311  * pages present provides us with an important layer of defense in the kernel
1312  * against any other side channel attacks that exist and have yet to be
1313  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1314  * default, no matter the x86 system.
1315  *
1316  * L1 TERMINAL FAULT
1317  *
1318  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1319  * execution uses page table entries. Effectively, it is two different problems.
1320  * The first is that it ignores the not present bit in the page table entries
1321  * when performing speculative execution. This means that something can
1322  * speculatively read the listed physical address if it's present in the L1
1323  * cache under certain conditions (see Intel's documentation for the full set of
1324  * conditions). Secondly, this can be used to bypass hardware virtualization
1325  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1326  * instructions.
1327  *
1328  * For the non-hardware virtualized case, this is relatively easy to deal with.
1329  * We must make sure that all unmapped pages have an address of zero. This means
1330  * that they could read the first 4k of physical memory; however, we never use
1331  * that first page in the operating system and always skip putting it in our
1332  * memory map, even if firmware tells us we can use it in our memory map. Other
1333  * systems try to put extra metadata in the address and reserved bits, which
1334  * made this approach problematic for them; we do not.
1335  *
1336  * For hardware virtual machines things are more complicated. Because they can
1337  * construct their own page tables, it isn't hard for them to perform this
1338  * attack against any physical address. The one wrinkle is that this physical
1339  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1340  * to flush the L1 data cache. We wrap this up in the function
1341  * spec_uarch_flush(). This function is also used in the mitigation of
1342  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1343  * hypervisors such as KVM or bhyve are responsible for performing this before
1344  * entering the guest.
1345  *
1346  * Because this attack takes place in the L1 cache, there's another wrinkle
1347  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1348  * designs. This means that when a thread enters a hardware virtualized context
1349  * and flushes the L1 data cache, the other thread on the processor may then go
1350  * ahead and put new data in it that can be potentially attacked. While one
1351  * solution is to disable SMT on the system, another option that is available is
1352  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1353  * goes through and makes sure that if a HVM is being scheduled on one thread,
1354  * then the thing on the other thread is from the same hardware virtual machine.
1355  * If an interrupt comes in or the guest exits to the broader system, then the
1356  * other SMT thread will be kicked out.
1357  *
1358  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1359  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1360  * perform L1TF related mitigations.
1361  *
1362  * MICROARCHITECTURAL DATA SAMPLING
1363  *
1364  * Microarchitectural data sampling (MDS) is a combination of four discrete
1365  * vulnerabilities that are similar issues affecting various parts of the CPU's
1366  * microarchitectural implementation around load, store, and fill buffers.
1367  * Specifically it is made up of the following subcomponents:
1368  *
1369  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1370  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1371  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1372  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1373  *
1374  * To begin addressing these, Intel has introduced another feature in microcode
1375  * called MD_CLEAR. This changes the verw instruction to operate in a different
1376  * way. This allows us to execute the verw instruction in a particular way to
1377  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1378  * updated when this microcode is present to flush this state.
1379  *
1380  * Primarily we need to flush this state whenever we transition from the kernel
1381  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1382  * little bit different. Here the structures are statically sized when a logical
1383  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1384  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1385  * mwait, or another ACPI method. To perform these flushes, we call
1386  * x86_md_clear() at all of these transition points.
1387  *
1388  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1389  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1390  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1391  * a no-op.
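 *
 * A hedged sketch of that selection (x86_md_clear and spec_uarch_flush are
 * the routines named above; flush_l1d_and_md_clear is a hypothetical
 * stand-in for the combined flush, and the X86FSET_* spellings mirror the
 * "rdcl_no" and "mds_no" feature names below):
 *
 *	if (!is_x86_feature(featureset, X86FSET_RDCL_NO))
 *		spec_uarch_flush = flush_l1d_and_md_clear;
 *	else
 *		spec_uarch_flush = x86_md_clear;
 *
 * with x86_md_clear() itself becoming a no-op when MDS_NO indicates there is
 * nothing left to flush.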
1392  *
1393  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1394  * particular, everything we've discussed above is only valid for a single
1395  * thread executing on a core. In the case where you have hyper-threading
1396  * present, this attack can be performed between threads. The theoretical fix
1397  * for this is to ensure that both threads are always in the same security
1398  * domain. This means that they are executing in the same ring and mutually
1399  * trust each other. Practically speaking, this would mean that a system call
1400  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1401  * Rather than implement this, we recommend that one disables hyper-threading
1402  * through the use of psradm -aS.
1403  *
1404  * TSX ASYNCHRONOUS ABORT
1405  *
1406  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1407  * behaves like MDS, but leverages Intel's transactional instructions as another
1408  * vector. Effectively, when a transaction hits one of these cases (unmapped
1409  * page, various cache snoop activity, etc.) then the same data can be exposed
1410  * as in the case of MDS. This means that you can attack your twin.
1411  *
1412  * Intel has described that there are two different ways that we can mitigate
1413  * this problem on affected processors:
1414  *
1415  *   1) We can use the same techniques used to deal with MDS. Flushing the
1416  *      microarchitectural buffers and disabling hyperthreading will mitigate
1417  *      this in the same way.
1418  *
1419  *   2) Using microcode to disable TSX.
1420  *
1421  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1422  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1423  * That's OK as we're already doing all such mitigations. On the other hand,
1424  * processors with MDS_NO are all supposed to receive microcode updates that
1425  * enumerate support for disabling TSX. In general, we'd rather use this method
1426  * when available as it doesn't require disabling hyperthreading to be
1427  * effective. Currently we basically are relying on microcode for processors
1428  * that enumerate MDS_NO.
1429  *
1430  * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1431  * Sampling: RFDS. This allows an attacker to sample values that were in any
1432  * of integer, floating point, or vector registers. This was discovered by
1433  * Intel during internal validation work.  The existence of the RFDS_NO
1434  * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1435  * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1436  * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1437  * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1438  * MSR that L1D uses.
1439  *
1440  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1441  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1442  * different powers. The first allows us to cause all transactions to
1443  * immediately abort. The second gives us a means of disabling TSX completely,
1444  * which includes removing it from cpuid. If we have support for this in
1445  * microcode during the first cpuid pass, then we'll disable TSX completely such
1446  * that user land never has a chance to observe the bit. However, if we are late
1447  * loading the microcode, then we must use the functionality to cause
1448  * transactions to automatically abort. This is necessary for user land's sake.
1449  * Once a program sees a cpuid bit, it must not be taken away.
1450  *
1451  * We track whether or not we should do this based on what cpuid pass we're in.
1452  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1453  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1454  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1455  * second time after we do the initial microcode update.  As a result we need to
1456  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1457  * suitable microcode on the current CPU (which happens prior to
1458  * cpuid_pass_ucode()).
1459  *
1460  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1461  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1462  * unfortunate feature in a number of ways, and taking the opportunity to
1463  * finally be able to turn it off is likely to be of benefit in the future.
1464  *
1465  * SUMMARY
1466  *
1467  * The following table attempts to summarize the mitigations for various issues
1468  * and what's done in various places:
1469  *
1470  *  - Spectre v1: Not currently mitigated
1471  *  - swapgs: lfences after swapgs paths
1472  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1473  *  - Meltdown: Kernel Page Table Isolation
1474  *  - Spectre v3a: Updated CPU microcode
1475  *  - Spectre v4: Not currently mitigated
1476  *  - SpectreRSB: SMEP and RSB Stuffing
1477  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1478  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1479  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1480  *  - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1481  *
1482  * The following table indicates the x86 feature set bits that indicate that a
1483  * given problem has been solved or a notable feature is present:
1484  *
1485  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1486  *  - MDS_NO: All forms of MDS
1487  *  - TAA_NO: TAA
1488  *  - RFDS_NO: RFDS
1489  */
1490 
1491 #include <sys/types.h>
1492 #include <sys/archsystm.h>
1493 #include <sys/x86_archext.h>
1494 #include <sys/kmem.h>
1495 #include <sys/systm.h>
1496 #include <sys/cmn_err.h>
1497 #include <sys/sunddi.h>
1498 #include <sys/sunndi.h>
1499 #include <sys/cpuvar.h>
1500 #include <sys/processor.h>
1501 #include <sys/sysmacros.h>
1502 #include <sys/pg.h>
1503 #include <sys/fp.h>
1504 #include <sys/controlregs.h>
1505 #include <sys/bitmap.h>
1506 #include <sys/auxv_386.h>
1507 #include <sys/memnode.h>
1508 #include <sys/pci_cfgspace.h>
1509 #include <sys/comm_page.h>
1510 #include <sys/mach_mmu.h>
1511 #include <sys/ucode.h>
1512 #include <sys/tsc.h>
1513 #include <sys/kobj.h>
1514 #include <sys/asm_misc.h>
1516 
1517 #ifdef __xpv
1518 #include <sys/hypervisor.h>
1519 #else
1520 #include <sys/ontrap.h>
1521 #endif
1522 
1523 uint_t x86_vendor = X86_VENDOR_IntelClone;
1524 uint_t x86_type = X86_TYPE_OTHER;
1525 uint_t x86_clflush_size = 0;
1526 
1527 #if defined(__xpv)
1528 int x86_use_pcid = 0;
1529 int x86_use_invpcid = 0;
1530 #else
1531 int x86_use_pcid = -1;
1532 int x86_use_invpcid = -1;
1533 #endif
1534 
1535 typedef enum {
1536 	X86_SPECTREV2_RETPOLINE,
1537 	X86_SPECTREV2_ENHANCED_IBRS,
1538 	X86_SPECTREV2_AUTO_IBRS,
1539 	X86_SPECTREV2_DISABLED
1540 } x86_spectrev2_mitigation_t;
1541 
1542 uint_t x86_disable_spectrev2 = 0;
1543 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1544     X86_SPECTREV2_RETPOLINE;
1545 
1546 /*
1547  * The mitigation status for TAA:
1548  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1549  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1550  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1551  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1552  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1553  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1554  */
1555 typedef enum {
1556 	X86_TAA_NOTHING,
1557 	X86_TAA_DISABLED,
1558 	X86_TAA_MD_CLEAR,
1559 	X86_TAA_TSX_FORCE_ABORT,
1560 	X86_TAA_TSX_DISABLE,
1561 	X86_TAA_HW_MITIGATED
1562 } x86_taa_mitigation_t;
1563 
1564 uint_t x86_disable_taa = 0;
1565 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1566 
1567 uint_t pentiumpro_bug4046376;
1568 
1569 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1570 
1571 static char *x86_feature_names[NUM_X86_FEATURES] = {
1572 	"lgpg",
1573 	"tsc",
1574 	"msr",
1575 	"mtrr",
1576 	"pge",
1577 	"de",
1578 	"cmov",
1579 	"mmx",
1580 	"mca",
1581 	"pae",
1582 	"cv8",
1583 	"pat",
1584 	"sep",
1585 	"sse",
1586 	"sse2",
1587 	"htt",
1588 	"asysc",
1589 	"nx",
1590 	"sse3",
1591 	"cx16",
1592 	"cmp",
1593 	"tscp",
1594 	"mwait",
1595 	"sse4a",
1596 	"cpuid",
1597 	"ssse3",
1598 	"sse4_1",
1599 	"sse4_2",
1600 	"1gpg",
1601 	"clfsh",
1602 	"64",
1603 	"aes",
1604 	"pclmulqdq",
1605 	"xsave",
1606 	"avx",
1607 	"vmx",
1608 	"svm",
1609 	"topoext",
1610 	"f16c",
1611 	"rdrand",
1612 	"x2apic",
1613 	"avx2",
1614 	"bmi1",
1615 	"bmi2",
1616 	"fma",
1617 	"smep",
1618 	"smap",
1619 	"adx",
1620 	"rdseed",
1621 	"mpx",
1622 	"avx512f",
1623 	"avx512dq",
1624 	"avx512pf",
1625 	"avx512er",
1626 	"avx512cd",
1627 	"avx512bw",
1628 	"avx512vl",
1629 	"avx512fma",
1630 	"avx512vbmi",
1631 	"avx512_vpopcntdq",
1632 	"avx512_4vnniw",
1633 	"avx512_4fmaps",
1634 	"xsaveopt",
1635 	"xsavec",
1636 	"xsaves",
1637 	"sha",
1638 	"umip",
1639 	"pku",
1640 	"ospke",
1641 	"pcid",
1642 	"invpcid",
1643 	"ibrs",
1644 	"ibpb",
1645 	"stibp",
1646 	"ssbd",
1647 	"ssbd_virt",
1648 	"rdcl_no",
1649 	"ibrs_all",
1650 	"rsba",
1651 	"ssb_no",
1652 	"stibp_all",
1653 	"flush_cmd",
1654 	"l1d_vmentry_no",
1655 	"fsgsbase",
1656 	"clflushopt",
1657 	"clwb",
1658 	"monitorx",
1659 	"clzero",
1660 	"xop",
1661 	"fma4",
1662 	"tbm",
1663 	"avx512_vnni",
1664 	"amd_pcec",
1665 	"md_clear",
1666 	"mds_no",
1667 	"core_thermal",
1668 	"pkg_thermal",
1669 	"tsx_ctrl",
1670 	"taa_no",
1671 	"ppin",
1672 	"vaes",
1673 	"vpclmulqdq",
1674 	"lfence_serializing",
1675 	"gfni",
1676 	"avx512_vp2intersect",
1677 	"avx512_bitalg",
1678 	"avx512_vbmi2",
1679 	"avx512_bf16",
1680 	"auto_ibrs",
1681 	"rfds_no",
1682 	"rfds_clear",
1683 	"pbrsb_no"
1684 };
1685 
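/*
 * Simple helpers for manipulating featureset bitmaps such as x86_featureset
 * above: test, set, and clear individual feature bits, compare two bitmaps,
 * and print the features that are present.
 */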
1686 boolean_t
1687 is_x86_feature(void *featureset, uint_t feature)
1688 {
1689 	ASSERT(feature < NUM_X86_FEATURES);
1690 	return (BT_TEST((ulong_t *)featureset, feature));
1691 }
1692 
1693 void
1694 add_x86_feature(void *featureset, uint_t feature)
1695 {
1696 	ASSERT(feature < NUM_X86_FEATURES);
1697 	BT_SET((ulong_t *)featureset, feature);
1698 }
1699 
1700 void
1701 remove_x86_feature(void *featureset, uint_t feature)
1702 {
1703 	ASSERT(feature < NUM_X86_FEATURES);
1704 	BT_CLEAR((ulong_t *)featureset, feature);
1705 }
1706 
1707 boolean_t
1708 compare_x86_featureset(void *setA, void *setB)
1709 {
1710 	/*
1711 	 * We assume that the unused bits of the bitmap are always zero.
1712 	 */
1713 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1714 		return (B_TRUE);
1715 	} else {
1716 		return (B_FALSE);
1717 	}
1718 }
1719 
1720 void
1721 print_x86_featureset(void *featureset)
1722 {
1723 	uint_t i;
1724 
1725 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1726 		if (is_x86_feature(featureset, i)) {
1727 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1728 			    x86_feature_names[i]);
1729 		}
1730 	}
1731 }
1732 
1733 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1734 static size_t xsave_state_size = 0;
1735 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1736 boolean_t xsave_force_disable = B_FALSE;
1737 extern int disable_smap;
1738 
1739 /*
1740  * This is set to the platform type we are running on.
1741  */
1742 static int platform_type = -1;
1743 
1744 #if !defined(__xpv)
1745 /*
1746  * Variable to patch if hypervisor platform detection needs to be
1747  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1748  */
1749 int enable_platform_detection = 1;
1750 #endif
1751 
1752 /*
1753  * monitor/mwait info.
1754  *
1755  * size_actual and buf_actual are the real address and size allocated to get
1756  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1757  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1758  * processor cache-line alignment, but this is not guaranteed in the future.
1759  */
1760 struct mwait_info {
1761 	size_t		mon_min;	/* min size to avoid missed wakeups */
1762 	size_t		mon_max;	/* size to avoid false wakeups */
1763 	size_t		size_actual;	/* size actually allocated */
1764 	void		*buf_actual;	/* memory actually allocated */
1765 	uint32_t	support;	/* processor support of monitor/mwait */
1766 };
1767 
1768 /*
1769  * xsave/xrestor info.
1770  *
1771  * This structure contains HW feature bits and the size of the xsave save area.
1772  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1773  * (xsave_state) to describe the xsave layout. However, at runtime the
1774  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1775  * xsave_state structure simply represents the legacy layout of the beginning
1776  * of the xsave area.
1777  */
1778 struct xsave_info {
1779 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1780 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1781 	size_t		xsav_max_size;  /* max size save area for HW features */
1782 	size_t		ymm_size;	/* AVX: size of ymm save area */
1783 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1784 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1785 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1786 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1787 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1788 	size_t		opmask_size;	/* AVX512: size of opmask save */
1789 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1790 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1791 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1792 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1793 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1794 };
1795 
1796 
1797 /*
1798  * These constants determine how many of the elements of the
1799  * cpuid we cache in the cpuid_info data structure; the
1800  * remaining elements are accessible via the cpuid instruction.
1801  */
1802 
1803 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1804 #define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
1805 #define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1806 
1807 /*
1808  * See the big theory statement for a more detailed explanation of what some of
1809  * these members mean.
1810  */
1811 struct cpuid_info {
1812 	uint_t cpi_pass;		/* last pass completed */
1813 	/*
1814 	 * standard function information
1815 	 */
1816 	uint_t cpi_maxeax;		/* fn 0: %eax */
1817 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1818 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1819 
1820 	uint_t cpi_family;		/* fn 1: extended family */
1821 	uint_t cpi_model;		/* fn 1: extended model */
1822 	uint_t cpi_step;		/* fn 1: stepping */
1823 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1824 					/*		AMD: package/socket # */
1825 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1826 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1827 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1828 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1829 	uint_t cpi_ncache;		/* fn 2: number of elements */
1830 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1831 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1832 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1833 					/* Intel fn: 4, AMD fn: 8000001d */
1834 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1835 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1836 	struct cpuid_regs cpi_sub7[1];	/* Leaf 7, sub-leaf 1 */
1837 	/*
1838 	 * extended function information
1839 	 */
1840 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1841 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1842 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1843 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1844 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1845 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1846 
1847 	id_t cpi_coreid;		/* same coreid => strands share core */
1848 	int cpi_pkgcoreid;		/* core number within single package */
1849 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1850 					/* Intel: fn 4: %eax[31-26] */
1851 
1852 	/*
1853 	 * These values represent the number of bits that are required to store
1854 	 * information about the number of cores and threads.
1855 	 */
1856 	uint_t cpi_ncore_bits;
1857 	uint_t cpi_nthread_bits;
1858 	/*
1859 	 * supported feature information
1860 	 */
1861 	uint32_t cpi_support[6];
1862 #define	STD_EDX_FEATURES	0
1863 #define	AMD_EDX_FEATURES	1
1864 #define	TM_EDX_FEATURES		2
1865 #define	STD_ECX_FEATURES	3
1866 #define	AMD_ECX_FEATURES	4
1867 #define	STD_EBX_FEATURES	5
1868 	/*
1869 	 * Synthesized information, where known.
1870 	 */
1871 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1872 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1873 	uint32_t cpi_socket;		/* Chip package/socket type */
1874 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1875 
1876 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1877 	uint32_t cpi_apicid;
1878 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1879 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1880 					/* Intel: 1 */
1881 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1882 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1883 
1884 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1885 
1886 	/*
1887 	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1888 	 * eventually leaf 0x1F (Intel).
1889 	 */
1890 	uint_t cpi_topo_nleaves;
1891 	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1892 };
1893 
1894 
1895 static struct cpuid_info cpuid_info0;
1896 
1897 /*
1898  * These bit fields are defined by the Intel Application Note AP-485
1899  * "Intel Processor Identification and the CPUID Instruction"
1900  */
1901 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1902 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1903 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1904 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1905 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1906 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1907 
1908 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1909 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1910 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1911 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1912 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1913 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1914 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1915 #define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1916 
1917 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1918 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1919 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1920 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1921 
1922 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1923 #define	CPI_XMAXEAX_MAX		0x80000100
1924 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1925 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1926 
1927 /*
1928  * Function 4 (Deterministic Cache Parameters) macros
1929  * Defined by Intel Application Note AP-485
1930  */
1931 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1932 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1933 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1934 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1935 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1936 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1937 #define	CPI_CACHE_TYPE_DONE	0
1938 #define	CPI_CACHE_TYPE_DATA	1
1939 #define	CPI_CACHE_TYPE_INSTR	2
1940 #define	CPI_CACHE_TYPE_UNIFIED	3
1941 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1942 
1943 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1944 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1945 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1946 
1947 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1948 
1949 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1950 
1951 
1952 /*
1953  * A couple of shorthand macros to identify "later" P6-family chips
1954  * like the Pentium M and Core.  First, the "older" P6-based stuff
1955  * (loosely defined as "pre-Pentium-4"):
1956  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1957  */
1958 #define	IS_LEGACY_P6(cpi) (			\
1959 	cpi->cpi_family == 6 &&			\
1960 		(cpi->cpi_model == 1 ||		\
1961 		cpi->cpi_model == 3 ||		\
1962 		cpi->cpi_model == 5 ||		\
1963 		cpi->cpi_model == 6 ||		\
1964 		cpi->cpi_model == 7 ||		\
1965 		cpi->cpi_model == 8 ||		\
1966 		cpi->cpi_model == 0xA ||	\
1967 		cpi->cpi_model == 0xB)		\
1968 )
1969 
1970 /* A "new F6" is everything with family 6 that's not the above */
1971 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1972 
1973 /* Extended family/model support */
1974 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1975 	cpi->cpi_family >= 0xf)
1976 
1977 /*
1978  * Info for monitor/mwait idle loop.
1979  *
1980  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1981  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1982  * 2006.
1983  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1984  * Documentation Updates" #33633, Rev 2.05, December 2006.
1985  */
1986 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1987 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extensions supported */
1988 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1989 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1990 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1991 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1992 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1993 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1994 /*
1995  * Number of sub-cstates for a given c-state.
1996  */
1997 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1998 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1999 
2000 /*
2001  * XSAVE leaf 0xD enumeration
2002  */
2003 #define	CPUID_LEAFD_2_YMM_OFFSET	576
2004 #define	CPUID_LEAFD_2_YMM_SIZE		256
2005 
2006 /*
2007  * Common extended leaf names to cut down on typos.
2008  */
2009 #define	CPUID_LEAF_EXT_0		0x80000000
2010 #define	CPUID_LEAF_EXT_8		0x80000008
2011 #define	CPUID_LEAF_EXT_1d		0x8000001d
2012 #define	CPUID_LEAF_EXT_1e		0x8000001e
2013 #define	CPUID_LEAF_EXT_21		0x80000021
2014 #define	CPUID_LEAF_EXT_26		0x80000026
2015 
2016 /*
2017  * Functions we consume from cpuid_subr.c;  don't publish these in a header
2018  * file to try and keep people using the expected cpuid_* interfaces.
2019  */
2020 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2021 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2022 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2023 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2024 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2025 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2026 
2027 /*
2028  * Apply various platform-dependent restrictions where the
2029  * underlying platform means the CPU can be marked
2030  * as less capable than its cpuid instruction would imply.
2031  */
2032 #if defined(__xpv)
2033 static void
2034 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2035 {
2036 	switch (eax) {
2037 	case 1: {
2038 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2039 		    0 : CPUID_INTC_EDX_MCA;
2040 		cp->cp_edx &=
2041 		    ~(mcamask |
2042 		    CPUID_INTC_EDX_PSE |
2043 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2044 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2045 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2046 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2047 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2048 		break;
2049 	}
2050 
2051 	case 0x80000001:
2052 		cp->cp_edx &=
2053 		    ~(CPUID_AMD_EDX_PSE |
2054 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2055 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2056 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2057 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2058 		    CPUID_AMD_EDX_TSCP);
2059 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2060 		break;
2061 	default:
2062 		break;
2063 	}
2064 
2065 	switch (vendor) {
2066 	case X86_VENDOR_Intel:
2067 		switch (eax) {
2068 		case 4:
2069 			/*
2070 			 * Zero out the (ncores-per-chip - 1) field
2071 			 */
2072 			cp->cp_eax &= 0x03ffffff;
2073 			break;
2074 		default:
2075 			break;
2076 		}
2077 		break;
2078 	case X86_VENDOR_AMD:
2079 	case X86_VENDOR_HYGON:
2080 		switch (eax) {
2081 
2082 		case 0x80000001:
2083 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2084 			break;
2085 
2086 		case CPUID_LEAF_EXT_8:
2087 			/*
2088 			 * Zero out the (ncores-per-chip - 1) field
2089 			 */
2090 			cp->cp_ecx &= 0xffffff00;
2091 			break;
2092 		default:
2093 			break;
2094 		}
2095 		break;
2096 	default:
2097 		break;
2098 	}
2099 }
2100 #else
2101 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2102 #endif
2103 
2104 /*
2105  *  Some undocumented ways of patching the results of the cpuid
2106  *  instruction to permit running Solaris 10 on future cpus that
2107  *  we don't currently support.  Could be set to non-zero values
2108  *  via settings in eeprom.
2109  */
2110 
2111 uint32_t cpuid_feature_ecx_include;
2112 uint32_t cpuid_feature_ecx_exclude;
2113 uint32_t cpuid_feature_edx_include;
2114 uint32_t cpuid_feature_edx_exclude;
2115 
2116 /*
2117  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2118  */
2119 void
2120 cpuid_alloc_space(cpu_t *cpu)
2121 {
2122 	/*
2123 	 * By convention, cpu0 is the boot cpu, which is set up
2124 	 * before memory allocation is available.  All other cpus get
2125 	 * their cpuid_info struct allocated here.
2126 	 */
2127 	ASSERT(cpu->cpu_id != 0);
2128 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2129 	cpu->cpu_m.mcpu_cpi =
2130 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2131 }
2132 
2133 void
2134 cpuid_free_space(cpu_t *cpu)
2135 {
2136 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2137 	int i;
2138 
2139 	ASSERT(cpi != NULL);
2140 	ASSERT(cpi != &cpuid_info0);
2141 
2142 	/*
2143 	 * Free up any cache leaf related dynamic storage. The first entry was
2144 	 * cached from the standard cpuid storage, so we should not free it.
2145 	 */
2146 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2147 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2148 	if (cpi->cpi_cache_leaf_size > 0)
2149 		kmem_free(cpi->cpi_cache_leaves,
2150 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2151 
2152 	kmem_free(cpi, sizeof (*cpi));
2153 	cpu->cpu_m.mcpu_cpi = NULL;
2154 }
2155 
2156 #if !defined(__xpv)
2157 /*
2158  * Determine the type of the underlying platform. This is used to customize
2159  * initialization of various subsystems (e.g. TSC). determine_platform() must
2160  * only ever be called once to prevent two processors from seeing different
2161  * values of platform_type. Must be called before cpuid_pass_ident(), the
2162  * earliest consumer to execute; the identification pass will call
2163  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2164  */
2165 void
2166 determine_platform(void)
2167 {
2168 	struct cpuid_regs cp;
2169 	uint32_t base;
2170 	uint32_t regs[4];
2171 	char *hvstr = (char *)regs;
2172 
2173 	ASSERT(platform_type == -1);
2174 
2175 	platform_type = HW_NATIVE;
2176 
2177 	if (!enable_platform_detection)
2178 		return;
2179 
2180 	/*
2181 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2182 	 * vendor signature, and set platform type accordingly.
2183 	 *
2184 	 * References:
2185 	 * http://lkml.org/lkml/2008/10/1/246
2186 	 * http://kb.vmware.com/kb/1009458
2187 	 */
2188 	cp.cp_eax = 0x1;
2189 	(void) __cpuid_insn(&cp);
2190 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2191 		cp.cp_eax = 0x40000000;
2192 		(void) __cpuid_insn(&cp);
2193 		regs[0] = cp.cp_ebx;
2194 		regs[1] = cp.cp_ecx;
2195 		regs[2] = cp.cp_edx;
2196 		regs[3] = 0;
2197 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2198 			platform_type = HW_XEN_HVM;
2199 			return;
2200 		}
2201 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2202 			platform_type = HW_VMWARE;
2203 			return;
2204 		}
2205 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2206 			platform_type = HW_KVM;
2207 			return;
2208 		}
2209 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2210 			platform_type = HW_BHYVE;
2211 			return;
2212 		}
2213 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2214 			platform_type = HW_MICROSOFT;
2215 			return;
2216 		}
2217 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2218 			platform_type = HW_QEMU_TCG;
2219 			return;
2220 		}
2221 	} else {
2222 		/*
2223 		 * Check older VMware hardware versions. VMware hypervisor is
2224 		 * detected by performing an IN operation to the VMware
2225 		 * hypervisor port and checking that the value returned in
2226 		 * %ebx is the VMware hypervisor magic value.
2227 		 *
2228 		 * References: http://kb.vmware.com/kb/1009458
2229 		 */
2230 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2231 		if (regs[1] == VMWARE_HVMAGIC) {
2232 			platform_type = HW_VMWARE;
2233 			return;
2234 		}
2235 	}
2236 
2237 	/*
2238 	 * Check Xen hypervisor. In a fully virtualized domain,
2239 	 * Xen's pseudo-cpuid function returns a string representing the
2240 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2241 	 * supported cpuid function. We need at least a (base + 2) leaf value
2242 	 * to do what we want to do. Try different base values, since the
2243 	 * hypervisor might use a different one depending on whether Hyper-V
2244 	 * emulation is switched on by default or not.
2245 	 */
2246 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2247 		cp.cp_eax = base;
2248 		(void) __cpuid_insn(&cp);
2249 		regs[0] = cp.cp_ebx;
2250 		regs[1] = cp.cp_ecx;
2251 		regs[2] = cp.cp_edx;
2252 		regs[3] = 0;
2253 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2254 		    cp.cp_eax >= (base + 2)) {
2255 			platform_type &= ~HW_NATIVE;
2256 			platform_type |= HW_XEN_HVM;
2257 			return;
2258 		}
2259 	}
2260 }
2261 
2262 int
2263 get_hwenv(void)
2264 {
2265 	ASSERT(platform_type != -1);
2266 	return (platform_type);
2267 }
2268 
2269 int
2270 is_controldom(void)
2271 {
2272 	return (0);
2273 }
2274 
2275 #else
2276 
2277 int
2278 get_hwenv(void)
2279 {
2280 	return (HW_XEN_PV);
2281 }
2282 
2283 int
2284 is_controldom(void)
2285 {
2286 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2287 }
2288 
2289 #endif	/* __xpv */
2290 
2291 /*
2292  * Gather the extended topology information. This should be the same for both
2293  * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2294  */
2295 static void
2296 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2297 {
2298 	uint_t i;
2299 
2300 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2301 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2302 
2303 		bzero(regs, sizeof (struct cpuid_regs));
2304 		regs->cp_eax = leaf;
2305 		regs->cp_ecx = i;
2306 
2307 		(void) __cpuid_insn(regs);
2308 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2309 		    CPUID_AMD_8X26_TYPE_DONE) {
2310 			break;
2311 		}
2312 	}
2313 
2314 	cpi->cpi_topo_nleaves = i;
2315 }
2316 
2317 /*
2318  * Make sure that we have gathered all of the CPUID leaves that we might need to
2319  * determine topology. We assume that the standard leaf 1 has already been done
2320  * and that xmaxeax has already been calculated.
2321  */
2322 static void
2323 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2324 {
2325 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2326 
2327 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2328 		struct cpuid_regs *cp;
2329 
2330 		cp = &cpi->cpi_extd[8];
2331 		cp->cp_eax = CPUID_LEAF_EXT_8;
2332 		(void) __cpuid_insn(cp);
2333 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2334 	}
2335 
2336 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2337 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2338 		struct cpuid_regs *cp;
2339 
2340 		cp = &cpi->cpi_extd[0x1e];
2341 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2342 		(void) __cpuid_insn(cp);
2343 	}
2344 
2345 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2346 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2347 	}
2348 }
2349 
2350 /*
2351  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2352  * it to everything else. If not, and we're on an AMD system where 8000001e is
2353  * valid, then we use that. Otherwise, we fall back to the default value for the
2354  * APIC ID in leaf 1.
2355  */
2356 static uint32_t
2357 cpuid_gather_apicid(struct cpuid_info *cpi)
2358 {
2359 	/*
2360 	 * Leaf B changes based on the arguments to it. Because we don't cache
2361 	 * it, we need to gather it again.
2362 	 */
2363 	if (cpi->cpi_maxeax >= 0xB) {
2364 		struct cpuid_regs regs;
2365 		struct cpuid_regs *cp;
2366 
2367 		cp = &regs;
2368 		cp->cp_eax = 0xB;
2369 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2370 		(void) __cpuid_insn(cp);
2371 
2372 		if (cp->cp_ebx != 0) {
2373 			return (cp->cp_edx);
2374 		}
2375 	}
2376 
2377 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2378 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2379 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2380 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2381 		return (cpi->cpi_extd[0x1e].cp_eax);
2382 	}
2383 
2384 	return (CPI_APIC_ID(cpi));
2385 }
2386 
2387 /*
2388  * For AMD processors, attempt to calculate the number of chips and cores that
2389  * exist. The way that we do this varies based on the generation, because the
2390  * generations themselves have changed dramatically.
2391  *
2392  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2393  * However, with the advent of family 17h (Zen) it actually tells us the number
2394  * of threads, so we need to look at leaf 0x8000001e if available to determine
2395  * its value. Otherwise, for all prior families, the number of enabled cores is
2396  * the same as threads.
2397  *
2398  * If we do not have leaf 0x80000008, then we assume that this processor does
2399  * not have anything. AMD's older CPUID specification says there's no reason to
2400  * fall back to leaf 1.
2401  *
2402  * In some virtualization cases we will not have leaf 8000001e or it will be
2403  * zero. When that happens we assume the number of threads is one.
2404  */
2405 static void
2406 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2407 {
2408 	uint_t nthreads, nthread_per_core;
2409 
2410 	nthreads = nthread_per_core = 1;
2411 
2412 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2413 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2414 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2415 		nthreads = CPI_CPU_COUNT(cpi);
2416 	}
2417 
2418 	/*
2419 	 * For us to have threads, and know about it, we have to be at least at
2420 	 * family 17h and have the cpuid bit that says we have extended
2421 	 * topology.
2422 	 */
2423 	if (cpi->cpi_family >= 0x17 &&
2424 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2425 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2426 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2427 	}
2428 
2429 	*ncpus = nthreads;
2430 	*ncores = nthreads / nthread_per_core;
2431 }
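
/*
 * An illustrative example with hypothetical register values (not taken from
 * any specific part): if leaf 0x80000008 %ecx[7:0] reports 15, then
 * nthreads = 15 + 1 = 16. If this is a family 17h part with TOPOEXT and leaf
 * 0x8000001e %ebx[15:8] reports 1, then nthread_per_core = 1 + 1 = 2, so we
 * end up with *ncpus = 16 and *ncores = 16 / 2 = 8. On a pre-17h part,
 * nthread_per_core stays 1 and the core count equals the thread count, which
 * matches the behavior described above.
 */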
2432 
2433 /*
2434  * Seed the initial values for the cores and threads for an Intel based
2435  * processor. These values will be overwritten if we detect that the processor
2436  * supports CPUID leaf 0xb.
2437  */
2438 static void
2439 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2440 {
2441 	/*
2442 	 * Only seed the number of physical cores from the first level leaf 4
2443 	 * information. The number of threads there indicates how many share the
2444 	 * L1 cache, which may or may not have anything to do with the number of
2445 	 * logical CPUs per core.
2446 	 */
2447 	if (cpi->cpi_maxeax >= 4) {
2448 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2449 	} else {
2450 		*ncores = 1;
2451 	}
2452 
2453 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2454 		*ncpus = CPI_CPU_COUNT(cpi);
2455 	} else {
2456 		*ncpus = *ncores;
2457 	}
2458 }
2459 
2460 static boolean_t
2461 cpuid_leafB_getids(cpu_t *cpu)
2462 {
2463 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2464 	struct cpuid_regs regs;
2465 	struct cpuid_regs *cp;
2466 
2467 	if (cpi->cpi_maxeax < 0xB)
2468 		return (B_FALSE);
2469 
2470 	cp = &regs;
2471 	cp->cp_eax = 0xB;
2472 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2473 
2474 	(void) __cpuid_insn(cp);
2475 
2476 	/*
2477 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2478 	 * indicates that the extended topology enumeration leaf is
2479 	 * available.
2480 	 */
2481 	if (cp->cp_ebx != 0) {
2482 		uint32_t x2apic_id = 0;
2483 		uint_t coreid_shift = 0;
2484 		uint_t ncpu_per_core = 1;
2485 		uint_t chipid_shift = 0;
2486 		uint_t ncpu_per_chip = 1;
2487 		uint_t i;
2488 		uint_t level;
2489 
2490 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2491 			cp->cp_eax = 0xB;
2492 			cp->cp_ecx = i;
2493 
2494 			(void) __cpuid_insn(cp);
2495 			level = CPI_CPU_LEVEL_TYPE(cp);
2496 
2497 			if (level == 1) {
2498 				x2apic_id = cp->cp_edx;
2499 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2500 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2501 			} else if (level == 2) {
2502 				x2apic_id = cp->cp_edx;
2503 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2504 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2505 			}
2506 		}
2507 
2508 		/*
2509 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2510 		 */
2511 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2512 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2513 		    ncpu_per_core;
2514 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2515 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2516 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2517 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2518 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2519 		cpi->cpi_compunitid = cpi->cpi_coreid;
2520 
2521 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2522 			cpi->cpi_nthread_bits = coreid_shift;
2523 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2524 		}
2525 
2526 		return (B_TRUE);
2527 	} else {
2528 		return (B_FALSE);
2529 	}
2530 }
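
/*
 * A worked example with made-up leaf B output (not any particular CPU): say
 * the SMT level reports coreid_shift = 1 and ncpu_per_core = 2, the core
 * level reports chipid_shift = 4 and ncpu_per_chip = 16, and the x2APIC ID is
 * 0x13. The assignments above then produce:
 *
 *	cpi_ncore_per_chip	= 16 / 2 = 8
 *	cpi_chipid		= 0x13 >> 4 = 1
 *	cpi_clogid		= 0x13 & 0xf = 0x3
 *	cpi_coreid		= 0x13 >> 1 = 0x9
 *	cpi_pkgcoreid		= 0x3 >> 1 = 0x1
 *	cpi_nthread_bits	= 1, cpi_ncore_bits = 4 - 1 = 3
 */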
2531 
2532 static void
2533 cpuid_intel_getids(cpu_t *cpu, void *feature)
2534 {
2535 	uint_t i;
2536 	uint_t chipid_shift = 0;
2537 	uint_t coreid_shift = 0;
2538 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2539 
2540 	/*
2541 	 * There are no compute units or processor nodes currently on Intel.
2542 	 * Always set these to one.
2543 	 */
2544 	cpi->cpi_procnodes_per_pkg = 1;
2545 	cpi->cpi_cores_per_compunit = 1;
2546 
2547 	/*
2548 	 * If cpuid Leaf B is present, use that to try and get this information.
2549 	 * It will be the most accurate for Intel CPUs.
2550 	 */
2551 	if (cpuid_leafB_getids(cpu))
2552 		return;
2553 
2554 	/*
2555 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2556 	 * and ncore_per_chip. These represent the largest power of two values
2557 	 * that we need to cover all of the IDs in the system. Therefore, we use
2558 	 * those values to seed the number of bits needed to cover information
2559 	 * in the case when leaf B is not available. These values will probably
2560 	 * be larger than required, but that's OK.
2561 	 */
2562 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2563 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2564 
2565 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2566 		chipid_shift++;
2567 
2568 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2569 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2570 
2571 	if (is_x86_feature(feature, X86FSET_CMP)) {
2572 		/*
2573 		 * Multi-core (and possibly multi-threaded)
2574 		 * processors.
2575 		 */
2576 		uint_t ncpu_per_core = 0;
2577 
2578 		if (cpi->cpi_ncore_per_chip == 1)
2579 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2580 		else if (cpi->cpi_ncore_per_chip > 1)
2581 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2582 			    cpi->cpi_ncore_per_chip;
2583 		/*
2584 		 * 8bit APIC IDs on dual core Pentiums
2585 		 * look like this:
2586 		 *
2587 		 * +-----------------------+------+------+
2588 		 * | Physical Package ID   |  MC  |  HT  |
2589 		 * +-----------------------+------+------+
2590 		 * <------- chipid -------->
2591 		 * <------- coreid --------------->
2592 		 *			   <--- clogid -->
2593 		 *			   <------>
2594 		 *			   pkgcoreid
2595 		 *
2596 		 * Where the number of bits necessary to
2597 		 * represent MC and HT fields together equals
2598 		 * the minimum number of bits necessary to
2599 		 * store the value of cpi->cpi_ncpu_per_chip.
2600 		 * Of those bits, the MC part uses the number
2601 		 * of bits necessary to store the value of
2602 		 * cpi->cpi_ncore_per_chip.
2603 		 */
2604 		for (i = 1; i < ncpu_per_core; i <<= 1)
2605 			coreid_shift++;
2606 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2607 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2608 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2609 		/*
2610 		 * Single-core multi-threaded processors.
2611 		 */
2612 		cpi->cpi_coreid = cpi->cpi_chipid;
2613 		cpi->cpi_pkgcoreid = 0;
2614 	} else {
2615 		/*
2616 		 * Single-core single-thread processors.
2617 		 */
2618 		cpi->cpi_coreid = cpu->cpu_id;
2619 		cpi->cpi_pkgcoreid = 0;
2620 	}
2621 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2622 	cpi->cpi_compunitid = cpi->cpi_coreid;
2623 }
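
/*
 * A worked example for the path where leaf B is unavailable (hypothetical
 * values): consider a dual-core package with two threads per core, so
 * cpi_ncpu_per_chip = 4 and cpi_ncore_per_chip = 2. The loops above produce
 * chipid_shift = 2 and, with ncpu_per_core = 2, coreid_shift = 1. For an
 * APIC ID of 0x7:
 *
 *	cpi_chipid	= 0x7 >> 2 = 1
 *	cpi_clogid	= 0x7 & 0x3 = 3
 *	cpi_coreid	= 0x7 >> 1 = 3
 *	cpi_pkgcoreid	= 3 >> 1 = 1
 *
 * which matches the APIC ID layout diagram in the CMP case above.
 */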
2624 
2625 /*
2626  * Historically, AMD has had CMP chips with only a single thread per core.
2627  * However, starting in family 17h (Zen), this has changed and they now have
2628  * multiple threads. Our internal core id needs to be a unique value.
2629  *
2630  * To determine the core id of an AMD system, if we're from a family before 17h,
2631  * then we just use the cpu id, as that gives us a good value that will be
2632  * unique for each core. If instead, we're on family 17h or later, then we need
2633  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2634  * how many threads are in the core. Based on that, we'll shift the APIC ID.
2635  * We can't use the normal core id in that leaf as it's only unique within the
2636  * socket, which is perfect for cpi_pkgcoreid, but not us.
2637  */
2638 static id_t
2639 cpuid_amd_get_coreid(cpu_t *cpu)
2640 {
2641 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2642 
2643 	if (cpi->cpi_family >= 0x17 &&
2644 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2645 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2646 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2647 		if (nthreads > 1) {
2648 			VERIFY3U(nthreads, ==, 2);
2649 			return (cpi->cpi_apicid >> 1);
2650 		}
2651 	}
2652 
2653 	return (cpu->cpu_id);
2654 }
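
/*
 * For example (hypothetical APIC ID): on a family 17h part where leaf
 * 0x8000001e reports two threads per core, an APIC ID of 0xb yields a core
 * id of 0xb >> 1 = 0x5. On anything earlier, the core id is simply
 * cpu->cpu_id.
 */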
2655 
2656 /*
2657  * Calculating IDs on AMD is a more challenging task. This is notable
2658  * because of the following two facts:
2659  *
2660  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2661  *     also no way to get an actual unique core id from the system. As such, we
2662  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2663  *     however, guarantee that sibling cores of a chip will have sequential
2664  *     coreids starting at a multiple of the number of cores per chip - that is
2665  *     usually the case, but if the APIC IDs have been set up in a different
2666  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2667  *
2668  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2669  *     called compute units. These compute units share the L1I cache, L2 cache,
2670  *     and the FPU. To deal with this, a new topology leaf was added in
2671  *     0x8000001e. However, parts of this leaf have different meanings
2672  *     once we get to family 0x17.
2673  */
2674 
2675 static void
2676 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2677 {
2678 	int i, first_half, coreidsz;
2679 	uint32_t nb_caps_reg;
2680 	uint_t node2_1;
2681 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2682 	struct cpuid_regs *cp;
2683 
2684 	/*
2685 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2686 	 * hasn't been stripped by virtualization). We always set the compute
2687 	 * unit id to the same value. Also, initialize the default number of
2688 	 * cores per compute unit and nodes per package. This will be
2689 	 * overwritten when we know information about a particular family.
2690 	 */
2691 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2692 	cpi->cpi_compunitid = cpi->cpi_coreid;
2693 	cpi->cpi_cores_per_compunit = 1;
2694 	cpi->cpi_procnodes_per_pkg = 1;
2695 
2696 	/*
2697 	 * To construct the logical ID, we need to determine how many APIC IDs
2698 	 * are dedicated to the cores and threads. This is provided for us in
2699 	 * 0x80000008. However, if it's not present (say due to virtualization),
2700 	 * then we assume it's one. This should be present on all 64-bit AMD
2701 	 * processors.  It was added in family 0xf (Hammer).
2702 	 */
2703 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2704 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2705 
2706 		/*
2707 		 * In AMD parlance chip is really a node while illumos
2708 		 * uses chip as equivalent to socket/package.
2709 		 */
2710 		if (coreidsz == 0) {
2711 			/* Use legacy method */
2712 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2713 				coreidsz++;
2714 			if (coreidsz == 0)
2715 				coreidsz = 1;
2716 		}
2717 	} else {
2718 		/* Assume single-core part */
2719 		coreidsz = 1;
2720 	}
2721 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2722 
2723 	/*
2724 	 * The package core ID varies depending on the family. While it may be
2725 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2726 	 * this value is the core id in the given node. For non-virtualized
2727 	 * family 17h, we need to take the logical core id and shift off the
2728 	 * threads like we do when getting the core id.  Otherwise, we can use
2729 	 * the clogid as is. When family 17h is virtualized, the clogid should
2730 	 * be sufficient: if we don't have valid data in the leaf, then we
2731 	 * won't think we have SMT, in which case the cpi_clogid is all we
2732 	 * need.
2733 	 */
2734 	if (cpi->cpi_family >= 0x17 &&
2735 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2736 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2737 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2738 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2739 		if (nthreads > 1) {
2740 			VERIFY3U(nthreads, ==, 2);
2741 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2742 		} else {
2743 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2744 		}
2745 	} else {
2746 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2747 	}
2748 
2749 	/*
2750 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2751 	 * (bulldozer) or newer, then we can derive all of this from leaf
2752 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2753 	 */
2754 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2755 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2756 		cp = &cpi->cpi_extd[0x1e];
2757 
2758 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2759 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2760 
2761 		/*
2762 		 * For Bulldozer-era CPUs, recalculate the compute unit
2763 		 * information.
2764 		 */
2765 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2766 			cpi->cpi_cores_per_compunit =
2767 			    BITX(cp->cp_ebx, 15, 8) + 1;
2768 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2769 			    (cpi->cpi_ncore_per_chip /
2770 			    cpi->cpi_cores_per_compunit) *
2771 			    (cpi->cpi_procnodeid /
2772 			    cpi->cpi_procnodes_per_pkg);
2773 		}
2774 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2775 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2776 	} else if (cpi->cpi_family == 0x10) {
2777 		/*
2778 		 * See if we are a multi-node processor.
2779 		 * All processors in the system have the same number of nodes
2780 		 */
2781 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2782 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2783 			/* Single-node */
2784 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2785 			    coreidsz);
2786 		} else {
2787 
2788 			/*
2789 			 * Multi-node revision D (2 nodes per package
2790 			 * are supported)
2791 			 */
2792 			cpi->cpi_procnodes_per_pkg = 2;
2793 
2794 			first_half = (cpi->cpi_pkgcoreid <=
2795 			    (cpi->cpi_ncore_per_chip/2 - 1));
2796 
2797 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2798 				/* We are BSP */
2799 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2800 			} else {
2801 
2802 				/* We are AP */
2803 				/* NodeId[2:1] bits to use for reading F3xe8 */
2804 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2805 
2806 				nb_caps_reg =
2807 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2808 
2809 				/*
2810 				 * Check IntNodeNum bit (31:30, but bit 31 is
2811 				 * always 0 on dual-node processors)
2812 				 */
2813 				if (BITX(nb_caps_reg, 30, 30) == 0)
2814 					cpi->cpi_procnodeid = node2_1 +
2815 					    !first_half;
2816 				else
2817 					cpi->cpi_procnodeid = node2_1 +
2818 					    first_half;
2819 			}
2820 		}
2821 	} else {
2822 		cpi->cpi_procnodeid = 0;
2823 	}
2824 
2825 	cpi->cpi_chipid =
2826 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2827 
2828 	cpi->cpi_ncore_bits = coreidsz;
2829 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2830 	    cpi->cpi_ncore_per_chip);
2831 }
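
/*
 * A rough example with made-up values: if leaf 0x80000008 %ecx[15:12]
 * (ApicIdCoreIdSize) reports 4, then coreidsz = 4 and an APIC ID of 0x1a
 * gives cpi_clogid = 0x1a & 0xf = 0xa. On a family 17h part reporting two
 * threads per core, cpi_pkgcoreid = 0xa >> 1 = 0x5; otherwise
 * cpi_pkgcoreid = 0xa.
 */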
2832 
2833 static void
2834 spec_uarch_flush_noop(void)
2835 {
2836 }
2837 
2838 /*
2839  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2840  * MDS-related micro-architectural state that would normally be flushed by
2841  * calling x86_md_clear().
2842  */
2843 static void
2844 spec_uarch_flush_msr(void)
2845 {
2846 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2847 }
2848 
2849 /*
2850  * This function points to a function that will flush certain
2851  * micro-architectural state on the processor. This flush is used to mitigate
2852  * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2853  * This function can point to one of three functions:
2854  *
2855  * - A noop, which is used either because we are vulnerable but do not have
2856  *   microcode available to help deal with a fix, or because we aren't
2857  *   vulnerable at all.
2858  *
2859  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2860  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2861  *   however, it only flushes the MDS related micro-architectural state on the
2862  *   current hyperthread; it does not do anything for the twin.
2863  *
2864  * - x86_md_clear which will flush the MDS related state. This is done when we
2865  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2866  *   (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2867  *   can clear it (RFDS_CLEAR is set).
2868  */
2869 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2870 
2871 static void
2872 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2873 {
2874 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2875 
2876 	/* Non-Intel doesn't concern us here. */
2877 	if (cpi->cpi_vendor != X86_VENDOR_Intel)
2878 		return;
2879 
2880 	/*
2881 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2882 	 * has been fixed in hardware, it doesn't cover everything related to
2883 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2884 	 * need to mitigate this.
2885 	 *
2886 	 * We must ALSO check the case where RFDS_NO is not set but RFDS_CLEAR
2887 	 * is, to cover the smaller set of parts affected by RFDS.
2888 	 */
2889 
2890 	if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2891 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2892 	    (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2893 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2894 		const uint8_t nop = NOP_INSTR;
2895 		uint8_t *md = (uint8_t *)x86_md_clear;
2896 
2897 		*md = nop;
2898 	}
2899 
2900 	membar_producer();
2901 }
2902 
2903 static void
2904 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2905 {
2906 	boolean_t need_l1d, need_mds, need_rfds;
2907 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2908 
2909 	/*
2910 	 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2911 	 * in hardware, then there's nothing left for us to do for enabling
2912 	 * the flush. We can also go ahead and say that SMT exclusion is
2913 	 * unnecessary.
2914 	 */
2915 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2916 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2917 	    is_x86_feature(featureset, X86FSET_MDS_NO) &&
2918 	    is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2919 		extern int smt_exclusion;
2920 		smt_exclusion = 0;
2921 		spec_uarch_flush = spec_uarch_flush_noop;
2922 		membar_producer();
2923 		return;
2924 	}
2925 
2926 	/*
2927 	 * The locations where we need to perform an L1D flush are required both
2928 	 * for mitigating L1TF and MDS. When verw support is present in
2929 	 * microcode, then the L1D flush will take care of doing that as well.
2930 	 * However, if we have a system where RDCL_NO is present, but we don't
2931 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2932 	 * L1D flush.
2933 	 */
2934 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2935 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2936 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2937 		need_l1d = B_TRUE;
2938 	} else {
2939 		need_l1d = B_FALSE;
2940 	}
2941 
2942 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2943 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2944 		need_mds = B_TRUE;
2945 	} else {
2946 		need_mds = B_FALSE;
2947 	}
2948 
2949 	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2950 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2951 		need_rfds = B_TRUE;
2952 	} else {
2953 		need_rfds = B_FALSE;
2954 	}
2955 
2956 	if (need_l1d) {
2957 		/*
2958 		 * As of February 2024, no CPU needs L1D *and* RFDS mitigation
2959 		 * together. If the following VERIFY trips, we need to add
2960 		 * further fixes here.
2961 		 */
2962 		VERIFY(!need_rfds);
2963 		spec_uarch_flush = spec_uarch_flush_msr;
2964 	} else if (need_mds || need_rfds) {
2965 		spec_uarch_flush = x86_md_clear;
2966 	} else {
2967 		/*
2968 		 * We have no hardware mitigations available to us.
2969 		 */
2970 		spec_uarch_flush = spec_uarch_flush_noop;
2971 	}
2972 	membar_producer();
2973 }
2974 
2975 /*
2976  * We default to enabling Return Stack Buffer (RSB) mitigations.
2977  *
2978  * We used to skip RSB mitigations with Intel eIBRS, but developments around
2979  * post-barrier RSB (PBRSB) guessing suggests we should enable Intel RSB
2980  * post-barrier RSB (PBRSB) guessing suggest we should enable Intel RSB
2981  * the bug has been fixed.
2982  *
2983  * The current decisions for using, or ignoring, a RSB software stuffing
2984  * sequence are expressed by the following table:
2985  *
2986  * +-------+------------+-----------------+--------+
2987  * | eIBRS |  PBRSB_NO  |  context switch | vmexit |
2988  * +-------+------------+-----------------+--------+
2989  * |   Yes |     No     |  stuff          | stuff  |
2990  * |   Yes |     Yes    |  ignore         | ignore |
2991  * |   No  |     No     |  stuff          | ignore |
2992  * +-------+------------+-----------------+--------+
2993  *
2994  * Note that if an Intel CPU has no eIBRS, it will never enumerate PBRSB_NO,
2995  * because machines with no eIBRS do not have a problem with PBRSB overflow.
2996  * See the Intel document cited below for details.
2997  *
2998  * Also note that AMD AUTO_IBRS has no PBRSB problem, so it is not included in
2999  * the table above, and that there is no situation where vmexit stuffing is
3000  * needed, but context-switch stuffing isn't.
3001  */
3002 
3003 /* BEGIN CSTYLED */
3004 /*
3005  * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/post-barrier-return-stack-buffer-predictions.html
3006  */
3007 /* END CSTYLED */
3008 
3009 /*
3010  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
3011  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
3012  * also states that as long as SMEP is enabled and we maintain at least one
3013  * page between the kernel and user space (we have much more of a red zone),
3014  * then we do not need to clear the RSB. We constrain this to only when
3015  * Automatic IBRS is present.
3016  */
3017 static void
3018 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
3019 {
3020 	const uint8_t ret = RET_INSTR;
3021 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3022 	uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
3023 
3024 	switch (mit) {
3025 	case X86_SPECTREV2_AUTO_IBRS:
3026 	case X86_SPECTREV2_DISABLED:
3027 		/* Don't bother with any RSB stuffing! */
3028 		*stuff = ret;
3029 		*vmx_stuff = ret;
3030 		break;
3031 	case X86_SPECTREV2_RETPOLINE:
3032 		/*
3033 		 * The Intel document on Post-Barrier RSB says that processors
3034 		 * without eIBRS do not have PBRSB problems upon VMEXIT.
3035 		 */
3036 		VERIFY(!intel_pbrsb_no);
3037 		VERIFY3U(*stuff, !=, ret);
3038 		*vmx_stuff = ret;
3039 		break;
3040 	default:
3041 		/*
3042 		 * eIBRS is all that's left.  If CPU claims PBRSB is fixed,
3043 		 * don't use the RSB mitigation in either case.  Otherwise
3044 		 * both vmexit and context-switching require the software
3045 		 * mitigation.
3046 		 */
3047 		if (intel_pbrsb_no) {
3048 			/* CPU claims PBRSB problems are fixed. */
3049 			*stuff = ret;
3050 			*vmx_stuff = ret;
3051 		}
3052 		VERIFY3U(*stuff, ==, *vmx_stuff);
3053 		break;
3054 	}
3055 }
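
/*
 * The patching above relies on RET_INSTR being a single-byte return: writing
 * it over the first byte of x86_rsb_stuff or x86_rsb_stuff_vmexit turns that
 * routine into an immediate return, so callers pay essentially nothing when
 * stuffing has been disabled. Conceptually (a sketch, not the actual routine
 * layout):
 *
 *	enabled:  <stuffing sequence> ... ret
 *	disabled: ret
 */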
3056 
3057 static void
3058 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3059 {
3060 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3061 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3062 	    "_r14", "_r15" };
3063 	const uint_t nthunks = ARRAY_SIZE(thunks);
3064 	const char *type;
3065 	uint_t i;
3066 
3067 	if (mit == x86_spectrev2_mitigation)
3068 		return;
3069 
3070 	switch (mit) {
3071 	case X86_SPECTREV2_RETPOLINE:
3072 		type = "gen";
3073 		break;
3074 	case X86_SPECTREV2_AUTO_IBRS:
3075 	case X86_SPECTREV2_ENHANCED_IBRS:
3076 	case X86_SPECTREV2_DISABLED:
3077 		type = "jmp";
3078 		break;
3079 	default:
3080 		panic("asked to update retpoline state with unknown state!");
3081 	}
3082 
3083 	for (i = 0; i < nthunks; i++) {
3084 		uintptr_t source, dest;
3085 		int ssize, dsize;
3086 		char sourcebuf[64], destbuf[64];
3087 
3088 		(void) snprintf(destbuf, sizeof (destbuf),
3089 		    "__x86_indirect_thunk%s", thunks[i]);
3090 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
3091 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
3092 
3093 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3094 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3095 		VERIFY3U(source, !=, 0);
3096 		VERIFY3U(dest, !=, 0);
3097 		VERIFY3S(dsize, >=, ssize);
3098 		bcopy((void *)source, (void *)dest, ssize);
3099 	}
3100 }
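
/*
 * For example, with mit == X86_SPECTREV2_RETPOLINE the loop above copies the
 * body of __x86_indirect_thunk_gen_rax over __x86_indirect_thunk_rax (and
 * likewise for the other registers), while the IBRS and DISABLED cases copy
 * the plain "jmp" variants instead.
 */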
3101 
3102 static void
3103 cpuid_enable_enhanced_ibrs(void)
3104 {
3105 	uint64_t val;
3106 
3107 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3108 	val |= IA32_SPEC_CTRL_IBRS;
3109 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3110 }
3111 
3112 static void
3113 cpuid_enable_auto_ibrs(void)
3114 {
3115 	uint64_t val;
3116 
3117 	val = rdmsr(MSR_AMD_EFER);
3118 	val |= AMD_EFER_AIBRSE;
3119 	wrmsr(MSR_AMD_EFER, val);
3120 }
3121 
3122 /*
3123  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3124  * we can disable TSX, we do so.
3125  *
3126  * This determination is done only on the boot CPU, potentially after loading
3127  * updated microcode.
3128  */
3129 static void
3130 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3131 {
3132 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3133 
3134 	VERIFY(cpu->cpu_id == 0);
3135 
3136 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3137 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3138 		return;
3139 	}
3140 
3141 	if (x86_disable_taa) {
3142 		x86_taa_mitigation = X86_TAA_DISABLED;
3143 		return;
3144 	}
3145 
3146 	/*
3147 	 * If we do not have the ability to disable TSX, then our only
3148 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3149 	 * MDS mitigation as described above.  The latter relies upon us having
3150 	 * configured MDS mitigations correctly! This includes disabling SMT if
3151 	 * we want cross-CPU-thread protection.
3152 	 */
3153 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3154 		/*
3155 		 * It's not clear whether any parts will enumerate TAA_NO
3156 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3157 		 */
3158 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3159 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3160 			return;
3161 		}
3162 
3163 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3164 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3165 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3166 		} else {
3167 			x86_taa_mitigation = X86_TAA_NOTHING;
3168 		}
3169 		return;
3170 	}
3171 
3172 	/*
3173 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3174 	 * enough in boot.
3175 	 *
3176 	 * Otherwise, we'll fall back to causing transactions to abort as our
3177 	 * mitigation. TSX-using code will always take the fallback path.
3178 	 */
3179 	if (cpi->cpi_pass < 4) {
3180 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3181 	} else {
3182 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3183 	}
3184 }
3185 
3186 /*
3187  * As mentioned, we should only touch the MSR when we've got a suitable
3188  * microcode loaded on this CPU.
3189  */
3190 static void
3191 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3192 {
3193 	uint64_t val;
3194 
3195 	switch (taa) {
3196 	case X86_TAA_TSX_DISABLE:
3197 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3198 			return;
3199 		val = rdmsr(MSR_IA32_TSX_CTRL);
3200 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3201 		wrmsr(MSR_IA32_TSX_CTRL, val);
3202 		break;
3203 	case X86_TAA_TSX_FORCE_ABORT:
3204 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3205 			return;
3206 		val = rdmsr(MSR_IA32_TSX_CTRL);
3207 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3208 		wrmsr(MSR_IA32_TSX_CTRL, val);
3209 		break;
3210 	case X86_TAA_HW_MITIGATED:
3211 	case X86_TAA_MD_CLEAR:
3212 	case X86_TAA_DISABLED:
3213 	case X86_TAA_NOTHING:
3214 		break;
3215 	}
3216 }
3217 
3218 static void
3219 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3220 {
3221 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3222 	x86_spectrev2_mitigation_t v2mit;
3223 
3224 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3225 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3226 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3227 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3228 			add_x86_feature(featureset, X86FSET_IBPB);
3229 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3230 			add_x86_feature(featureset, X86FSET_IBRS);
3231 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3232 			add_x86_feature(featureset, X86FSET_STIBP);
3233 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3234 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3235 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3236 			add_x86_feature(featureset, X86FSET_SSBD);
3237 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3238 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3239 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3240 			add_x86_feature(featureset, X86FSET_SSB_NO);
3241 
3242 		/*
3243 		 * Rather than Enhanced IBRS, AMD has a different feature: a
3244 		 * bit in EFER that can be enabled and will basically do the
3245 		 * right thing while executing in the kernel.
3246 		 */
3247 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3248 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3249 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3250 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3251 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3252 		}
3253 
3254 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3255 	    cpi->cpi_maxeax >= 7) {
3256 		struct cpuid_regs *ecp;
3257 		ecp = &cpi->cpi_std[7];
3258 
3259 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3260 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3261 		}
3262 
3263 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3264 			add_x86_feature(featureset, X86FSET_IBRS);
3265 			add_x86_feature(featureset, X86FSET_IBPB);
3266 		}
3267 
3268 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3269 			add_x86_feature(featureset, X86FSET_STIBP);
3270 		}
3271 
3272 		/*
3273 		 * Don't read the arch caps MSR on xpv where we lack the
3274 		 * on_trap().
3275 		 */
3276 #ifndef __xpv
3277 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3278 			on_trap_data_t otd;
3279 
3280 			/*
3281 			 * Be paranoid and assume we'll get a #GP.
3282 			 */
3283 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3284 				uint64_t reg;
3285 
3286 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3287 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3288 					add_x86_feature(featureset,
3289 					    X86FSET_RDCL_NO);
3290 				}
3291 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3292 					add_x86_feature(featureset,
3293 					    X86FSET_IBRS_ALL);
3294 				}
3295 				if (reg & IA32_ARCH_CAP_RSBA) {
3296 					add_x86_feature(featureset,
3297 					    X86FSET_RSBA);
3298 				}
3299 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3300 					add_x86_feature(featureset,
3301 					    X86FSET_L1D_VM_NO);
3302 				}
3303 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3304 					add_x86_feature(featureset,
3305 					    X86FSET_SSB_NO);
3306 				}
3307 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3308 					add_x86_feature(featureset,
3309 					    X86FSET_MDS_NO);
3310 				}
3311 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3312 					add_x86_feature(featureset,
3313 					    X86FSET_TSX_CTRL);
3314 				}
3315 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3316 					add_x86_feature(featureset,
3317 					    X86FSET_TAA_NO);
3318 				}
3319 				if (reg & IA32_ARCH_CAP_RFDS_NO) {
3320 					add_x86_feature(featureset,
3321 					    X86FSET_RFDS_NO);
3322 				}
3323 				if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3324 					add_x86_feature(featureset,
3325 					    X86FSET_RFDS_CLEAR);
3326 				}
3327 				if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3328 					add_x86_feature(featureset,
3329 					    X86FSET_PBRSB_NO);
3330 				}
3331 			}
3332 			no_trap();
3333 		}
3334 #endif	/* !__xpv */
3335 
3336 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3337 			add_x86_feature(featureset, X86FSET_SSBD);
3338 
3339 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3340 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3341 	}
3342 
3343 	/*
3344 	 * Take care of certain mitigations on non-boot CPUs. The boot CPU
3345 	 * will have already run this function and determined what we need to
3346 	 * do. This gives us a hook for per-HW thread mitigations such as
3347 	 * enhanced IBRS, or disabling TSX.
3348 	 */
3349 	if (cpu->cpu_id != 0) {
3350 		switch (x86_spectrev2_mitigation) {
3351 		case X86_SPECTREV2_ENHANCED_IBRS:
3352 			cpuid_enable_enhanced_ibrs();
3353 			break;
3354 		case X86_SPECTREV2_AUTO_IBRS:
3355 			cpuid_enable_auto_ibrs();
3356 			break;
3357 		default:
3358 			break;
3359 		}
3360 
3361 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3362 		return;
3363 	}
3364 
3365 	/*
3366 	 * Go through and initialize various security mechanisms that we should
3367 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3368 	 * TAA.
3369 	 */
3370 
3371 	/*
3372 	 * By default we've come in with retpolines enabled. Check whether we
3373 	 * should disable them or enable enhanced or automatic IBRS. RSB
3374 	 * stuffing is enabled by default. Note, we do not allow the use of AMD
3375 	 * optimized retpolines as it was disclosed by AMD in March 2022 that
3376 	 * they were still vulnerable. Prior to that point, we used them.
3377 	 */
3378 	if (x86_disable_spectrev2 != 0) {
3379 		v2mit = X86_SPECTREV2_DISABLED;
3380 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3381 		cpuid_enable_auto_ibrs();
3382 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3383 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3384 		cpuid_enable_enhanced_ibrs();
3385 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3386 	} else {
3387 		v2mit = X86_SPECTREV2_RETPOLINE;
3388 	}
3389 
3390 	cpuid_patch_retpolines(v2mit);
3391 	cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3392 	x86_spectrev2_mitigation = v2mit;
3393 	membar_producer();
3394 
3395 	/*
3396 	 * We need to determine what changes are required for mitigating L1TF
3397 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3398 	 * is required.
3399 	 *
3400 	 * If any of these are present, then we need to flush u-arch state at
3401 	 * various points. For MDS, we need to do so whenever we change to a
3402 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3403 	 * flush the L1D cache at VM entry. When we have microcode that handles
3404 	 * MDS, the L1D flush also clears the other u-arch state that the
3405 	 * md_clear does.
3406 	 */
3407 
3408 	/*
3409 	 * Update whether or not we need to be taking explicit action against
3410 	 * MDS or RFDS.
3411 	 */
3412 	cpuid_update_md_clear(cpu, featureset);
3413 
3414 	/*
3415 	 * Determine whether SMT exclusion is required and whether or not we
3416 	 * need to perform an l1d flush.
3417 	 */
3418 	cpuid_update_l1d_flush(cpu, featureset);
3419 
3420 	/*
3421 	 * Determine what our mitigation strategy should be for TAA and then
3422 	 * also apply TAA mitigations.
3423 	 */
3424 	cpuid_update_tsx(cpu, featureset);
3425 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3426 }
3427 
3428 /*
3429  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3430  */
3431 void
3432 setup_xfem(void)
3433 {
3434 	uint64_t flags = XFEATURE_LEGACY_FP;
3435 
3436 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3437 
3438 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3439 		flags |= XFEATURE_SSE;
3440 
3441 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3442 		flags |= XFEATURE_AVX;
3443 
3444 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3445 		flags |= XFEATURE_AVX512;
3446 
3447 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3448 
3449 	xsave_bv_all = flags;
3450 }
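
/*
 * For instance, on a CPU with SSE and AVX but no AVX-512, flags ends up as
 * XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX, i.e. x87, XMM, and YMM
 * state enabled in XCR0; the AVX-512 opmask and ZMM components are only added
 * when AVX512F was detected. See the XFEATURE_* definitions for the exact
 * architectural bit masks.
 */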
3451 
3452 static void
3453 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3454 {
3455 	struct cpuid_info *cpi;
3456 
3457 	cpi = cpu->cpu_m.mcpu_cpi;
3458 
3459 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3460 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3461 		cpuid_gather_amd_topology_leaves(cpu);
3462 	}
3463 
3464 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3465 
3466 	/*
3467 	 * Before we can calculate the IDs that we should assign to this
3468 	 * processor, we need to understand how many cores and threads it has.
3469 	 */
3470 	switch (cpi->cpi_vendor) {
3471 	case X86_VENDOR_Intel:
3472 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3473 		    &cpi->cpi_ncore_per_chip);
3474 		break;
3475 	case X86_VENDOR_AMD:
3476 	case X86_VENDOR_HYGON:
3477 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3478 		    &cpi->cpi_ncore_per_chip);
3479 		break;
3480 	default:
3481 		/*
3482 		 * If we have some other x86 compatible chip, it's not clear how
3483 		 * it would behave. The most common case is virtualization
3484 		 * today, though there are also 64-bit VIA chips. Assume that
3485 		 * all we can get is the basic Leaf 1 HTT information.
3486 		 */
3487 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3488 			cpi->cpi_ncore_per_chip = 1;
3489 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3490 		}
3491 		break;
3492 	}
3493 
3494 	/*
3495 	 * Based on the calculated number of threads and cores, potentially
3496 	 * assign the HTT and CMT features.
3497 	 */
3498 	if (cpi->cpi_ncore_per_chip > 1) {
3499 		add_x86_feature(featureset, X86FSET_CMP);
3500 	}
3501 
3502 	if (cpi->cpi_ncpu_per_chip > 1 &&
3503 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3504 		add_x86_feature(featureset, X86FSET_HTT);
3505 	}
3506 
3507 	/*
3508 	 * Now that this has been set up, we need to go through and calculate all of
3509 	 * the rest of the parameters that exist. If we think the CPU doesn't
3510 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3511 	 * up information in some way. The most likely case for this is
3512 	 * virtualization where we have a lot of partial topology information.
3513 	 */
3514 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3515 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3516 		/*
3517 		 * This is a single core, single-threaded processor.
3518 		 */
3519 		cpi->cpi_procnodes_per_pkg = 1;
3520 		cpi->cpi_cores_per_compunit = 1;
3521 		cpi->cpi_compunitid = 0;
3522 		cpi->cpi_chipid = -1;
3523 		cpi->cpi_clogid = 0;
3524 		cpi->cpi_coreid = cpu->cpu_id;
3525 		cpi->cpi_pkgcoreid = 0;
3526 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3527 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3528 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3529 		} else {
3530 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3531 		}
3532 	} else {
3533 		switch (cpi->cpi_vendor) {
3534 		case X86_VENDOR_Intel:
3535 			cpuid_intel_getids(cpu, featureset);
3536 			break;
3537 		case X86_VENDOR_AMD:
3538 		case X86_VENDOR_HYGON:
3539 			cpuid_amd_getids(cpu, featureset);
3540 			break;
3541 		default:
3542 			/*
3543 			 * In this case, it's hard to say what we should do.
3544 			 * We're going to model them to the OS as single core
3545 			 * threads. We don't have a good identifier for them, so
3546 			 * we're just going to use the cpu id all on a single
3547 			 * chip.
3548 			 *
3549 			 * This case has historically been different from the
3550 			 * case above where we don't have HTT or CMP. While they
3551 			 * could be combined, we've opted to keep it separate to
3552 			 * minimize the risk of topology changes in weird cases.
3553 			 */
3554 			cpi->cpi_procnodes_per_pkg = 1;
3555 			cpi->cpi_cores_per_compunit = 1;
3556 			cpi->cpi_chipid = 0;
3557 			cpi->cpi_coreid = cpu->cpu_id;
3558 			cpi->cpi_clogid = cpu->cpu_id;
3559 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3560 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3561 			cpi->cpi_compunitid = cpi->cpi_coreid;
3562 			break;
3563 		}
3564 	}
3565 }
3566 
3567 /*
3568  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3569  * always gather leaf 6 if it's supported; however, we only look for features on
3570  * Intel systems as AMD does not currently define any of the features we look
3571  * for below.
3572  */
3573 static void
3574 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3575 {
3576 	struct cpuid_regs *cp;
3577 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3578 
3579 	if (cpi->cpi_maxeax < 6) {
3580 		return;
3581 	}
3582 
3583 	cp = &cpi->cpi_std[6];
3584 	cp->cp_eax = 6;
3585 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3586 	(void) __cpuid_insn(cp);
3587 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3588 
3589 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3590 		return;
3591 	}
3592 
3593 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3594 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3595 	}
3596 
3597 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3598 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3599 	}
3600 }
3601 
3602 /*
3603  * This is used when we discover that we have AVX support in cpuid. This
3604  * proceeds to scan for the rest of the AVX derived features.
3605  */
3606 static void
3607 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3608 {
3609 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3610 
3611 	/*
3612 	 * If we don't have AVX, don't bother with most of this.
3613 	 */
3614 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3615 		return;
3616 
3617 	add_x86_feature(featureset, X86FSET_AVX);
3618 
3619 	/*
3620 	 * Intel says we can't check these without also
3621 	 * checking AVX.
3622 	 */
3623 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3624 		add_x86_feature(featureset, X86FSET_F16C);
3625 
3626 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3627 		add_x86_feature(featureset, X86FSET_FMA);
3628 
3629 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3630 		add_x86_feature(featureset, X86FSET_BMI1);
3631 
3632 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3633 		add_x86_feature(featureset, X86FSET_BMI2);
3634 
3635 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3636 		add_x86_feature(featureset, X86FSET_AVX2);
3637 
3638 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3639 		add_x86_feature(featureset, X86FSET_VAES);
3640 
3641 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3642 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3643 
3644 	/*
3645 	 * The rest of the AVX features require AVX512. Do not check them unless
3646 	 * it is present.
3647 	 */
3648 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3649 		return;
3650 	add_x86_feature(featureset, X86FSET_AVX512F);
3651 
3652 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3653 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3654 
3655 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3656 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3657 
3658 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3659 		add_x86_feature(featureset, X86FSET_AVX512PF);
3660 
3661 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3662 		add_x86_feature(featureset, X86FSET_AVX512ER);
3663 
3664 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3665 		add_x86_feature(featureset, X86FSET_AVX512CD);
3666 
3667 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3668 		add_x86_feature(featureset, X86FSET_AVX512BW);
3669 
3670 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3671 		add_x86_feature(featureset, X86FSET_AVX512VL);
3672 
3673 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3674 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3675 
3676 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3677 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3678 
3679 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3680 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3681 
3682 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3683 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3684 
3685 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3686 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3687 
3688 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3689 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3690 
3691 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3692 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3693 
3694 	/*
3695 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3696 	 * we don't need to.
3697 	 */
3698 	if (cpi->cpi_std[7].cp_eax < 1)
3699 		return;
3700 
3701 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3702 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3703 }
3704 
3705 /*
3706  * PPIN is the protected processor inventory number. On AMD this is an actual
3707  * feature bit. However, on Intel systems we need to read the platform
3708  * information MSR if we're on a specific model.
3709  */
3710 #if !defined(__xpv)
3711 static void
3712 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3713 {
3714 	on_trap_data_t otd;
3715 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3716 
3717 	switch (cpi->cpi_vendor) {
3718 	case X86_VENDOR_AMD:
3719 		/*
3720 		 * This leaf will have already been gathered in the topology
3721 		 * functions.
3722 		 */
3723 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3724 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3725 				add_x86_feature(featureset, X86FSET_PPIN);
3726 			}
3727 		}
3728 		break;
3729 	case X86_VENDOR_Intel:
3730 		if (cpi->cpi_family != 6)
3731 			break;
3732 		switch (cpi->cpi_model) {
3733 		case INTC_MODEL_IVYBRIDGE_XEON:
3734 		case INTC_MODEL_HASWELL_XEON:
3735 		case INTC_MODEL_BROADWELL_XEON:
3736 		case INTC_MODEL_BROADWELL_XEON_D:
3737 		case INTC_MODEL_SKYLAKE_XEON:
3738 		case INTC_MODEL_ICELAKE_XEON:
3739 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3740 				uint64_t value;
3741 
3742 				value = rdmsr(MSR_PLATFORM_INFO);
3743 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3744 					add_x86_feature(featureset,
3745 					    X86FSET_PPIN);
3746 				}
3747 			}
3748 			no_trap();
3749 			break;
3750 		default:
3751 			break;
3752 		}
3753 		break;
3754 	default:
3755 		break;
3756 	}
3757 }
3758 #endif	/* ! __xpv */
3759 
3760 static void
3761 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3762 {
3763 	uchar_t *featureset = (uchar_t *)arg;
3764 
3765 	/*
3766 	 * We don't run on any processor that doesn't have cpuid, and could not
3767 	 * possibly have arrived here.
3768 	 */
3769 	add_x86_feature(featureset, X86FSET_CPUID);
3770 }
3771 
3772 static void
3773 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3774 {
3775 	struct cpuid_info *cpi;
3776 	struct cpuid_regs *cp;
3777 
3778 	/*
3779 	 * We require that virtual/native detection be complete and that PCI
3780 	 * config space access has been set up; at present there is no reliable
3781 	 * way to determine the latter.
3782 	 */
3783 #if !defined(__xpv)
3784 	ASSERT3S(platform_type, !=, -1);
3785 #endif	/* !__xpv */
3786 
3787 	cpi = cpu->cpu_m.mcpu_cpi;
3788 	ASSERT(cpi != NULL);
3789 
3790 	cp = &cpi->cpi_std[0];
3791 	cp->cp_eax = 0;
3792 	cpi->cpi_maxeax = __cpuid_insn(cp);
3793 	{
3794 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3795 		*iptr++ = cp->cp_ebx;
3796 		*iptr++ = cp->cp_edx;
3797 		*iptr++ = cp->cp_ecx;
3798 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3799 	}
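
	/*
	 * For example, on an Intel part the three registers spell out
	 * "GenuineIntel": %ebx holds "Genu", %edx holds "ineI", and %ecx
	 * holds "ntel"; AMD parts similarly yield "AuthenticAMD".
	 */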
3800 
3801 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3802 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3803 
3804 	/*
3805 	 * Limit the range in case of weird hardware
3806 	 */
3807 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3808 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3809 	if (cpi->cpi_maxeax < 1)
3810 		return;
3811 
3812 	cp = &cpi->cpi_std[1];
3813 	cp->cp_eax = 1;
3814 	(void) __cpuid_insn(cp);
3815 
3816 	/*
3817 	 * Extract identifying constants for easy access.
3818 	 */
3819 	cpi->cpi_model = CPI_MODEL(cpi);
3820 	cpi->cpi_family = CPI_FAMILY(cpi);
3821 
3822 	if (cpi->cpi_family == 0xf)
3823 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3824 
3825 	/*
3826 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3827 	 * Intel, and presumably everyone else, uses model == 0xf, as
3828 	 * one would expect (max value means possible overflow).  Sigh.
3829 	 */
3830 
3831 	switch (cpi->cpi_vendor) {
3832 	case X86_VENDOR_Intel:
3833 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3834 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3835 		break;
3836 	case X86_VENDOR_AMD:
3837 		if (CPI_FAMILY(cpi) == 0xf)
3838 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3839 		break;
3840 	case X86_VENDOR_HYGON:
3841 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3842 		break;
3843 	default:
3844 		if (cpi->cpi_model == 0xf)
3845 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3846 		break;
3847 	}
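
	/*
	 * A worked example (synthetic values): an AMD part with base family
	 * 0xf, extended family 0x8, base model 0x1, and extended model 0x7
	 * ends up with cpi_family = 0xf + 0x8 = 0x17 and
	 * cpi_model = 0x1 + (0x7 << 4) = 0x71.
	 */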
3848 
3849 	cpi->cpi_step = CPI_STEP(cpi);
3850 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3851 
3852 	/*
3853 	 * Synthesize chip "revision" and socket type
3854 	 */
3855 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3856 	    cpi->cpi_model, cpi->cpi_step);
3857 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3858 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3859 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3860 	    cpi->cpi_model, cpi->cpi_step);
3861 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3862 	    cpi->cpi_model, cpi->cpi_step);
3863 }
3864 
3865 static void
3866 cpuid_pass_basic(cpu_t *cpu, void *arg)
3867 {
3868 	uchar_t *featureset = (uchar_t *)arg;
3869 	uint32_t mask_ecx, mask_edx;
3870 	struct cpuid_info *cpi;
3871 	struct cpuid_regs *cp;
3872 	int xcpuid;
3873 #if !defined(__xpv)
3874 	extern int idle_cpu_prefer_mwait;
3875 #endif
3876 
3877 	cpi = cpu->cpu_m.mcpu_cpi;
3878 	ASSERT(cpi != NULL);
3879 
3880 	if (cpi->cpi_maxeax < 1)
3881 		return;
3882 
3883 	/*
3884 	 * This was filled during the identification pass.
3885 	 */
3886 	cp = &cpi->cpi_std[1];
3887 
3888 	/*
3889 	 * *default* assumptions:
3890 	 * - believe %edx feature word
3891 	 * - ignore %ecx feature word
3892 	 * - 32-bit virtual and physical addressing
3893 	 */
3894 	mask_edx = 0xffffffff;
3895 	mask_ecx = 0;
3896 
3897 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3898 
3899 	switch (cpi->cpi_vendor) {
3900 	case X86_VENDOR_Intel:
3901 		if (cpi->cpi_family == 5)
3902 			x86_type = X86_TYPE_P5;
3903 		else if (IS_LEGACY_P6(cpi)) {
3904 			x86_type = X86_TYPE_P6;
3905 			pentiumpro_bug4046376 = 1;
3906 			/*
3907 			 * Clear the SEP bit when it was set erroneously
3908 			 */
3909 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3910 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3911 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3912 			x86_type = X86_TYPE_P4;
3913 			/*
3914 			 * We don't currently depend on any of the %ecx
3915 			 * features until Prescott, so we'll only check
3916 			 * this from P4 onwards.  We might want to revisit
3917 			 * that idea later.
3918 			 */
3919 			mask_ecx = 0xffffffff;
3920 		} else if (cpi->cpi_family > 0xf)
3921 			mask_ecx = 0xffffffff;
3922 		/*
3923 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3924 		 * to obtain the monitor linesize.
3925 		 */
3926 		if (cpi->cpi_maxeax < 5)
3927 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3928 		break;
3929 	case X86_VENDOR_IntelClone:
3930 	default:
3931 		break;
3932 	case X86_VENDOR_AMD:
3933 #if defined(OPTERON_ERRATUM_108)
3934 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3935 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3936 			cpi->cpi_model = 0xc;
3937 		} else
3938 #endif
3939 		if (cpi->cpi_family == 5) {
3940 			/*
3941 			 * AMD K5 and K6
3942 			 *
3943 			 * These CPUs have an incomplete implementation
3944 			 * of MCA/MCE which we mask away.
3945 			 */
3946 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3947 
3948 			/*
3949 			 * Model 0 uses the wrong (APIC) bit
3950 			 * to indicate PGE.  Fix it here.
3951 			 */
3952 			if (cpi->cpi_model == 0) {
3953 				if (cp->cp_edx & 0x200) {
3954 					cp->cp_edx &= ~0x200;
3955 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3956 				}
3957 			}
3958 
3959 			/*
3960 			 * Early models had problems w/ MMX; disable.
3961 			 */
3962 			if (cpi->cpi_model < 6)
3963 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3964 		}
3965 
3966 		/*
3967 		 * For newer families, SSE3 and CX16, at least, are valid;
3968 		 * enable all
3969 		 */
3970 		if (cpi->cpi_family >= 0xf)
3971 			mask_ecx = 0xffffffff;
3972 		/*
3973 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3974 		 * to obtain the monitor linesize.
3975 		 */
3976 		if (cpi->cpi_maxeax < 5)
3977 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3978 
3979 #if !defined(__xpv)
3980 		/*
3981 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3982 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3983 		 * know for certain that in at least family 17h, per AMD, mwait
3984 		 * is preferred. Families in-between are less certain.
3985 		 */
3986 		if (cpi->cpi_family < 0x17) {
3987 			idle_cpu_prefer_mwait = 0;
3988 		}
3989 #endif
3990 
3991 		break;
3992 	case X86_VENDOR_HYGON:
3993 		/* Enable all for Hygon Dhyana CPU */
3994 		mask_ecx = 0xffffffff;
3995 		break;
3996 	case X86_VENDOR_TM:
3997 		/*
3998 		 * workaround the NT workaround in CMS 4.1
3999 		 */
4000 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
4001 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
4002 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4003 		break;
4004 	case X86_VENDOR_Centaur:
4005 		/*
4006 		 * work around the NT workarounds again
4007 		 */
4008 		if (cpi->cpi_family == 6)
4009 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4010 		break;
4011 	case X86_VENDOR_Cyrix:
4012 		/*
4013 		 * We rely heavily on the probing in locore
4014 		 * to actually figure out what parts, if any,
4015 		 * of the Cyrix cpuid instruction to believe.
4016 		 */
4017 		switch (x86_type) {
4018 		case X86_TYPE_CYRIX_486:
4019 			mask_edx = 0;
4020 			break;
4021 		case X86_TYPE_CYRIX_6x86:
4022 			mask_edx = 0;
4023 			break;
4024 		case X86_TYPE_CYRIX_6x86L:
4025 			mask_edx =
4026 			    CPUID_INTC_EDX_DE |
4027 			    CPUID_INTC_EDX_CX8;
4028 			break;
4029 		case X86_TYPE_CYRIX_6x86MX:
4030 			mask_edx =
4031 			    CPUID_INTC_EDX_DE |
4032 			    CPUID_INTC_EDX_MSR |
4033 			    CPUID_INTC_EDX_CX8 |
4034 			    CPUID_INTC_EDX_PGE |
4035 			    CPUID_INTC_EDX_CMOV |
4036 			    CPUID_INTC_EDX_MMX;
4037 			break;
4038 		case X86_TYPE_CYRIX_GXm:
4039 			mask_edx =
4040 			    CPUID_INTC_EDX_MSR |
4041 			    CPUID_INTC_EDX_CX8 |
4042 			    CPUID_INTC_EDX_CMOV |
4043 			    CPUID_INTC_EDX_MMX;
4044 			break;
4045 		case X86_TYPE_CYRIX_MediaGX:
4046 			break;
4047 		case X86_TYPE_CYRIX_MII:
4048 		case X86_TYPE_VIA_CYRIX_III:
4049 			mask_edx =
4050 			    CPUID_INTC_EDX_DE |
4051 			    CPUID_INTC_EDX_TSC |
4052 			    CPUID_INTC_EDX_MSR |
4053 			    CPUID_INTC_EDX_CX8 |
4054 			    CPUID_INTC_EDX_PGE |
4055 			    CPUID_INTC_EDX_CMOV |
4056 			    CPUID_INTC_EDX_MMX;
4057 			break;
4058 		default:
4059 			break;
4060 		}
4061 		break;
4062 	}
4063 
4064 #if defined(__xpv)
4065 	/*
4066 	 * Do not support MONITOR/MWAIT under a hypervisor
4067 	 */
4068 	mask_ecx &= ~CPUID_INTC_ECX_MON;
4069 	/*
4070 	 * Do not support XSAVE under a hypervisor for now
4071 	 */
4072 	xsave_force_disable = B_TRUE;
4073 
4074 #endif	/* __xpv */
4075 
4076 	if (xsave_force_disable) {
4077 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4078 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
4079 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
4080 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
4081 	}
4082 
4083 	/*
4084 	 * Now we've figured out the masks that determine
4085 	 * which bits we choose to believe, apply the masks
4086 	 * to the feature words, then map the kernel's view
4087 	 * of these feature words into its feature word.
4088 	 */
4089 	cp->cp_edx &= mask_edx;
4090 	cp->cp_ecx &= mask_ecx;
4091 
4092 	/*
4093 	 * apply any platform restrictions (we don't call this
4094 	 * immediately after __cpuid_insn here, because we need the
4095 	 * workarounds applied above first)
4096 	 */
4097 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4098 
4099 	/*
4100 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4101 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4102 	 * 7 has sub-leaves determined by ecx.
4103 	 */
4104 	if (cpi->cpi_maxeax >= 7) {
4105 		struct cpuid_regs *ecp;
4106 		ecp = &cpi->cpi_std[7];
4107 		ecp->cp_eax = 7;
4108 		ecp->cp_ecx = 0;
4109 		(void) __cpuid_insn(ecp);
4110 
4111 		/*
4112 		 * If XSAVE has been disabled, just ignore all of the
4113 		 * extended-save-area dependent flags here. Removing most of
4114 		 * the leaf 7, sub-leaf 0 flags ensures that we don't end up
4115 		 * looking at additional xsave-dependent leaves right
4116 		 * now.
4117 		 */
4118 		if (xsave_force_disable) {
4119 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4120 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4121 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4122 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4123 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4124 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4125 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4126 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4127 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4128 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4129 		}
4130 
4131 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4132 			add_x86_feature(featureset, X86FSET_SMEP);
4133 
4134 		/*
4135 		 * We check disable_smap here in addition to in startup_smap()
4136 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4137 		 * include it in the feature set and thus generate a mismatched
4138 		 * x86 feature set across CPUs.
4139 		 */
4140 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4141 		    disable_smap == 0)
4142 			add_x86_feature(featureset, X86FSET_SMAP);
4143 
4144 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4145 			add_x86_feature(featureset, X86FSET_RDSEED);
4146 
4147 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4148 			add_x86_feature(featureset, X86FSET_ADX);
4149 
4150 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4151 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4152 
4153 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4154 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4155 
4156 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4157 			add_x86_feature(featureset, X86FSET_INVPCID);
4158 
4159 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4160 			add_x86_feature(featureset, X86FSET_UMIP);
4161 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4162 			add_x86_feature(featureset, X86FSET_PKU);
4163 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4164 			add_x86_feature(featureset, X86FSET_OSPKE);
4165 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4166 			add_x86_feature(featureset, X86FSET_GFNI);
4167 
4168 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4169 			add_x86_feature(featureset, X86FSET_CLWB);
4170 
4171 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4172 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4173 				add_x86_feature(featureset, X86FSET_MPX);
4174 		}
4175 
4176 		/*
4177 		 * If we have subleaf 1 available, grab and store that. This is
4178 		 * used for more AVX and related features.
4179 		 */
4180 		if (ecp->cp_eax >= 1) {
4181 			struct cpuid_regs *c71;
4182 			c71 = &cpi->cpi_sub7[0];
4183 			c71->cp_eax = 7;
4184 			c71->cp_ecx = 1;
4185 			(void) __cpuid_insn(c71);
4186 		}
4187 	}
4188 
4189 	/*
4190 	 * fold in overrides from the "eeprom" mechanism
4191 	 */
4192 	cp->cp_edx |= cpuid_feature_edx_include;
4193 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4194 
4195 	cp->cp_ecx |= cpuid_feature_ecx_include;
4196 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4197 
4198 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4199 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4200 	}
4201 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4202 		add_x86_feature(featureset, X86FSET_TSC);
4203 	}
4204 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4205 		add_x86_feature(featureset, X86FSET_MSR);
4206 	}
4207 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4208 		add_x86_feature(featureset, X86FSET_MTRR);
4209 	}
4210 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4211 		add_x86_feature(featureset, X86FSET_PGE);
4212 	}
4213 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4214 		add_x86_feature(featureset, X86FSET_CMOV);
4215 	}
4216 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4217 		add_x86_feature(featureset, X86FSET_MMX);
4218 	}
4219 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4220 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4221 		add_x86_feature(featureset, X86FSET_MCA);
4222 	}
4223 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4224 		add_x86_feature(featureset, X86FSET_PAE);
4225 	}
4226 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4227 		add_x86_feature(featureset, X86FSET_CX8);
4228 	}
4229 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4230 		add_x86_feature(featureset, X86FSET_CX16);
4231 	}
4232 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4233 		add_x86_feature(featureset, X86FSET_PAT);
4234 	}
4235 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4236 		add_x86_feature(featureset, X86FSET_SEP);
4237 	}
4238 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4239 		/*
4240 		 * In our implementation, fxsave/fxrstor
4241 		 * are prerequisites before we'll even
4242 		 * try and do SSE things.
4243 		 */
4244 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4245 			add_x86_feature(featureset, X86FSET_SSE);
4246 		}
4247 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4248 			add_x86_feature(featureset, X86FSET_SSE2);
4249 		}
4250 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4251 			add_x86_feature(featureset, X86FSET_SSE3);
4252 		}
4253 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4254 			add_x86_feature(featureset, X86FSET_SSSE3);
4255 		}
4256 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4257 			add_x86_feature(featureset, X86FSET_SSE4_1);
4258 		}
4259 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4260 			add_x86_feature(featureset, X86FSET_SSE4_2);
4261 		}
4262 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4263 			add_x86_feature(featureset, X86FSET_AES);
4264 		}
4265 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4266 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4267 		}
4268 
4269 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4270 			add_x86_feature(featureset, X86FSET_SHA);
4271 
4272 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4273 			add_x86_feature(featureset, X86FSET_XSAVE);
4274 
4275 			/* We only test AVX & AVX512 when there is XSAVE */
4276 			cpuid_basic_avx(cpu, featureset);
4277 		}
4278 	}
4279 
4280 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4281 		add_x86_feature(featureset, X86FSET_PCID);
4282 	}
4283 
4284 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4285 		add_x86_feature(featureset, X86FSET_X2APIC);
4286 	}
4287 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4288 		add_x86_feature(featureset, X86FSET_DE);
4289 	}
4290 #if !defined(__xpv)
4291 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4292 
4293 		/*
4294 		 * We require the CLFLUSH instruction for the erratum workaround
4295 		 * needed to use MONITOR/MWAIT.
4296 		 */
4297 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4298 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4299 			add_x86_feature(featureset, X86FSET_MWAIT);
4300 		} else {
4301 			extern int idle_cpu_assert_cflush_monitor;
4302 
4303 			/*
4304 			 * All processors we are aware of which have
4305 			 * MONITOR/MWAIT also have CLFLUSH.
4306 			 */
4307 			if (idle_cpu_assert_cflush_monitor) {
4308 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4309 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4310 			}
4311 		}
4312 	}
4313 #endif	/* __xpv */
4314 
4315 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4316 		add_x86_feature(featureset, X86FSET_VMX);
4317 	}
4318 
4319 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4320 		add_x86_feature(featureset, X86FSET_RDRAND);
4321 
4322 	/*
4323 	 * We only need to capture this once, on the boot CPU; the rest of
4324 	 * the CPUs are expected to report the same value.
4325 	 */
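	/*
	 * As a worked example of the conversion below: leaf 1 %ebx bits 15:8
	 * report the CLFLUSH line size in units of 8 bytes, so a raw value
	 * of 8 there corresponds to a 64-byte x86_clflush_size.
	 */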
4326 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4327 		add_x86_feature(featureset, X86FSET_CLFSH);
4328 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4329 	}
4330 	if (is_x86_feature(featureset, X86FSET_PAE))
4331 		cpi->cpi_pabits = 36;
4332 
4333 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4334 		struct cpuid_regs r, *ecp;
4335 
4336 		ecp = &r;
4337 		ecp->cp_eax = 0xD;
4338 		ecp->cp_ecx = 1;
4339 		ecp->cp_edx = ecp->cp_ebx = 0;
4340 		(void) __cpuid_insn(ecp);
4341 
4342 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4343 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4344 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4345 			add_x86_feature(featureset, X86FSET_XSAVEC);
4346 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4347 			add_x86_feature(featureset, X86FSET_XSAVES);
4348 
4349 		/*
4350 		 * Zen 2 family processors suffer from erratum 1386 that causes
4351 		 * xsaves to not function correctly in some circumstances. There
4352 		 * are no supervisor states in Zen 2 and earlier. Practically
4353 		 * speaking this has no impact for us as we currently do not
4354 		 * leverage compressed xsave formats. To safeguard against
4355 		 * issues in the future where we may opt to use it, we remove
4356 		 * it from the feature set now. While Matisse has a microcode
4357 		 * update available with a fix, not all Zen 2 CPUs do, so it's
4358 		 * simpler for the moment to unconditionally remove it.
4359 		 */
4360 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4361 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4362 			remove_x86_feature(featureset, X86FSET_XSAVES);
4363 		}
4364 	}
4365 
4366 	/*
4367 	 * Work on the "extended" feature information, doing
4368 	 * some basic initialization to be used in the extended pass.
4369 	 */
4370 	xcpuid = 0;
4371 	switch (cpi->cpi_vendor) {
4372 	case X86_VENDOR_Intel:
4373 		/*
4374 		 * On KVM we know we will have proper support for extended
4375 		 * cpuid.
4376 		 */
4377 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4378 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4379 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4380 			xcpuid++;
4381 		break;
4382 	case X86_VENDOR_AMD:
4383 		if (cpi->cpi_family > 5 ||
4384 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4385 			xcpuid++;
4386 		break;
4387 	case X86_VENDOR_Cyrix:
4388 		/*
4389 		 * Only these Cyrix CPUs are -known- to support
4390 		 * extended cpuid operations.
4391 		 */
4392 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4393 		    x86_type == X86_TYPE_CYRIX_GXm)
4394 			xcpuid++;
4395 		break;
4396 	case X86_VENDOR_HYGON:
4397 	case X86_VENDOR_Centaur:
4398 	case X86_VENDOR_TM:
4399 	default:
4400 		xcpuid++;
4401 		break;
4402 	}
4403 
4404 	if (xcpuid) {
4405 		cp = &cpi->cpi_extd[0];
4406 		cp->cp_eax = CPUID_LEAF_EXT_0;
4407 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4408 	}
4409 
4410 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4411 
4412 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4413 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4414 
4415 		switch (cpi->cpi_vendor) {
4416 		case X86_VENDOR_Intel:
4417 		case X86_VENDOR_AMD:
4418 		case X86_VENDOR_HYGON:
4419 			if (cpi->cpi_xmaxeax < 0x80000001)
4420 				break;
4421 			cp = &cpi->cpi_extd[1];
4422 			cp->cp_eax = 0x80000001;
4423 			(void) __cpuid_insn(cp);
4424 
4425 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4426 			    cpi->cpi_family == 5 &&
4427 			    cpi->cpi_model == 6 &&
4428 			    cpi->cpi_step == 6) {
4429 				/*
4430 				 * K6 model 6 uses bit 10 to indicate SYSC
4431 				 * Later models use bit 11. Fix it here.
4432 				 */
4433 				if (cp->cp_edx & 0x400) {
4434 					cp->cp_edx &= ~0x400;
4435 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4436 				}
4437 			}
4438 
4439 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4440 
4441 			/*
4442 			 * Compute the additions to the kernel's feature word.
4443 			 */
4444 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4445 				add_x86_feature(featureset, X86FSET_NX);
4446 			}
4447 
4448 			/*
4449 			 * Regardless of whether or not we boot 64-bit,
4450 			 * we should have a way to identify whether
4451 			 * the CPU is capable of running 64-bit.
4452 			 */
4453 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4454 				add_x86_feature(featureset, X86FSET_64);
4455 			}
4456 
4457 			/* 1 GB large page - enable only for the 64-bit kernel */
4458 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4459 				add_x86_feature(featureset, X86FSET_1GPG);
4460 			}
4461 
4462 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4463 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4464 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4465 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4466 				add_x86_feature(featureset, X86FSET_SSE4A);
4467 			}
4468 
4469 			/*
4470 			 * It's really tricky to support syscall/sysret in
4471 			 * the i386 kernel; we rely on sysenter/sysexit
4472 			 * instead.  In the amd64 kernel, things are -way-
4473 			 * better.
4474 			 */
4475 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4476 				add_x86_feature(featureset, X86FSET_ASYSC);
4477 			}
4478 
4479 			/*
4480 			 * While we're thinking about system calls, note
4481 			 * that AMD processors don't support sysenter
4482 			 * in long mode at all, so don't try to program them.
4483 			 */
4484 			if (x86_vendor == X86_VENDOR_AMD ||
4485 			    x86_vendor == X86_VENDOR_HYGON) {
4486 				remove_x86_feature(featureset, X86FSET_SEP);
4487 			}
4488 
4489 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4490 				add_x86_feature(featureset, X86FSET_TSCP);
4491 			}
4492 
4493 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4494 				add_x86_feature(featureset, X86FSET_SVM);
4495 			}
4496 
4497 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4498 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4499 			}
4500 
4501 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4502 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4503 			}
4504 
4505 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4506 				add_x86_feature(featureset, X86FSET_XOP);
4507 			}
4508 
4509 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4510 				add_x86_feature(featureset, X86FSET_FMA4);
4511 			}
4512 
4513 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4514 				add_x86_feature(featureset, X86FSET_TBM);
4515 			}
4516 
4517 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4518 				add_x86_feature(featureset, X86FSET_MONITORX);
4519 			}
4520 			break;
4521 		default:
4522 			break;
4523 		}
4524 
4525 		/*
4526 		 * Get CPUID data about processor cores and hyperthreads.
4527 		 */
4528 		switch (cpi->cpi_vendor) {
4529 		case X86_VENDOR_Intel:
4530 			if (cpi->cpi_maxeax >= 4) {
4531 				cp = &cpi->cpi_std[4];
4532 				cp->cp_eax = 4;
4533 				cp->cp_ecx = 0;
4534 				(void) __cpuid_insn(cp);
4535 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4536 			}
4537 			/*FALLTHROUGH*/
4538 		case X86_VENDOR_AMD:
4539 		case X86_VENDOR_HYGON:
4540 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4541 				break;
4542 			cp = &cpi->cpi_extd[8];
4543 			cp->cp_eax = CPUID_LEAF_EXT_8;
4544 			(void) __cpuid_insn(cp);
4545 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4546 			    cp);
4547 
4548 			/*
4549 			 * AMD uses ebx for some extended functions.
4550 			 */
4551 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4552 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4553 				/*
4554 				 * While we're here, check for the AMD "Error
4555 				 * Pointer Zero/Restore" feature. This can be
4556 				 * used to set up the FP save handlers
4557 				 * appropriately.
4558 				 */
4559 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4560 					cpi->cpi_fp_amd_save = 0;
4561 				} else {
4562 					cpi->cpi_fp_amd_save = 1;
4563 				}
4564 
4565 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4566 					add_x86_feature(featureset,
4567 					    X86FSET_CLZERO);
4568 				}
4569 			}
4570 
4571 			/*
4572 			 * Virtual and physical address limits from
4573 			 * cpuid override previously guessed values.
4574 			 */
4575 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4576 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4577 			break;
4578 		default:
4579 			break;
4580 		}
4581 
4582 		/*
4583 		 * Get CPUID data about TSC Invariance in Deep C-State.
4584 		 */
4585 		switch (cpi->cpi_vendor) {
4586 		case X86_VENDOR_Intel:
4587 		case X86_VENDOR_AMD:
4588 		case X86_VENDOR_HYGON:
4589 			if (cpi->cpi_maxeax >= 7) {
4590 				cp = &cpi->cpi_extd[7];
4591 				cp->cp_eax = 0x80000007;
4592 				cp->cp_ecx = 0;
4593 				(void) __cpuid_insn(cp);
4594 			}
4595 			break;
4596 		default:
4597 			break;
4598 		}
4599 	}
4600 
4601 	/*
4602 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4603 	 * run and thus gathered some of its dependent leaves.
4604 	 */
4605 	cpuid_basic_topology(cpu, featureset);
4606 	cpuid_basic_thermal(cpu, featureset);
4607 #if !defined(__xpv)
4608 	cpuid_basic_ppin(cpu, featureset);
4609 #endif
4610 
4611 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4612 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4613 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4614 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4615 			/* Special handling for AMD FP not necessary. */
4616 			cpi->cpi_fp_amd_save = 0;
4617 		} else {
4618 			cpi->cpi_fp_amd_save = 1;
4619 		}
4620 	}
4621 
4622 	/*
4623 	 * Check (and potentially set) if lfence is serializing.
4624 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4625 	 */
4626 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4627 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4628 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4629 		/*
4630 		 * The AMD white paper Software Techniques For Managing
4631 		 * Speculation on AMD Processors details circumstances for when
4632 		 * lfence instructions are serializing.
4633 		 *
4634 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4635 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4636 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4637 		 * committed to supporting that MSR on all later CPUs.
4638 		 */
4639 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4640 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4641 		} else if (cpi->cpi_family >= 0x10) {
4642 #if !defined(__xpv)
4643 			uint64_t val;
4644 
4645 			/*
4646 			 * Be careful when attempting to enable the bit, and
4647 			 * verify that it was actually set in case we are
4648 			 * running in a hypervisor which is less than faithful
4649 			 * about its emulation of this feature.
4650 			 */
4651 			on_trap_data_t otd;
4652 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4653 				val = rdmsr(MSR_AMD_DE_CFG);
4654 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4655 				wrmsr(MSR_AMD_DE_CFG, val);
4656 				val = rdmsr(MSR_AMD_DE_CFG);
4657 			} else {
4658 				val = 0;
4659 			}
4660 			no_trap();
4661 
4662 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4663 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4664 			}
4665 #endif
4666 		}
4667 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4668 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4669 		/*
4670 		 * Documentation and other OSes indicate that lfence is always
4671 		 * serializing on Intel CPUs.
4672 		 */
4673 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4674 	}
4675 
4676 
4677 	/*
4678 	 * Check the processor leaves that are used for security features. Grab
4679 	 * any additional processor-specific leaves that we may not have yet.
4680 	 */
4681 	switch (cpi->cpi_vendor) {
4682 	case X86_VENDOR_AMD:
4683 	case X86_VENDOR_HYGON:
4684 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4685 			cp = &cpi->cpi_extd[7];
4686 			cp->cp_eax = CPUID_LEAF_EXT_21;
4687 			cp->cp_ecx = 0;
4688 			(void) __cpuid_insn(cp);
4689 		}
4690 		break;
4691 	default:
4692 		break;
4693 	}
4694 
4695 	cpuid_scan_security(cpu, featureset);
4696 }
4697 
4698 /*
4699  * Make copies of the cpuid table entries we depend on, in
4700  * part for ease of parsing now, in part so that we have only
4701  * one place to correct any of it, in part for ease of
4702  * later export to userland, and in part so we can look at
4703  * this stuff in a crash dump.
4704  */
4705 
4706 static void
4707 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4708 {
4709 	uint_t n, nmax;
4710 	int i;
4711 	struct cpuid_regs *cp;
4712 	uint8_t *dp;
4713 	uint32_t *iptr;
4714 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4715 
4716 	if (cpi->cpi_maxeax < 1)
4717 		return;
4718 
4719 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4720 		nmax = NMAX_CPI_STD;
4721 	/*
4722 	 * (We already handled n == 0 and n == 1 in the basic pass)
4723 	 */
4724 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4725 		/*
4726 		 * leaves 6 and 7 were handled in the basic pass
4727 		 */
4728 		if (n == 6 || n == 7)
4729 			continue;
4730 
4731 		cp->cp_eax = n;
4732 
4733 		/*
4734 		 * CPUID function 4 expects %ecx to be initialized
4735 		 * with an index which indicates which cache to return
4736 		 * information about. The OS is expected to call function 4
4737 		 * with %ecx set to 0, 1, 2, ... until it returns with
4738 		 * EAX[4:0] set to 0, which indicates there are no more
4739 		 * caches.
4740 		 *
4741 		 * Here, populate cpi_std[4] with the information returned by
4742 		 * function 4 when %ecx == 0, and do the rest in a later pass
4743 		 * when dynamic memory allocation becomes available.
4744 		 *
4745 		 * Note: we need to explicitly initialize %ecx here, since
4746 		 * function 4 may have been previously invoked.
4747 		 */
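		/*
		 * A minimal sketch of the enumeration described above (the
		 * full walk happens in a later pass); the local names here
		 * are purely illustrative:
		 *
		 *	struct cpuid_regs regs = { 0 };
		 *	uint32_t idx;
		 *
		 *	for (idx = 0; ; idx++) {
		 *		regs.cp_eax = 4;
		 *		regs.cp_ecx = idx;
		 *		(void) __cpuid_insn(&regs);
		 *		if (BITX(regs.cp_eax, 4, 0) == 0)
		 *			break;
		 *	}
		 */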
4748 		if (n == 4)
4749 			cp->cp_ecx = 0;
4750 
4751 		(void) __cpuid_insn(cp);
4752 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4753 		switch (n) {
4754 		case 2:
4755 			/*
4756 			 * "the lower 8 bits of the %eax register
4757 			 * contain a value that identifies the number
4758 			 * of times the cpuid [instruction] has to be
4759 			 * executed to obtain a complete image of the
4760 			 * processor's caching systems."
4761 			 *
4762 			 * How *do* they make this stuff up?
4763 			 */
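			/*
			 * For example, if leaf 2 returned %eax == 0x665b5001
			 * (a made-up value), the low byte 0x01 is the
			 * iteration count and, since bit 31 is clear, the
			 * descriptor bytes 0x50, 0x5b and 0x66 would be
			 * harvested into cpi_cacheinfo below.
			 */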
4764 			cpi->cpi_ncache = sizeof (*cp) *
4765 			    BITX(cp->cp_eax, 7, 0);
4766 			if (cpi->cpi_ncache == 0)
4767 				break;
4768 			cpi->cpi_ncache--;	/* skip count byte */
4769 
4770 			/*
4771 			 * Well, for now, rather than attempt to implement
4772 			 * this slightly dubious algorithm, we just look
4773 			 * at the first 15 ..
4774 			 */
4775 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4776 				cpi->cpi_ncache = sizeof (*cp) - 1;
4777 
4778 			dp = cpi->cpi_cacheinfo;
4779 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4780 				uint8_t *p = (void *)&cp->cp_eax;
4781 				for (i = 1; i < 4; i++)
4782 					if (p[i] != 0)
4783 						*dp++ = p[i];
4784 			}
4785 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4786 				uint8_t *p = (void *)&cp->cp_ebx;
4787 				for (i = 0; i < 4; i++)
4788 					if (p[i] != 0)
4789 						*dp++ = p[i];
4790 			}
4791 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4792 				uint8_t *p = (void *)&cp->cp_ecx;
4793 				for (i = 0; i < 4; i++)
4794 					if (p[i] != 0)
4795 						*dp++ = p[i];
4796 			}
4797 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4798 				uint8_t *p = (void *)&cp->cp_edx;
4799 				for (i = 0; i < 4; i++)
4800 					if (p[i] != 0)
4801 						*dp++ = p[i];
4802 			}
4803 			break;
4804 
4805 		case 3:	/* Processor serial number, if PSN supported */
4806 			break;
4807 
4808 		case 4:	/* Deterministic cache parameters */
4809 			break;
4810 
4811 		case 5:	/* Monitor/Mwait parameters */
4812 		{
4813 			size_t mwait_size;
4814 
4815 			/*
4816 			 * check cpi_mwait.support which was set in
4817 			 * cpuid_pass_basic()
4818 			 */
4819 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4820 				break;
4821 
4822 			/*
4823 			 * Protect ourselves from an insane mwait line size.
4824 			 * Workaround for incomplete hardware emulator(s).
4825 			 */
4826 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4827 			if (mwait_size < sizeof (uint32_t) ||
4828 			    !ISP2(mwait_size)) {
4829 #if DEBUG
4830 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4831 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4832 #endif
4833 				break;
4834 			}
4835 
4836 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4837 			cpi->cpi_mwait.mon_max = mwait_size;
4838 			if (MWAIT_EXTENSION(cpi)) {
4839 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4840 				if (MWAIT_INT_ENABLE(cpi))
4841 					cpi->cpi_mwait.support |=
4842 					    MWAIT_ECX_INT_ENABLE;
4843 			}
4844 			break;
4845 		}
4846 		default:
4847 			break;
4848 		}
4849 	}
4850 
4851 	/*
4852 	 * XSAVE enumeration
4853 	 */
4854 	if (cpi->cpi_maxeax >= 0xD) {
4855 		struct cpuid_regs regs;
4856 		boolean_t cpuid_d_valid = B_TRUE;
4857 
4858 		cp = &regs;
4859 		cp->cp_eax = 0xD;
4860 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4861 
4862 		(void) __cpuid_insn(cp);
4863 
4864 		/*
4865 		 * Sanity checks for debug
4866 		 */
4867 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4868 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4869 			cpuid_d_valid = B_FALSE;
4870 		}
4871 
4872 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4873 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4874 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4875 
4876 		/*
4877 		 * If the hw supports AVX, get the size and offset in the save
4878 		 * area for the ymm state.
4879 		 */
4880 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4881 			cp->cp_eax = 0xD;
4882 			cp->cp_ecx = 2;
4883 			cp->cp_edx = cp->cp_ebx = 0;
4884 
4885 			(void) __cpuid_insn(cp);
4886 
4887 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4888 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4889 				cpuid_d_valid = B_FALSE;
4890 			}
4891 
4892 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4893 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4894 		}
4895 
4896 		/*
4897 		 * If the hw supports MPX, get the size and offset in the
4898 		 * save area for BNDREGS and BNDCSR.
4899 		 */
4900 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4901 			cp->cp_eax = 0xD;
4902 			cp->cp_ecx = 3;
4903 			cp->cp_edx = cp->cp_ebx = 0;
4904 
4905 			(void) __cpuid_insn(cp);
4906 
4907 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4908 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4909 
4910 			cp->cp_eax = 0xD;
4911 			cp->cp_ecx = 4;
4912 			cp->cp_edx = cp->cp_ebx = 0;
4913 
4914 			(void) __cpuid_insn(cp);
4915 
4916 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4917 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4918 		}
4919 
4920 		/*
4921 		 * If the hw supports AVX512, get the size and offset in the
4922 		 * save area for the opmask registers and zmm state.
4923 		 */
4924 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4925 			cp->cp_eax = 0xD;
4926 			cp->cp_ecx = 5;
4927 			cp->cp_edx = cp->cp_ebx = 0;
4928 
4929 			(void) __cpuid_insn(cp);
4930 
4931 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4932 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4933 
4934 			cp->cp_eax = 0xD;
4935 			cp->cp_ecx = 6;
4936 			cp->cp_edx = cp->cp_ebx = 0;
4937 
4938 			(void) __cpuid_insn(cp);
4939 
4940 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4941 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4942 
4943 			cp->cp_eax = 0xD;
4944 			cp->cp_ecx = 7;
4945 			cp->cp_edx = cp->cp_ebx = 0;
4946 
4947 			(void) __cpuid_insn(cp);
4948 
4949 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4950 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4951 		}
4952 
4953 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4954 			xsave_state_size = 0;
4955 		} else if (cpuid_d_valid) {
4956 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4957 		} else {
4958 			/* Broken CPUID 0xD, probably in HVM */
4959 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4960 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4961 			    ", ymm_size = %d, ymm_offset = %d\n",
4962 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4963 			    cpi->cpi_xsave.xsav_hw_features_high,
4964 			    (int)cpi->cpi_xsave.xsav_max_size,
4965 			    (int)cpi->cpi_xsave.ymm_size,
4966 			    (int)cpi->cpi_xsave.ymm_offset);
4967 
4968 			if (xsave_state_size != 0) {
4969 				/*
4970 				 * This must be a non-boot CPU. We cannot
4971 				 * continue, because the boot cpu has already
4972 				 * enabled XSAVE.
4973 				 */
4974 				ASSERT(cpu->cpu_id != 0);
4975 				cmn_err(CE_PANIC, "cpu%d: we have already "
4976 				    "enabled XSAVE on boot cpu, cannot "
4977 				    "continue.", cpu->cpu_id);
4978 			} else {
4979 				/*
4980 				 * If we reached here on the boot CPU, it's also
4981 				 * almost certain that we'll reach here on the
4982 				 * non-boot CPUs. When we're here on a boot CPU
4983 				 * we should disable the feature; on a non-boot
4984 				 * CPU we need to confirm that we already have.
4985 				 */
4986 				if (cpu->cpu_id == 0) {
4987 					remove_x86_feature(x86_featureset,
4988 					    X86FSET_XSAVE);
4989 					remove_x86_feature(x86_featureset,
4990 					    X86FSET_AVX);
4991 					remove_x86_feature(x86_featureset,
4992 					    X86FSET_F16C);
4993 					remove_x86_feature(x86_featureset,
4994 					    X86FSET_BMI1);
4995 					remove_x86_feature(x86_featureset,
4996 					    X86FSET_BMI2);
4997 					remove_x86_feature(x86_featureset,
4998 					    X86FSET_FMA);
4999 					remove_x86_feature(x86_featureset,
5000 					    X86FSET_AVX2);
5001 					remove_x86_feature(x86_featureset,
5002 					    X86FSET_MPX);
5003 					remove_x86_feature(x86_featureset,
5004 					    X86FSET_AVX512F);
5005 					remove_x86_feature(x86_featureset,
5006 					    X86FSET_AVX512DQ);
5007 					remove_x86_feature(x86_featureset,
5008 					    X86FSET_AVX512PF);
5009 					remove_x86_feature(x86_featureset,
5010 					    X86FSET_AVX512ER);
5011 					remove_x86_feature(x86_featureset,
5012 					    X86FSET_AVX512CD);
5013 					remove_x86_feature(x86_featureset,
5014 					    X86FSET_AVX512BW);
5015 					remove_x86_feature(x86_featureset,
5016 					    X86FSET_AVX512VL);
5017 					remove_x86_feature(x86_featureset,
5018 					    X86FSET_AVX512FMA);
5019 					remove_x86_feature(x86_featureset,
5020 					    X86FSET_AVX512VBMI);
5021 					remove_x86_feature(x86_featureset,
5022 					    X86FSET_AVX512VNNI);
5023 					remove_x86_feature(x86_featureset,
5024 					    X86FSET_AVX512VPOPCDQ);
5025 					remove_x86_feature(x86_featureset,
5026 					    X86FSET_AVX512NNIW);
5027 					remove_x86_feature(x86_featureset,
5028 					    X86FSET_AVX512FMAPS);
5029 					remove_x86_feature(x86_featureset,
5030 					    X86FSET_VAES);
5031 					remove_x86_feature(x86_featureset,
5032 					    X86FSET_VPCLMULQDQ);
5033 					remove_x86_feature(x86_featureset,
5034 					    X86FSET_GFNI);
5035 					remove_x86_feature(x86_featureset,
5036 					    X86FSET_AVX512_VP2INT);
5037 					remove_x86_feature(x86_featureset,
5038 					    X86FSET_AVX512_BITALG);
5039 					remove_x86_feature(x86_featureset,
5040 					    X86FSET_AVX512_VBMI2);
5041 					remove_x86_feature(x86_featureset,
5042 					    X86FSET_AVX512_BF16);
5043 
5044 					xsave_force_disable = B_TRUE;
5045 				} else {
5046 					VERIFY(is_x86_feature(x86_featureset,
5047 					    X86FSET_XSAVE) == B_FALSE);
5048 				}
5049 			}
5050 		}
5051 	}
5052 
5053 
5054 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5055 		return;
5056 
5057 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5058 		nmax = NMAX_CPI_EXTD;
5059 	/*
5060 	 * Copy the extended properties, fixing them as we go. We start at 2
5061 	 * because we've already handled a few cases in the basic pass; the
5062 	 * rest we simply grab again here (e.g. 0x8, 0x21).
5063 	 */
5064 	iptr = (void *)cpi->cpi_brandstr;
5065 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5066 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5067 		(void) __cpuid_insn(cp);
5068 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5069 		    cp);
5070 		switch (n) {
5071 		case 2:
5072 		case 3:
5073 		case 4:
5074 			/*
5075 			 * Extract the brand string
5076 			 */
5077 			*iptr++ = cp->cp_eax;
5078 			*iptr++ = cp->cp_ebx;
5079 			*iptr++ = cp->cp_ecx;
5080 			*iptr++ = cp->cp_edx;
5081 			break;
5082 		case 5:
5083 			switch (cpi->cpi_vendor) {
5084 			case X86_VENDOR_AMD:
5085 				/*
5086 				 * The Athlon and Duron were the first
5087 				 * parts to report the sizes of the
5088 				 * TLB for large pages. Before then,
5089 				 * we don't trust the data.
5090 				 */
5091 				if (cpi->cpi_family < 6 ||
5092 				    (cpi->cpi_family == 6 &&
5093 				    cpi->cpi_model < 1))
5094 					cp->cp_eax = 0;
5095 				break;
5096 			default:
5097 				break;
5098 			}
5099 			break;
5100 		case 6:
5101 			switch (cpi->cpi_vendor) {
5102 			case X86_VENDOR_AMD:
5103 				/*
5104 				 * The Athlon and Duron were the first
5105 				 * AMD parts with L2 TLB's.
5106 				 * Before then, don't trust the data.
5107 				 */
5108 				if (cpi->cpi_family < 6 ||
5109 				    (cpi->cpi_family == 6 &&
5110 				    cpi->cpi_model < 1))
5111 					cp->cp_eax = cp->cp_ebx = 0;
5112 				/*
5113 				 * AMD Duron rev A0 reports L2
5114 				 * cache size incorrectly as 1K
5115 				 * when it is really 64K
5116 				 */
5117 				if (cpi->cpi_family == 6 &&
5118 				    cpi->cpi_model == 3 &&
5119 				    cpi->cpi_step == 0) {
5120 					cp->cp_ecx &= 0xffff;
5121 					cp->cp_ecx |= 0x400000;
5122 				}
5123 				break;
5124 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5125 				/*
5126 				 * VIA C3 processors are a bit messed
5127 				 * up w.r.t. encoding cache sizes in %ecx
5128 				 */
5129 				if (cpi->cpi_family != 6)
5130 					break;
5131 				/*
5132 				 * model 7 and 8 were incorrectly encoded
5133 				 *
5134 				 * xxx is model 8 really broken?
5135 				 */
5136 				if (cpi->cpi_model == 7 ||
5137 				    cpi->cpi_model == 8)
5138 					cp->cp_ecx =
5139 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5140 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5141 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5142 					    BITX(cp->cp_ecx, 7, 0);
5143 				/*
5144 				 * model 9 stepping 1 has wrong associativity
5145 				 */
5146 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5147 					cp->cp_ecx |= 8 << 12;
5148 				break;
5149 			case X86_VENDOR_Intel:
5150 				/*
5151 				 * Extended L2 Cache features function.
5152 				 * First appeared on Prescott.
5153 				 */
5154 			default:
5155 				break;
5156 			}
5157 			break;
5158 		default:
5159 			break;
5160 		}
5161 	}
5162 }
5163 
5164 static const char *
5165 intel_cpubrand(const struct cpuid_info *cpi)
5166 {
5167 	int i;
5168 
5169 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5170 
5171 	switch (cpi->cpi_family) {
5172 	case 5:
5173 		return ("Intel Pentium(r)");
5174 	case 6:
5175 		switch (cpi->cpi_model) {
5176 			uint_t celeron, xeon;
5177 			const struct cpuid_regs *cp;
5178 		case 0:
5179 		case 1:
5180 		case 2:
5181 			return ("Intel Pentium(r) Pro");
5182 		case 3:
5183 		case 4:
5184 			return ("Intel Pentium(r) II");
5185 		case 6:
5186 			return ("Intel Celeron(r)");
5187 		case 5:
5188 		case 7:
5189 			celeron = xeon = 0;
5190 			cp = &cpi->cpi_std[2];	/* cache info */
5191 
5192 			for (i = 1; i < 4; i++) {
5193 				uint_t tmp;
5194 
5195 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5196 				if (tmp == 0x40)
5197 					celeron++;
5198 				if (tmp >= 0x44 && tmp <= 0x45)
5199 					xeon++;
5200 			}
5201 
5202 			for (i = 0; i < 2; i++) {
5203 				uint_t tmp;
5204 
5205 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5206 				if (tmp == 0x40)
5207 					celeron++;
5208 				else if (tmp >= 0x44 && tmp <= 0x45)
5209 					xeon++;
5210 			}
5211 
5212 			for (i = 0; i < 4; i++) {
5213 				uint_t tmp;
5214 
5215 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5216 				if (tmp == 0x40)
5217 					celeron++;
5218 				else if (tmp >= 0x44 && tmp <= 0x45)
5219 					xeon++;
5220 			}
5221 
5222 			for (i = 0; i < 4; i++) {
5223 				uint_t tmp;
5224 
5225 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5226 				if (tmp == 0x40)
5227 					celeron++;
5228 				else if (tmp >= 0x44 && tmp <= 0x45)
5229 					xeon++;
5230 			}
5231 
5232 			if (celeron)
5233 				return ("Intel Celeron(r)");
5234 			if (xeon)
5235 				return (cpi->cpi_model == 5 ?
5236 				    "Intel Pentium(r) II Xeon(tm)" :
5237 				    "Intel Pentium(r) III Xeon(tm)");
5238 			return (cpi->cpi_model == 5 ?
5239 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5240 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5241 		default:
5242 			break;
5243 		}
5244 	default:
5245 		break;
5246 	}
5247 
5248 	/* BrandID is present if the field is nonzero */
5249 	if (cpi->cpi_brandid != 0) {
5250 		static const struct {
5251 			uint_t bt_bid;
5252 			const char *bt_str;
5253 		} brand_tbl[] = {
5254 			{ 0x1,	"Intel(r) Celeron(r)" },
5255 			{ 0x2,	"Intel(r) Pentium(r) III" },
5256 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5257 			{ 0x4,	"Intel(r) Pentium(r) III" },
5258 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5259 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5260 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5261 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5262 			{ 0xa,	"Intel(r) Celeron(r)" },
5263 			{ 0xb,	"Intel(r) Xeon(tm)" },
5264 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5265 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5266 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5267 			{ 0x11, "Mobile Genuine Intel(r)" },
5268 			{ 0x12, "Intel(r) Celeron(r) M" },
5269 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5270 			{ 0x14, "Intel(r) Celeron(r)" },
5271 			{ 0x15, "Mobile Genuine Intel(r)" },
5272 			{ 0x16,	"Intel(r) Pentium(r) M" },
5273 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5274 		};
5275 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5276 		uint_t sgn;
5277 
5278 		sgn = (cpi->cpi_family << 8) |
5279 		    (cpi->cpi_model << 4) | cpi->cpi_step;
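		/*
		 * The signature packs family, model and stepping nibble-wise;
		 * for example family 6, model 0xb, stepping 1 yields 0x6b1,
		 * one of the special cases tested below.
		 */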
5280 
5281 		for (i = 0; i < btblmax; i++)
5282 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5283 				break;
5284 		if (i < btblmax) {
5285 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5286 				return ("Intel(r) Celeron(r)");
5287 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5288 				return ("Intel(r) Xeon(tm) MP");
5289 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5290 				return ("Intel(r) Xeon(tm)");
5291 			return (brand_tbl[i].bt_str);
5292 		}
5293 	}
5294 
5295 	return (NULL);
5296 }
5297 
5298 static const char *
5299 amd_cpubrand(const struct cpuid_info *cpi)
5300 {
5301 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5302 
5303 	switch (cpi->cpi_family) {
5304 	case 5:
5305 		switch (cpi->cpi_model) {
5306 		case 0:
5307 		case 1:
5308 		case 2:
5309 		case 3:
5310 		case 4:
5311 		case 5:
5312 			return ("AMD-K5(r)");
5313 		case 6:
5314 		case 7:
5315 			return ("AMD-K6(r)");
5316 		case 8:
5317 			return ("AMD-K6(r)-2");
5318 		case 9:
5319 			return ("AMD-K6(r)-III");
5320 		default:
5321 			return ("AMD (family 5)");
5322 		}
5323 	case 6:
5324 		switch (cpi->cpi_model) {
5325 		case 1:
5326 			return ("AMD-K7(tm)");
5327 		case 0:
5328 		case 2:
5329 		case 4:
5330 			return ("AMD Athlon(tm)");
5331 		case 3:
5332 		case 7:
5333 			return ("AMD Duron(tm)");
5334 		case 6:
5335 		case 8:
5336 		case 10:
5337 			/*
5338 			 * Use the L2 cache size to distinguish
5339 			 */
5340 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5341 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5342 		default:
5343 			return ("AMD (family 6)");
5344 		}
5345 	default:
5346 		break;
5347 	}
5348 
5349 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5350 	    cpi->cpi_brandid != 0) {
5351 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5352 		case 3:
5353 			return ("AMD Opteron(tm) UP 1xx");
5354 		case 4:
5355 			return ("AMD Opteron(tm) DP 2xx");
5356 		case 5:
5357 			return ("AMD Opteron(tm) MP 8xx");
5358 		default:
5359 			return ("AMD Opteron(tm)");
5360 		}
5361 	}
5362 
5363 	return (NULL);
5364 }
5365 
5366 static const char *
5367 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5368 {
5369 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5370 
5371 	switch (type) {
5372 	case X86_TYPE_CYRIX_6x86:
5373 		return ("Cyrix 6x86");
5374 	case X86_TYPE_CYRIX_6x86L:
5375 		return ("Cyrix 6x86L");
5376 	case X86_TYPE_CYRIX_6x86MX:
5377 		return ("Cyrix 6x86MX");
5378 	case X86_TYPE_CYRIX_GXm:
5379 		return ("Cyrix GXm");
5380 	case X86_TYPE_CYRIX_MediaGX:
5381 		return ("Cyrix MediaGX");
5382 	case X86_TYPE_CYRIX_MII:
5383 		return ("Cyrix M2");
5384 	case X86_TYPE_VIA_CYRIX_III:
5385 		return ("VIA Cyrix M3");
5386 	default:
5387 		/*
5388 		 * Have another wild guess ..
5389 		 */
5390 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5391 			return ("Cyrix 5x86");
5392 		else if (cpi->cpi_family == 5) {
5393 			switch (cpi->cpi_model) {
5394 			case 2:
5395 				return ("Cyrix 6x86");	/* Cyrix M1 */
5396 			case 4:
5397 				return ("Cyrix MediaGX");
5398 			default:
5399 				break;
5400 			}
5401 		} else if (cpi->cpi_family == 6) {
5402 			switch (cpi->cpi_model) {
5403 			case 0:
5404 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5405 			case 5:
5406 			case 6:
5407 			case 7:
5408 			case 8:
5409 			case 9:
5410 				return ("VIA C3");
5411 			default:
5412 				break;
5413 			}
5414 		}
5415 		break;
5416 	}
5417 	return (NULL);
5418 }
5419 
5420 /*
5421  * This only gets called when the CPU extended feature brand string
5422  * leaves (0x80000002, 0x80000003, 0x80000004) aren't available, or
5423  * contain null bytes for some reason.
5424  */
5425 static void
5426 fabricate_brandstr(struct cpuid_info *cpi)
5427 {
5428 	const char *brand = NULL;
5429 
5430 	switch (cpi->cpi_vendor) {
5431 	case X86_VENDOR_Intel:
5432 		brand = intel_cpubrand(cpi);
5433 		break;
5434 	case X86_VENDOR_AMD:
5435 		brand = amd_cpubrand(cpi);
5436 		break;
5437 	case X86_VENDOR_Cyrix:
5438 		brand = cyrix_cpubrand(cpi, x86_type);
5439 		break;
5440 	case X86_VENDOR_NexGen:
5441 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5442 			brand = "NexGen Nx586";
5443 		break;
5444 	case X86_VENDOR_Centaur:
5445 		if (cpi->cpi_family == 5)
5446 			switch (cpi->cpi_model) {
5447 			case 4:
5448 				brand = "Centaur C6";
5449 				break;
5450 			case 8:
5451 				brand = "Centaur C2";
5452 				break;
5453 			case 9:
5454 				brand = "Centaur C3";
5455 				break;
5456 			default:
5457 				break;
5458 			}
5459 		break;
5460 	case X86_VENDOR_Rise:
5461 		if (cpi->cpi_family == 5 &&
5462 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5463 			brand = "Rise mP6";
5464 		break;
5465 	case X86_VENDOR_SiS:
5466 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5467 			brand = "SiS 55x";
5468 		break;
5469 	case X86_VENDOR_TM:
5470 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5471 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5472 		break;
5473 	case X86_VENDOR_NSC:
5474 	case X86_VENDOR_UMC:
5475 	default:
5476 		break;
5477 	}
5478 	if (brand) {
5479 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5480 		return;
5481 	}
5482 
5483 	/*
5484 	 * If all else fails ...
5485 	 */
5486 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5487 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5488 	    cpi->cpi_model, cpi->cpi_step);
5489 }
5490 
5491 /*
5492  * This routine is called just after kernel memory allocation
5493  * becomes available on cpu0, and as part of mp_startup() on
5494  * the other cpus.
5495  *
5496  * Fixup the brand string, and collect any information from cpuid
5497  * that requires dynamically allocated storage to represent.
5498  */
5499 
5500 static void
5501 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5502 {
5503 	int	i, max, shft, level, size;
5504 	struct cpuid_regs regs;
5505 	struct cpuid_regs *cp;
5506 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5507 
5508 	/*
5509 	 * Deterministic cache parameters
5510 	 *
5511 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5512 	 * values that are present are currently defined to be the same. This
5513 	 * means we can use the same logic to parse it as long as we use the
5514 	 * appropriate leaf to get the data. If you're updating this, make sure
5515 	 * you're careful about which vendor supports which aspect.
5516 	 *
5517 	 * Take this opportunity to detect the number of threads sharing the
5518 	 * last level cache, and construct a corresponding cache id. The
5519 	 * respective cpuid_info members are initialized to the default case of
5520 	 * "no last level cache sharing".
5521 	 */
5522 	cpi->cpi_ncpu_shr_last_cache = 1;
5523 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5524 
5525 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5526 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5527 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5528 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5529 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5530 		uint32_t leaf;
5531 
5532 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5533 			leaf = 4;
5534 		} else {
5535 			leaf = CPUID_LEAF_EXT_1d;
5536 		}
5537 
5538 		/*
5539 		 * Find the # of elements (size) returned by the leaf and along
5540 		 * the way detect last level cache sharing details.
5541 		 */
5542 		bzero(&regs, sizeof (regs));
5543 		cp = &regs;
5544 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5545 			cp->cp_eax = leaf;
5546 			cp->cp_ecx = i;
5547 
5548 			(void) __cpuid_insn(cp);
5549 
5550 			if (CPI_CACHE_TYPE(cp) == 0)
5551 				break;
5552 			level = CPI_CACHE_LVL(cp);
5553 			if (level > max) {
5554 				max = level;
5555 				cpi->cpi_ncpu_shr_last_cache =
5556 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5557 			}
5558 		}
5559 		cpi->cpi_cache_leaf_size = size = i;
5560 
5561 		/*
5562 		 * Allocate the cpi_cache_leaves array. The first element
5563 		 * references the regs for the corresponding leaf with %ecx set
5564 		 * to 0. This was gathered in cpuid_pass_extended().
5565 		 */
5566 		if (size > 0) {
5567 			cpi->cpi_cache_leaves =
5568 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5569 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5570 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5571 			} else {
5572 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5573 			}
5574 
5575 			/*
5576 			 * Allocate storage to hold the additional regs
5577 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5578 			 *
5579 			 * The regs for the leaf, %ecx == 0 has already
5580 			 * been allocated as indicated above.
5581 			 */
5582 			for (i = 1; i < size; i++) {
5583 				cp = cpi->cpi_cache_leaves[i] =
5584 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5585 				cp->cp_eax = leaf;
5586 				cp->cp_ecx = i;
5587 
5588 				(void) __cpuid_insn(cp);
5589 			}
5590 		}
5591 		/*
5592 		 * Determine the number of bits needed to represent
5593 		 * the number of CPUs sharing the last level cache.
5594 		 *
5595 		 * Shift off that number of bits from the APIC id to
5596 		 * derive the cache id.
5597 		 */
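		/*
		 * For example, if six CPUs share the last level cache, the
		 * loop below rounds up to the next power of two, shft becomes
		 * 3, and the cache id is the APIC id with its low three bits
		 * shifted off.
		 */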
5598 		shft = 0;
5599 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5600 			shft++;
5601 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5602 	}
5603 
5604 	/*
5605 	 * Now fixup the brand string
5606 	 */
5607 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5608 		fabricate_brandstr(cpi);
5609 	} else {
5610 
5611 		/*
5612 		 * If we successfully extracted a brand string from the cpuid
5613 		 * instruction, clean it up by removing leading spaces and
5614 		 * similar junk.
5615 		 */
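		/*
		 * For example, a raw brand string of "  Genuine Intel(R) CPU"
		 * would come out of the cleanup below as "Intel(r) CPU".
		 */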
5616 		if (cpi->cpi_brandstr[0]) {
5617 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5618 			char *src, *dst;
5619 
5620 			dst = src = (char *)cpi->cpi_brandstr;
5621 			src[maxlen - 1] = '\0';
5622 			/*
5623 			 * strip leading spaces
5624 			 */
5625 			while (*src == ' ')
5626 				src++;
5627 			/*
5628 			 * Remove any "Genuine" or "Authentic" prefixes
5629 			 */
5630 			if (strncmp(src, "Genuine ", 8) == 0)
5631 				src += 8;
5632 			if (strncmp(src, "Authentic ", 10) == 0)
5633 				src += 10;
5634 
5635 			/*
5636 			 * Now do an in-place copy.
5637 			 * Map (R) to (r) and (TM) to (tm).
5638 			 * The era of teletypes is long gone, and there's
5639 			 * -really- no need to shout.
5640 			 */
5641 			while (*src != '\0') {
5642 				if (src[0] == '(') {
5643 					if (strncmp(src + 1, "R)", 2) == 0) {
5644 						(void) strncpy(dst, "(r)", 3);
5645 						src += 3;
5646 						dst += 3;
5647 						continue;
5648 					}
5649 					if (strncmp(src + 1, "TM)", 3) == 0) {
5650 						(void) strncpy(dst, "(tm)", 4);
5651 						src += 4;
5652 						dst += 4;
5653 						continue;
5654 					}
5655 				}
5656 				*dst++ = *src++;
5657 			}
5658 			*dst = '\0';
5659 
5660 			/*
5661 			 * Finally, remove any trailing spaces
5662 			 */
5663 			while (--dst > cpi->cpi_brandstr)
5664 				if (*dst == ' ')
5665 					*dst = '\0';
5666 				else
5667 					break;
5668 		} else
5669 			fabricate_brandstr(cpi);
5670 	}
5671 }
5672 
5673 typedef struct {
5674 	uint32_t avm_av;
5675 	uint32_t avm_feat;
5676 } av_feat_map_t;
5677 
5678 /*
5679  * These arrays are used to map features that we should add based on x86
5680  * features that are present. As a large number depend on kernel features,
5681  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5682  * There is an array of these for each hwcap word. Some features aren't tracked
5683  * in the kernel x86 featureset and that's ok. They will not show up in here.
5684  */
5685 static const av_feat_map_t x86fset_to_av1[] = {
5686 	{ AV_386_CX8, X86FSET_CX8 },
5687 	{ AV_386_SEP, X86FSET_SEP },
5688 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5689 	{ AV_386_CMOV, X86FSET_CMOV },
5690 	{ AV_386_FXSR, X86FSET_SSE },
5691 	{ AV_386_SSE, X86FSET_SSE },
5692 	{ AV_386_SSE2, X86FSET_SSE2 },
5693 	{ AV_386_SSE3, X86FSET_SSE3 },
5694 	{ AV_386_CX16, X86FSET_CX16 },
5695 	{ AV_386_TSCP, X86FSET_TSCP },
5696 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5697 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5698 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5699 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5700 	{ AV_386_AES, X86FSET_AES },
5701 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5702 	{ AV_386_XSAVE, X86FSET_XSAVE },
5703 	{ AV_386_AVX, X86FSET_AVX },
5704 	{ AV_386_VMX, X86FSET_VMX },
5705 	{ AV_386_AMD_SVM, X86FSET_SVM }
5706 };
5707 
5708 static const av_feat_map_t x86fset_to_av2[] = {
5709 	{ AV_386_2_F16C, X86FSET_F16C },
5710 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5711 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5712 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5713 	{ AV_386_2_FMA, X86FSET_FMA },
5714 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5715 	{ AV_386_2_ADX, X86FSET_ADX },
5716 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5717 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5718 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5719 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5720 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5721 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5722 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5723 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5724 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5725 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5726 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5727 	{ AV_386_2_SHA, X86FSET_SHA },
5728 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5729 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5730 	{ AV_386_2_CLWB, X86FSET_CLWB },
5731 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5732 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5733 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5734 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5735 	{ AV_386_2_VAES, X86FSET_VAES },
5736 	{ AV_386_2_GFNI, X86FSET_GFNI },
5737 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5738 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5739 };
5740 
5741 static const av_feat_map_t x86fset_to_av3[] = {
5742 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5743 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5744 };
5745 
5746 /*
5747  * This routine is called out of bind_hwcap() much later in the life
5748  * of the kernel (post_startup()).  The job of this routine is to resolve
5749  * the hardware feature support and kernel support for those features into
5750  * what we're actually going to tell applications via the aux vector.
5751  *
5752  * Most of the aux vector is derived from the x86_featureset array, where
5753  * a given feature indicates that an aux vector flag should be plumbed through. This
5754  * allows the kernel to use one tracking mechanism for these based on whether or
5755  * not it has the required hardware support (most often xsave). Most newer
5756  * features are added there in case we need them in the kernel. Otherwise,
5757  * features are evaluated based on looking at the cpuid features that remain. If
5758  * you find yourself wanting to clear out cpuid features for some reason, they
5759  * should instead be driven by the feature set so we have a consistent view.
5760  */
5761 
5762 static void
5763 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5764 {
5765 	uint_t *hwcap_out = (uint_t *)arg;
5766 	struct cpuid_info *cpi;
5767 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5768 
5769 	cpi = cpu->cpu_m.mcpu_cpi;
5770 
5771 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5772 		if (is_x86_feature(x86_featureset,
5773 		    x86fset_to_av1[i].avm_feat)) {
5774 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5775 		}
5776 	}
5777 
5778 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5779 		if (is_x86_feature(x86_featureset,
5780 		    x86fset_to_av2[i].avm_feat)) {
5781 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5782 		}
5783 	}
5784 
5785 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5786 		if (is_x86_feature(x86_featureset,
5787 		    x86fset_to_av3[i].avm_feat)) {
5788 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5789 		}
5790 	}
5791 
5792 	/*
5793 	 * From here on out we're working through features that don't have
5794 	 * corresponding kernel feature flags for various reasons that are
5795 	 * mostly just due to the historical implementation.
5796 	 */
5797 	if (cpi->cpi_maxeax >= 1) {
5798 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5799 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5800 
5801 		*edx = CPI_FEATURES_EDX(cpi);
5802 		*ecx = CPI_FEATURES_ECX(cpi);
5803 
5804 		/*
5805 		 * [no explicit support required beyond x87 fp context]
5806 		 */
5807 		if (!fpu_exists)
5808 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5809 
5810 		/*
5811 		 * Now map the supported feature vector to things that we
5812 		 * think userland will care about.
5813 		 */
5814 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5815 			hwcap_flags |= AV_386_MOVBE;
5816 
5817 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5818 			hwcap_flags |= AV_386_POPCNT;
5819 		if (*edx & CPUID_INTC_EDX_FPU)
5820 			hwcap_flags |= AV_386_FPU;
5821 		if (*edx & CPUID_INTC_EDX_MMX)
5822 			hwcap_flags |= AV_386_MMX;
5823 		if (*edx & CPUID_INTC_EDX_TSC)
5824 			hwcap_flags |= AV_386_TSC;
5825 	}
5826 
5827 	/*
5828 	 * Check a few miscellaneous features.
5829 	 */
5830 	if (cpi->cpi_xmaxeax < 0x80000001)
5831 		goto resolve_done;
5832 
5833 	switch (cpi->cpi_vendor) {
5834 		uint32_t *edx, *ecx;
5835 
5836 	case X86_VENDOR_Intel:
5837 		/*
5838 		 * Seems like Intel duplicated what was necessary
5839 		 * here to make the initial crop of 64-bit OS's work.
5840 		 * Hopefully, those are the only "extended" bits
5841 		 * they'll add.
5842 		 */
5843 		/*FALLTHROUGH*/
5844 
5845 	case X86_VENDOR_AMD:
5846 	case X86_VENDOR_HYGON:
5847 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5848 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5849 
5850 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5851 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5852 
5853 		/*
5854 		 * [no explicit support required beyond
5855 		 * x87 fp context and exception handlers]
5856 		 */
5857 		if (!fpu_exists)
5858 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5859 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5860 
5861 		/*
5862 		 * Now map the supported feature vector to
5863 		 * things that we think userland will care about.
5864 		 */
5865 		if (*edx & CPUID_AMD_EDX_MMXamd)
5866 			hwcap_flags |= AV_386_AMD_MMX;
5867 		if (*edx & CPUID_AMD_EDX_3DNow)
5868 			hwcap_flags |= AV_386_AMD_3DNow;
5869 		if (*edx & CPUID_AMD_EDX_3DNowx)
5870 			hwcap_flags |= AV_386_AMD_3DNowx;
5871 
5872 		switch (cpi->cpi_vendor) {
5873 		case X86_VENDOR_AMD:
5874 		case X86_VENDOR_HYGON:
5875 			if (*ecx & CPUID_AMD_ECX_AHF64)
5876 				hwcap_flags |= AV_386_AHF;
5877 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5878 				hwcap_flags |= AV_386_AMD_LZCNT;
5879 			break;
5880 
5881 		case X86_VENDOR_Intel:
5882 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5883 				hwcap_flags |= AV_386_AMD_LZCNT;
5884 			/*
5885 			 * Aarrgh.
5886 			 * Intel uses a different bit in the same word.
5887 			 */
5888 			if (*ecx & CPUID_INTC_ECX_AHF64)
5889 				hwcap_flags |= AV_386_AHF;
5890 			break;
5891 		default:
5892 			break;
5893 		}
5894 		break;
5895 
5896 	default:
5897 		break;
5898 	}
5899 
5900 resolve_done:
5901 	if (hwcap_out != NULL) {
5902 		hwcap_out[0] = hwcap_flags;
5903 		hwcap_out[1] = hwcap_flags_2;
5904 		hwcap_out[2] = hwcap_flags_3;
5905 	}
5906 }
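
/*
 * Illustrative sketch only (not kernel code): the flags assembled by
 * cpuid_pass_resolve() are what userland ultimately sees in the aux
 * vector, typically via getisax(3C).  A hypothetical consumer might do:
 *
 *	uint32_t hw[3];
 *
 *	(void) getisax(hw, 3);
 *	if (hw[1] & AV_386_2_SHA)
 *		use_sha_extensions();
 *
 * use_sha_extensions() is an illustrative name; hw[0], hw[1] and hw[2]
 * correspond to hwcap_out[0], hwcap_out[1] and hwcap_out[2] above.
 */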
5907 
5908 
5909 /*
5910  * Simulate the cpuid instruction using the data we previously
5911  * captured about this CPU.  We try our best to return the truth
5912  * about the hardware, independently of kernel support.
5913  */
5914 uint32_t
5915 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5916 {
5917 	struct cpuid_info *cpi;
5918 	struct cpuid_regs *xcp;
5919 
5920 	if (cpu == NULL)
5921 		cpu = CPU;
5922 	cpi = cpu->cpu_m.mcpu_cpi;
5923 
5924 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5925 
5926 	/*
5927 	 * CPUID data is cached in two separate places: cpi_std for standard
5928 	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5929 	 */
5930 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5931 		xcp = &cpi->cpi_std[cp->cp_eax];
5932 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5933 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5934 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5935 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5936 	} else {
5937 		/*
5938 		 * The caller is asking for data from an input parameter which
5939 		 * the kernel has not cached.  In this case we go fetch from
5940 		 * the hardware and return the data directly to the user.
5941 		 */
5942 		return (__cpuid_insn(cp));
5943 	}
5944 
5945 	cp->cp_eax = xcp->cp_eax;
5946 	cp->cp_ebx = xcp->cp_ebx;
5947 	cp->cp_ecx = xcp->cp_ecx;
5948 	cp->cp_edx = xcp->cp_edx;
5949 	return (cp->cp_eax);
5950 }
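
/*
 * Example usage (sketch, mirroring cpuid_arat_supported() later in this
 * file): a caller primes cp_eax with the desired leaf and lets
 * cpuid_insn() either copy the cached values or fall through to the
 * hardware:
 *
 *	struct cpuid_regs regs;
 *
 *	regs.cp_eax = 6;
 *	(void) cpuid_insn(NULL, &regs);
 *	if (regs.cp_eax & CPUID_INTC_EAX_ARAT)
 *		...
 *
 * For leaves that are not cached, cp_ecx should also be set when the
 * leaf takes a subleaf index, since such requests go straight to
 * __cpuid_insn().
 */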
5951 
5952 boolean_t
5953 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5954 {
5955 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5956 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5957 }
5958 
5959 int
5960 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5961 {
5962 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5963 
5964 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5965 }
5966 
5967 int
5968 cpuid_is_cmt(cpu_t *cpu)
5969 {
5970 	if (cpu == NULL)
5971 		cpu = CPU;
5972 
5973 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5974 
5975 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5976 }
5977 
5978 /*
5979  * AMD and Intel both implement the 64-bit variant of the syscall
5980  * instruction (syscallq), so if there's -any- support for syscall,
5981  * cpuid currently says "yes, we support this".
5982  *
5983  * However, Intel decided to -not- implement the 32-bit variant of the
5984  * syscall instruction, so we provide a predicate to allow our caller
5985  * to test that subtlety here.
5986  *
5987  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5988  *	even in the case where the hardware would in fact support it.
5989  */
5990 /*ARGSUSED*/
5991 int
5992 cpuid_syscall32_insn(cpu_t *cpu)
5993 {
5994 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5995 
5996 #if !defined(__xpv)
5997 	if (cpu == NULL)
5998 		cpu = CPU;
5999 
6000 	/*CSTYLED*/
6001 	{
6002 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6003 
6004 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
6005 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
6006 		    cpi->cpi_xmaxeax >= 0x80000001 &&
6007 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
6008 			return (1);
6009 	}
6010 #endif
6011 	return (0);
6012 }
6013 
6014 int
6015 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
6016 {
6017 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6018 
6019 	static const char fmt[] =
6020 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
6021 	static const char fmt_ht[] =
6022 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6023 
6024 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6025 
6026 	if (cpuid_is_cmt(cpu))
6027 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6028 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6029 		    cpi->cpi_family, cpi->cpi_model,
6030 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6031 	return (snprintf(s, n, fmt,
6032 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6033 	    cpi->cpi_family, cpi->cpi_model,
6034 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6035 }
6036 
6037 const char *
6038 cpuid_getvendorstr(cpu_t *cpu)
6039 {
6040 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6041 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6042 }
6043 
6044 uint_t
6045 cpuid_getvendor(cpu_t *cpu)
6046 {
6047 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6048 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6049 }
6050 
6051 uint_t
6052 cpuid_getfamily(cpu_t *cpu)
6053 {
6054 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6055 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
6056 }
6057 
6058 uint_t
6059 cpuid_getmodel(cpu_t *cpu)
6060 {
6061 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6062 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
6063 }
6064 
6065 uint_t
6066 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6067 {
6068 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6069 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6070 }
6071 
6072 uint_t
6073 cpuid_get_ncore_per_chip(cpu_t *cpu)
6074 {
6075 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6076 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6077 }
6078 
6079 uint_t
6080 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6081 {
6082 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6083 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6084 }
6085 
6086 id_t
6087 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6088 {
6089 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6090 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6091 }
6092 
6093 uint_t
6094 cpuid_getstep(cpu_t *cpu)
6095 {
6096 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6097 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
6098 }
6099 
6100 uint_t
6101 cpuid_getsig(struct cpu *cpu)
6102 {
6103 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6104 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6105 }
6106 
6107 uint32_t
6108 cpuid_getchiprev(struct cpu *cpu)
6109 {
6110 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6111 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6112 }
6113 
6114 const char *
6115 cpuid_getchiprevstr(struct cpu *cpu)
6116 {
6117 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6118 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6119 }
6120 
6121 uint32_t
6122 cpuid_getsockettype(struct cpu *cpu)
6123 {
6124 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6125 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6126 }
6127 
6128 const char *
6129 cpuid_getsocketstr(cpu_t *cpu)
6130 {
6131 	static const char *socketstr = NULL;
6132 	struct cpuid_info *cpi;
6133 
6134 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6135 	cpi = cpu->cpu_m.mcpu_cpi;
6136 
6137 	/* Assume that socket types are the same across the system */
6138 	if (socketstr == NULL)
6139 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6140 		    cpi->cpi_model, cpi->cpi_step);
6141 
6142 
6143 	return (socketstr);
6144 }
6145 
6146 x86_uarchrev_t
6147 cpuid_getuarchrev(cpu_t *cpu)
6148 {
6149 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6150 }
6151 
6152 int
6153 cpuid_get_chipid(cpu_t *cpu)
6154 {
6155 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6156 
6157 	if (cpuid_is_cmt(cpu))
6158 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6159 	return (cpu->cpu_id);
6160 }
6161 
6162 id_t
6163 cpuid_get_coreid(cpu_t *cpu)
6164 {
6165 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6166 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6167 }
6168 
6169 int
6170 cpuid_get_pkgcoreid(cpu_t *cpu)
6171 {
6172 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6173 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6174 }
6175 
6176 int
6177 cpuid_get_clogid(cpu_t *cpu)
6178 {
6179 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6180 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6181 }
6182 
6183 int
6184 cpuid_get_cacheid(cpu_t *cpu)
6185 {
6186 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6187 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6188 }
6189 
6190 uint_t
6191 cpuid_get_procnodeid(cpu_t *cpu)
6192 {
6193 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6194 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6195 }
6196 
6197 uint_t
6198 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6199 {
6200 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6201 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6202 }
6203 
6204 uint_t
6205 cpuid_get_compunitid(cpu_t *cpu)
6206 {
6207 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6208 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6209 }
6210 
6211 uint_t
6212 cpuid_get_cores_per_compunit(cpu_t *cpu)
6213 {
6214 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6215 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6216 }
6217 
6218 uint32_t
6219 cpuid_get_apicid(cpu_t *cpu)
6220 {
6221 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6222 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6223 		return (UINT32_MAX);
6224 	} else {
6225 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6226 	}
6227 }
6228 
6229 void
6230 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6231 {
6232 	struct cpuid_info *cpi;
6233 
6234 	if (cpu == NULL)
6235 		cpu = CPU;
6236 	cpi = cpu->cpu_m.mcpu_cpi;
6237 
6238 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6239 
6240 	if (pabits)
6241 		*pabits = cpi->cpi_pabits;
6242 	if (vabits)
6243 		*vabits = cpi->cpi_vabits;
6244 }
6245 
6246 size_t
6247 cpuid_get_xsave_size(void)
6248 {
6249 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6250 	    sizeof (struct xsave_state)));
6251 }
6252 
6253 /*
6254  * Export information about known offsets to the kernel. We only care about
6255  * things we have actually enabled support for in %xcr0.
6256  */
6257 void
6258 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6259 {
6260 	size_t size, off;
6261 
6262 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6263 
6264 	if (sizep == NULL)
6265 		sizep = &size;
6266 	if (offp == NULL)
6267 		offp = &off;
6268 
6269 	switch (bit) {
6270 	case XFEATURE_LEGACY_FP:
6271 	case XFEATURE_SSE:
6272 		*sizep = sizeof (struct fxsave_state);
6273 		*offp = 0;
6274 		break;
6275 	case XFEATURE_AVX:
6276 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6277 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6278 		break;
6279 	case XFEATURE_AVX512_OPMASK:
6280 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6281 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6282 		break;
6283 	case XFEATURE_AVX512_ZMM:
6284 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6285 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6286 		break;
6287 	case XFEATURE_AVX512_HI_ZMM:
6288 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6289 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6290 		break;
6291 	default:
6292 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6293 	}
6294 }
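
/*
 * Usage sketch: a caller that wants to locate one xsave component inside
 * a saved struct xsave_state might do something like the following,
 * where xs and dst are hypothetical pointers to the saved area and a
 * destination buffer, and the caller has already confirmed the bit is
 * present in xsave_bv_all (per the VERIFY above):
 *
 *	size_t sz, off;
 *
 *	cpuid_get_xsave_info(XFEATURE_AVX, &sz, &off);
 *	bcopy((caddr_t)xs + off, dst, sz);
 */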
6295 
6296 /*
6297  * Return true if the CPUs on this system require 'pointer clearing' for the
6298  * floating point error pointer exception handling. In the past, this has been
6299  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6300  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6301  * feature bit and is reflected in the cpi_fp_amd_save member.
6302  */
6303 boolean_t
6304 cpuid_need_fp_excp_handling(void)
6305 {
6306 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6307 	    cpuid_info0.cpi_fp_amd_save != 0);
6308 }
6309 
6310 /*
6311  * Returns the number of data TLB entries for a corresponding
6312  * pagesize.  If it can't be computed, or isn't known, the
6313  * routine returns zero.  If you ask about an architecturally
6314  * impossible pagesize, the routine will panic (so that the
6315  * hat implementor knows that things are inconsistent.)
6316  */
6317 uint_t
6318 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6319 {
6320 	struct cpuid_info *cpi;
6321 	uint_t dtlb_nent = 0;
6322 
6323 	if (cpu == NULL)
6324 		cpu = CPU;
6325 	cpi = cpu->cpu_m.mcpu_cpi;
6326 
6327 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6328 
6329 	/*
6330 	 * Check the L2 TLB info
6331 	 */
6332 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6333 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6334 
6335 		switch (pagesize) {
6336 
6337 		case 4 * 1024:
6338 			/*
6339 			 * All zero in the top 16 bits of the register
6340 			 * indicates a unified TLB. Size is in low 16 bits.
6341 			 */
6342 			if ((cp->cp_ebx & 0xffff0000) == 0)
6343 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6344 			else
6345 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6346 			break;
6347 
6348 		case 2 * 1024 * 1024:
6349 			if ((cp->cp_eax & 0xffff0000) == 0)
6350 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6351 			else
6352 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6353 			break;
6354 
6355 		default:
6356 			panic("unknown L2 pagesize");
6357 			/*NOTREACHED*/
6358 		}
6359 	}
6360 
6361 	if (dtlb_nent != 0)
6362 		return (dtlb_nent);
6363 
6364 	/*
6365 	 * No L2 TLB support for this size, try L1.
6366 	 */
6367 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6368 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6369 
6370 		switch (pagesize) {
6371 		case 4 * 1024:
6372 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6373 			break;
6374 		case 2 * 1024 * 1024:
6375 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6376 			break;
6377 		default:
6378 			panic("unknown L1 d-TLB pagesize");
6379 			/*NOTREACHED*/
6380 		}
6381 	}
6382 
6383 	return (dtlb_nent);
6384 }
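
/*
 * Example (sketch): a hat-level consumer asking how many 4K data TLB
 * entries the CPU has, falling back to a guess of its own when the
 * answer is unknown (FALLBACK_NENT is purely illustrative):
 *
 *	uint_t nent;
 *
 *	if ((nent = cpuid_get_dtlb_nent(NULL, 4 * 1024)) == 0)
 *		nent = FALLBACK_NENT;
 */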
6385 
6386 /*
6387  * Return 0 if the erratum is not present or not applicable, positive
6388  * if it is, and negative if the status of the erratum is unknown.
6389  *
6390  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6391  * Processors" #25759, Rev 3.57, August 2005
6392  */
6393 int
6394 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6395 {
6396 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6397 	uint_t eax;
6398 
6399 	/*
6400 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6401 	 * a legacy (32-bit) AMD CPU.
6402 	 */
6403 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6404 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6405 	    cpi->cpi_family == 6) {
6406 		return (0);
6407 	}
6408 
6409 	eax = cpi->cpi_std[1].cp_eax;
6410 
6411 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6412 #define	SH_B3(eax)	(eax == 0xf51)
6413 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6414 
6415 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6416 
6417 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6418 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6419 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6420 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6421 
6422 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6423 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6424 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6425 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6426 
6427 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6428 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6429 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6430 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6431 #define	BH_E4(eax)	(eax == 0x20fb1)
6432 #define	SH_E5(eax)	(eax == 0x20f42)
6433 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6434 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6435 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6436 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6437 			    DH_E6(eax) || JH_E6(eax))
6438 
6439 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6440 #define	DR_B0(eax)	(eax == 0x100f20)
6441 #define	DR_B1(eax)	(eax == 0x100f21)
6442 #define	DR_BA(eax)	(eax == 0x100f2a)
6443 #define	DR_B2(eax)	(eax == 0x100f22)
6444 #define	DR_B3(eax)	(eax == 0x100f23)
6445 #define	RB_C0(eax)	(eax == 0x100f40)
6446 
6447 	switch (erratum) {
6448 	case 1:
6449 		return (cpi->cpi_family < 0x10);
6450 	case 51:	/* what does the asterisk mean? */
6451 		return (B(eax) || SH_C0(eax) || CG(eax));
6452 	case 52:
6453 		return (B(eax));
6454 	case 57:
6455 		return (cpi->cpi_family <= 0x11);
6456 	case 58:
6457 		return (B(eax));
6458 	case 60:
6459 		return (cpi->cpi_family <= 0x11);
6460 	case 61:
6461 	case 62:
6462 	case 63:
6463 	case 64:
6464 	case 65:
6465 	case 66:
6466 	case 68:
6467 	case 69:
6468 	case 70:
6469 	case 71:
6470 		return (B(eax));
6471 	case 72:
6472 		return (SH_B0(eax));
6473 	case 74:
6474 		return (B(eax));
6475 	case 75:
6476 		return (cpi->cpi_family < 0x10);
6477 	case 76:
6478 		return (B(eax));
6479 	case 77:
6480 		return (cpi->cpi_family <= 0x11);
6481 	case 78:
6482 		return (B(eax) || SH_C0(eax));
6483 	case 79:
6484 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6485 	case 80:
6486 	case 81:
6487 	case 82:
6488 		return (B(eax));
6489 	case 83:
6490 		return (B(eax) || SH_C0(eax) || CG(eax));
6491 	case 85:
6492 		return (cpi->cpi_family < 0x10);
6493 	case 86:
6494 		return (SH_C0(eax) || CG(eax));
6495 	case 88:
6496 		return (B(eax) || SH_C0(eax));
6497 	case 89:
6498 		return (cpi->cpi_family < 0x10);
6499 	case 90:
6500 		return (B(eax) || SH_C0(eax) || CG(eax));
6501 	case 91:
6502 	case 92:
6503 		return (B(eax) || SH_C0(eax));
6504 	case 93:
6505 		return (SH_C0(eax));
6506 	case 94:
6507 		return (B(eax) || SH_C0(eax) || CG(eax));
6508 	case 95:
6509 		return (B(eax) || SH_C0(eax));
6510 	case 96:
6511 		return (B(eax) || SH_C0(eax) || CG(eax));
6512 	case 97:
6513 	case 98:
6514 		return (SH_C0(eax) || CG(eax));
6515 	case 99:
6516 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6517 	case 100:
6518 		return (B(eax) || SH_C0(eax));
6519 	case 101:
6520 	case 103:
6521 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6522 	case 104:
6523 		return (SH_C0(eax) || CG(eax) || D0(eax));
6524 	case 105:
6525 	case 106:
6526 	case 107:
6527 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6528 	case 108:
6529 		return (DH_CG(eax));
6530 	case 109:
6531 		return (SH_C0(eax) || CG(eax) || D0(eax));
6532 	case 110:
6533 		return (D0(eax) || EX(eax));
6534 	case 111:
6535 		return (CG(eax));
6536 	case 112:
6537 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6538 	case 113:
6539 		return (eax == 0x20fc0);
6540 	case 114:
6541 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6542 	case 115:
6543 		return (SH_E0(eax) || JH_E1(eax));
6544 	case 116:
6545 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6546 	case 117:
6547 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6548 	case 118:
6549 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6550 		    JH_E6(eax));
6551 	case 121:
6552 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6553 	case 122:
6554 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6555 	case 123:
6556 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6557 	case 131:
6558 		return (cpi->cpi_family < 0x10);
6559 	case 6336786:
6560 
6561 		/*
6562 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6563 		 * if this is a K8 family or newer processor. We're testing for
6564 		 * this 'erratum' to determine whether or not we have a constant
6565 		 * TSC.
6566 		 *
6567 		 * Our current fix for this is to disable the C1-Clock ramping.
6568 		 * However, this doesn't work on newer processor families nor
6569 		 * does it work when virtualized as those devices don't exist.
6570 		 */
6571 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6572 			return (0);
6573 		}
6574 
6575 		if (CPI_FAMILY(cpi) == 0xf) {
6576 			struct cpuid_regs regs;
6577 			regs.cp_eax = 0x80000007;
6578 			(void) __cpuid_insn(&regs);
6579 			return (!(regs.cp_edx & 0x100));
6580 		}
6581 		return (0);
6582 	case 147:
6583 		/*
6584 		 * This erratum (K8 #147) is not present on family 10 and newer.
6585 		 */
6586 		if (cpi->cpi_family >= 0x10) {
6587 			return (0);
6588 		}
6589 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6590 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6591 
6592 	case 6671130:
6593 		/*
6594 		 * Check for processors (pre-Shanghai) that do not provide
6595 		 * optimal management of 1GB PTEs in their TLBs.
6596 		 */
6597 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6598 
6599 	case 298:
6600 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6601 		    DR_B2(eax) || RB_C0(eax));
6602 
6603 	case 721:
6604 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6605 
6606 	default:
6607 		return (-1);
6608 
6609 	}
6610 }
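
/*
 * Callers generally only care about the sign of the return value, e.g.
 * (sketch):
 *
 *	if (cpuid_opteron_erratum(CPU, 6336786) > 0) {
 *		... apply the C1-Clock ramping workaround ...
 *	}
 *
 * A negative return means the erratum status could not be determined
 * for this processor.
 */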
6611 
6612 /*
6613  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6614  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6615  */
6616 int
6617 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6618 {
6619 	struct cpuid_info	*cpi;
6620 	uint_t			osvwid;
6621 	static int		osvwfeature = -1;
6622 	uint64_t		osvwlength;
6623 
6624 
6625 	cpi = cpu->cpu_m.mcpu_cpi;
6626 
6627 	/* confirm OSVW supported */
6628 	if (osvwfeature == -1) {
6629 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6630 	} else {
6631 		/* assert that osvw feature setting is consistent on all cpus */
6632 		ASSERT(osvwfeature ==
6633 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6634 	}
6635 	if (!osvwfeature)
6636 		return (-1);
6637 
6638 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6639 
6640 	switch (erratum) {
6641 	case 298:	/* osvwid is 0 */
6642 		osvwid = 0;
6643 		if (osvwlength <= (uint64_t)osvwid) {
6644 			/* osvwid 0 is unknown */
6645 			return (-1);
6646 		}
6647 
6648 		/*
6649 		 * Check the OSVW STATUS MSR to determine the state
6650 		 * of the erratum where:
6651 		 *   0 - fixed by HW
6652 		 *   1 - BIOS has applied the workaround when BIOS
6653 		 *   workaround is available. (Or for other errata,
6654 		 *   OS workaround is required.)
6655 		 * For a value of 1, caller will confirm that the
6656 		 * erratum 298 workaround has indeed been applied by BIOS.
6657 		 *
6658 		 * A 1 may also be set on cpus that have a HW fix when
6659 		 * running in a mixed-cpu system. Regarding erratum 298:
6660 		 *   In a multiprocessor platform, the workaround above
6661 		 *   should be applied to all processors regardless of
6662 		 *   silicon revision when an affected processor is
6663 		 *   present.
6664 		 */
6665 
6666 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6667 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6668 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6669 
6670 	default:
6671 		return (-1);
6672 	}
6673 }
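
/*
 * For reference, the status lookup above packs one bit per OSVW ID into
 * consecutive 64-bit MSRs.  A worked example, assuming OSVW_ID_CNT_PER_MSR
 * is 64: osvwid 70 would be read from MSR_AMD_OSVW_STATUS + 1, bit 6
 * (70 / 64 == 1, 70 % 64 == 6).
 */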
6674 
6675 static const char assoc_str[] = "associativity";
6676 static const char line_str[] = "line-size";
6677 static const char size_str[] = "size";
6678 
6679 static void
6680 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6681     uint32_t val)
6682 {
6683 	char buf[128];
6684 
6685 	/*
6686 	 * ndi_prop_update_int() is used because it is desirable for
6687 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6688 	 */
6689 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6690 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6691 }
6692 
6693 /*
6694  * Intel-style cache/tlb description
6695  *
6696  * Standard cpuid level 2 gives a randomly ordered
6697  * selection of tags that index into a table that describes
6698  * cache and tlb properties.
6699  */
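
/*
 * For reference, a sketch of how leaf 2 descriptors are packed (per
 * Intel's documentation; regs[] and handle_descriptor() are illustrative
 * names only): descriptor bytes fill %eax, %ebx, %ecx and %edx, a
 * register whose bit 31 is set carries no valid descriptors, and the low
 * byte of %eax is an iteration count rather than a descriptor:
 *
 *	for (reg = 0; reg < 4; reg++) {
 *		if (regs[reg] & 0x80000000)
 *			continue;
 *		for (byte = (reg == 0) ? 1 : 0; byte < 4; byte++)
 *			handle_descriptor((regs[reg] >> (byte * 8)) & 0xff);
 *	}
 *
 * The actual decoding happens in the earlier pass that populates
 * cpi_cacheinfo; the table below only interprets the resulting codes.
 */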
6700 
6701 static const char l1_icache_str[] = "l1-icache";
6702 static const char l1_dcache_str[] = "l1-dcache";
6703 static const char l2_cache_str[] = "l2-cache";
6704 static const char l3_cache_str[] = "l3-cache";
6705 static const char itlb4k_str[] = "itlb-4K";
6706 static const char dtlb4k_str[] = "dtlb-4K";
6707 static const char itlb2M_str[] = "itlb-2M";
6708 static const char itlb4M_str[] = "itlb-4M";
6709 static const char dtlb4M_str[] = "dtlb-4M";
6710 static const char dtlb24_str[] = "dtlb0-2M-4M";
6711 static const char itlb424_str[] = "itlb-4K-2M-4M";
6712 static const char itlb24_str[] = "itlb-2M-4M";
6713 static const char dtlb44_str[] = "dtlb-4K-4M";
6714 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6715 static const char sl2_cache_str[] = "sectored-l2-cache";
6716 static const char itrace_str[] = "itrace-cache";
6717 static const char sl3_cache_str[] = "sectored-l3-cache";
6718 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6719 
6720 static const struct cachetab {
6721 	uint8_t		ct_code;
6722 	uint8_t		ct_assoc;
6723 	uint16_t	ct_line_size;
6724 	size_t		ct_size;
6725 	const char	*ct_label;
6726 } intel_ctab[] = {
6727 	/*
6728 	 * maintain descending order!
6729 	 *
6730 	 * Codes ignored - Reason
6731 	 * ----------------------
6732 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6733 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6734 	 */
6735 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6736 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6737 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6738 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6739 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6740 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6741 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6742 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6743 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6744 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6745 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6746 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6747 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6748 	{ 0xc0, 4, 0, 8, dtlb44_str },
6749 	{ 0xba, 4, 0, 64, dtlb4k_str },
6750 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6751 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6752 	{ 0xb2, 4, 0, 64, itlb4k_str },
6753 	{ 0xb0, 4, 0, 128, itlb4k_str },
6754 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6755 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6756 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6757 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6758 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6759 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6760 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6761 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6762 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6763 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6764 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6765 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6766 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6767 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6768 	{ 0x73, 8, 0, 64*1024, itrace_str},
6769 	{ 0x72, 8, 0, 32*1024, itrace_str},
6770 	{ 0x71, 8, 0, 16*1024, itrace_str},
6771 	{ 0x70, 8, 0, 12*1024, itrace_str},
6772 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6773 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6774 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6775 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6776 	{ 0x5d, 0, 0, 256, dtlb44_str},
6777 	{ 0x5c, 0, 0, 128, dtlb44_str},
6778 	{ 0x5b, 0, 0, 64, dtlb44_str},
6779 	{ 0x5a, 4, 0, 32, dtlb24_str},
6780 	{ 0x59, 0, 0, 16, dtlb4k_str},
6781 	{ 0x57, 4, 0, 16, dtlb4k_str},
6782 	{ 0x56, 4, 0, 16, dtlb4M_str},
6783 	{ 0x55, 0, 0, 7, itlb24_str},
6784 	{ 0x52, 0, 0, 256, itlb424_str},
6785 	{ 0x51, 0, 0, 128, itlb424_str},
6786 	{ 0x50, 0, 0, 64, itlb424_str},
6787 	{ 0x4f, 0, 0, 32, itlb4k_str},
6788 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6789 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6790 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6791 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6792 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6793 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6794 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6795 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6796 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6797 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6798 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6799 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6800 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6801 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6802 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6803 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6804 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6805 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6806 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6807 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6808 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6809 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6810 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6811 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6812 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6813 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6814 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6815 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6816 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6817 	{ 0x0b, 4, 0, 4, itlb4M_str},
6818 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6819 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6820 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6821 	{ 0x05, 4, 0, 32, dtlb4M_str},
6822 	{ 0x04, 4, 0, 8, dtlb4M_str},
6823 	{ 0x03, 4, 0, 64, dtlb4k_str},
6824 	{ 0x02, 4, 0, 2, itlb4M_str},
6825 	{ 0x01, 4, 0, 32, itlb4k_str},
6826 	{ 0 }
6827 };
6828 
6829 static const struct cachetab cyrix_ctab[] = {
6830 	{ 0x70, 4, 0, 32, "tlb-4K" },
6831 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6832 	{ 0 }
6833 };
6834 
6835 /*
6836  * Search a cache table for a matching entry
6837  */
6838 static const struct cachetab *
6839 find_cacheent(const struct cachetab *ct, uint_t code)
6840 {
6841 	if (code != 0) {
6842 		for (; ct->ct_code != 0; ct++)
6843 			if (ct->ct_code <= code)
6844 				break;
6845 		if (ct->ct_code == code)
6846 			return (ct);
6847 	}
6848 	return (NULL);
6849 }
6850 
6851 /*
6852  * Populate cachetab entry with L2 or L3 cache-information using
6853  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6854  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6855  * information is found.
6856  */
6857 static int
6858 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6859 {
6860 	uint32_t level, i;
6861 	int ret = 0;
6862 
6863 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6864 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6865 
6866 		if (level == 2 || level == 3) {
6867 			ct->ct_assoc =
6868 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6869 			ct->ct_line_size =
6870 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6871 			ct->ct_size = ct->ct_assoc *
6872 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6873 			    ct->ct_line_size *
6874 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6875 
6876 			if (level == 2) {
6877 				ct->ct_label = l2_cache_str;
6878 			} else if (level == 3) {
6879 				ct->ct_label = l3_cache_str;
6880 			}
6881 			ret = 1;
6882 		}
6883 	}
6884 
6885 	return (ret);
6886 }
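
/*
 * Worked example of the size computation above: a leaf-4 entry reporting
 * 8 ways, 1 partition, a 64-byte coherency line size and 2048 sets (each
 * field is encoded as one less than these values) yields
 * 8 * 1 * 64 * 2048 = 1048576 bytes, i.e. a 1MB cache.
 */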
6887 
6888 /*
6889  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6890  * The walk is terminated if the walker returns non-zero.
6891  */
6892 static void
6893 intel_walk_cacheinfo(struct cpuid_info *cpi,
6894     void *arg, int (*func)(void *, const struct cachetab *))
6895 {
6896 	const struct cachetab *ct;
6897 	struct cachetab des_49_ct, des_b1_ct;
6898 	uint8_t *dp;
6899 	int i;
6900 
6901 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6902 		return;
6903 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6904 		/*
6905 		 * For overloaded descriptor 0x49 we use cpuid function 4
6906 		 * if supported by the current processor, to create
6907 		 * cache information.
6908 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6909 		 * to disambiguate the cache information.
6910 		 */
6911 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6912 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6913 			ct = &des_49_ct;
6914 		} else if (*dp == 0xb1) {
6915 			des_b1_ct.ct_code = 0xb1;
6916 			des_b1_ct.ct_assoc = 4;
6917 			des_b1_ct.ct_line_size = 0;
6918 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6919 				des_b1_ct.ct_size = 8;
6920 				des_b1_ct.ct_label = itlb2M_str;
6921 			} else {
6922 				des_b1_ct.ct_size = 4;
6923 				des_b1_ct.ct_label = itlb4M_str;
6924 			}
6925 			ct = &des_b1_ct;
6926 		} else {
6927 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6928 				continue;
6929 			}
6930 		}
6931 
6932 		if (func(arg, ct) != 0) {
6933 			break;
6934 		}
6935 	}
6936 }
6937 
6938 /*
6939  * (Like the Intel one, except for Cyrix CPUs)
6940  */
6941 static void
6942 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6943     void *arg, int (*func)(void *, const struct cachetab *))
6944 {
6945 	const struct cachetab *ct;
6946 	uint8_t *dp;
6947 	int i;
6948 
6949 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6950 		return;
6951 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6952 		/*
6953 		 * Search Cyrix-specific descriptor table first ..
6954 		 */
6955 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6956 			if (func(arg, ct) != 0)
6957 				break;
6958 			continue;
6959 		}
6960 		/*
6961 		 * .. else fall back to the Intel one
6962 		 */
6963 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6964 			if (func(arg, ct) != 0)
6965 				break;
6966 			continue;
6967 		}
6968 	}
6969 }
6970 
6971 /*
6972  * A cacheinfo walker that adds associativity, line-size, and size properties
6973  * to the devinfo node it is passed as an argument.
6974  */
6975 static int
6976 add_cacheent_props(void *arg, const struct cachetab *ct)
6977 {
6978 	dev_info_t *devi = arg;
6979 
6980 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6981 	if (ct->ct_line_size != 0)
6982 		add_cache_prop(devi, ct->ct_label, line_str,
6983 		    ct->ct_line_size);
6984 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6985 	return (0);
6986 }
6987 
6988 
6989 static const char fully_assoc[] = "fully-associative?";
6990 
6991 /*
6992  * AMD style cache/tlb description
6993  *
6994  * Extended functions 5 and 6 directly describe properties of
6995  * tlbs and various cache levels.
6996  */
6997 static void
6998 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6999 {
7000 	switch (assoc) {
7001 	case 0:	/* reserved; ignore */
7002 		break;
7003 	default:
7004 		add_cache_prop(devi, label, assoc_str, assoc);
7005 		break;
7006 	case 0xff:
7007 		add_cache_prop(devi, label, fully_assoc, 1);
7008 		break;
7009 	}
7010 }
7011 
7012 static void
7013 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7014 {
7015 	if (size == 0)
7016 		return;
7017 	add_cache_prop(devi, label, size_str, size);
7018 	add_amd_assoc(devi, label, assoc);
7019 }
7020 
7021 static void
7022 add_amd_cache(dev_info_t *devi, const char *label,
7023     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7024 {
7025 	if (size == 0 || line_size == 0)
7026 		return;
7027 	add_amd_assoc(devi, label, assoc);
7028 	/*
7029 	 * Most AMD parts have a sectored cache. Multiple cache lines are
7030 	 * associated with each tag. A sector consists of all cache lines
7031 	 * associated with a tag. For example, the AMD K6-III has a sector
7032 	 * size of 2 cache lines per tag.
7033 	 */
7034 	if (lines_per_tag != 0)
7035 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7036 	add_cache_prop(devi, label, line_str, line_size);
7037 	add_cache_prop(devi, label, size_str, size * 1024);
7038 }
7039 
7040 static void
7041 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7042 {
7043 	switch (assoc) {
7044 	case 0:	/* off */
7045 		break;
7046 	case 1:
7047 	case 2:
7048 	case 4:
7049 		add_cache_prop(devi, label, assoc_str, assoc);
7050 		break;
7051 	case 6:
7052 		add_cache_prop(devi, label, assoc_str, 8);
7053 		break;
7054 	case 8:
7055 		add_cache_prop(devi, label, assoc_str, 16);
7056 		break;
7057 	case 0xf:
7058 		add_cache_prop(devi, label, fully_assoc, 1);
7059 		break;
7060 	default: /* reserved; ignore */
7061 		break;
7062 	}
7063 }
7064 
7065 static void
7066 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7067 {
7068 	if (size == 0 || assoc == 0)
7069 		return;
7070 	add_amd_l2_assoc(devi, label, assoc);
7071 	add_cache_prop(devi, label, size_str, size);
7072 }
7073 
7074 static void
7075 add_amd_l2_cache(dev_info_t *devi, const char *label,
7076     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7077 {
7078 	if (size == 0 || assoc == 0 || line_size == 0)
7079 		return;
7080 	add_amd_l2_assoc(devi, label, assoc);
7081 	if (lines_per_tag != 0)
7082 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7083 	add_cache_prop(devi, label, line_str, line_size);
7084 	add_cache_prop(devi, label, size_str, size * 1024);
7085 }
7086 
7087 static void
7088 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7089 {
7090 	struct cpuid_regs *cp;
7091 
7092 	if (cpi->cpi_xmaxeax < 0x80000005)
7093 		return;
7094 	cp = &cpi->cpi_extd[5];
7095 
7096 	/*
7097 	 * 4M/2M L1 TLB configuration
7098 	 *
7099 	 * We report the size for 2M pages because AMD uses two
7100 	 * TLB entries for one 4M page.
7101 	 */
7102 	add_amd_tlb(devi, "dtlb-2M",
7103 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7104 	add_amd_tlb(devi, "itlb-2M",
7105 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7106 
7107 	/*
7108 	 * 4K L1 TLB configuration
7109 	 */
7110 
7111 	switch (cpi->cpi_vendor) {
7112 		uint_t nentries;
7113 	case X86_VENDOR_TM:
7114 		if (cpi->cpi_family >= 5) {
7115 			/*
7116 			 * Crusoe processors have 256 TLB entries, but
7117 			 * cpuid data format constrains them to only
7118 			 * reporting 255 of them.
7119 			 */
7120 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7121 				nentries = 256;
7122 			/*
7123 			 * Crusoe processors also have a unified TLB
7124 			 */
7125 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7126 			    nentries);
7127 			break;
7128 		}
7129 		/*FALLTHROUGH*/
7130 	default:
7131 		add_amd_tlb(devi, itlb4k_str,
7132 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7133 		add_amd_tlb(devi, dtlb4k_str,
7134 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7135 		break;
7136 	}
7137 
7138 	/*
7139 	 * data L1 cache configuration
7140 	 */
7141 
7142 	add_amd_cache(devi, l1_dcache_str,
7143 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7144 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7145 
7146 	/*
7147 	 * code L1 cache configuration
7148 	 */
7149 
7150 	add_amd_cache(devi, l1_icache_str,
7151 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7152 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7153 
7154 	if (cpi->cpi_xmaxeax < 0x80000006)
7155 		return;
7156 	cp = &cpi->cpi_extd[6];
7157 
7158 	/* Check for a unified L2 TLB for large pages */
7159 
7160 	if (BITX(cp->cp_eax, 31, 16) == 0)
7161 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7162 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7163 	else {
7164 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7165 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7166 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7167 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7168 	}
7169 
7170 	/* Check for a unified L2 TLB for 4K pages */
7171 
7172 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7173 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7174 		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
7175 	} else {
7176 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7177 		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
7178 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7179 		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
7180 	}
7181 
7182 	add_amd_l2_cache(devi, l2_cache_str,
7183 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7184 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7185 }
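
/*
 * Worked example (hypothetical register value) for the L2 cache decode
 * above: if Fn8000_0006 %ecx were 0x02006140, bits 31:16 (0x0200) give a
 * 512KB cache, bits 15:12 (0x6) encode 8-way associativity (see
 * add_amd_l2_assoc() above and amd_afd[] below), bits 11:8 (0x1) give
 * one line per tag, and bits 7:0 (0x40) give a 64-byte line.
 */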
7186 
7187 /*
7188  * There are two basic ways that the x86 world describes its cache
7189  * and tlb architecture - Intel's way and AMD's way.
7190  *
7191  * Return which flavor of cache architecture we should use
7192  */
7193 static int
7194 x86_which_cacheinfo(struct cpuid_info *cpi)
7195 {
7196 	switch (cpi->cpi_vendor) {
7197 	case X86_VENDOR_Intel:
7198 		if (cpi->cpi_maxeax >= 2)
7199 			return (X86_VENDOR_Intel);
7200 		break;
7201 	case X86_VENDOR_AMD:
7202 		/*
7203 		 * The K5 model 1 was the first part from AMD that reported
7204 		 * cache sizes via extended cpuid functions.
7205 		 */
7206 		if (cpi->cpi_family > 5 ||
7207 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7208 			return (X86_VENDOR_AMD);
7209 		break;
7210 	case X86_VENDOR_HYGON:
7211 		return (X86_VENDOR_AMD);
7212 	case X86_VENDOR_TM:
7213 		if (cpi->cpi_family >= 5)
7214 			return (X86_VENDOR_AMD);
7215 		/*FALLTHROUGH*/
7216 	default:
7217 		/*
7218 		 * If they have extended CPU data for 0x80000005
7219 		 * then we assume they have AMD-format cache
7220 		 * information.
7221 		 *
7222 		 * If not, and the vendor happens to be Cyrix,
7223 		 * then try our Cyrix-specific handler.
7224 		 *
7225 		 * If we're not Cyrix, then assume we're using Intel's
7226 		 * table-driven format instead.
7227 		 */
7228 		if (cpi->cpi_xmaxeax >= 0x80000005)
7229 			return (X86_VENDOR_AMD);
7230 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7231 			return (X86_VENDOR_Cyrix);
7232 		else if (cpi->cpi_maxeax >= 2)
7233 			return (X86_VENDOR_Intel);
7234 		break;
7235 	}
7236 	return (-1);
7237 }
7238 
7239 void
7240 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7241     struct cpuid_info *cpi)
7242 {
7243 	dev_info_t *cpu_devi;
7244 	int create;
7245 
7246 	cpu_devi = (dev_info_t *)dip;
7247 
7248 	/* device_type */
7249 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7250 	    "device_type", "cpu");
7251 
7252 	/* reg */
7253 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7254 	    "reg", cpu_id);
7255 
7256 	/* cpu-mhz, and clock-frequency */
7257 	if (cpu_freq > 0) {
7258 		long long mul;
7259 
7260 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7261 		    "cpu-mhz", cpu_freq);
7262 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7263 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7264 			    "clock-frequency", (int)mul);
7265 	}
7266 
7267 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7268 
7269 	/* vendor-id */
7270 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7271 	    "vendor-id", cpi->cpi_vendorstr);
7272 
7273 	if (cpi->cpi_maxeax == 0) {
7274 		return;
7275 	}
7276 
7277 	/*
7278 	 * family, model, and step
7279 	 */
7280 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7281 	    "family", CPI_FAMILY(cpi));
7282 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7283 	    "cpu-model", CPI_MODEL(cpi));
7284 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7285 	    "stepping-id", CPI_STEP(cpi));
7286 
7287 	/* type */
7288 	switch (cpi->cpi_vendor) {
7289 	case X86_VENDOR_Intel:
7290 		create = 1;
7291 		break;
7292 	default:
7293 		create = 0;
7294 		break;
7295 	}
7296 	if (create)
7297 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7298 		    "type", CPI_TYPE(cpi));
7299 
7300 	/* ext-family */
7301 	switch (cpi->cpi_vendor) {
7302 	case X86_VENDOR_Intel:
7303 	case X86_VENDOR_AMD:
7304 		create = cpi->cpi_family >= 0xf;
7305 		break;
7306 	case X86_VENDOR_HYGON:
7307 		create = 1;
7308 		break;
7309 	default:
7310 		create = 0;
7311 		break;
7312 	}
7313 	if (create)
7314 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7315 		    "ext-family", CPI_FAMILY_XTD(cpi));
7316 
7317 	/* ext-model */
7318 	switch (cpi->cpi_vendor) {
7319 	case X86_VENDOR_Intel:
7320 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7321 		break;
7322 	case X86_VENDOR_AMD:
7323 		create = CPI_FAMILY(cpi) == 0xf;
7324 		break;
7325 	case X86_VENDOR_HYGON:
7326 		create = 1;
7327 		break;
7328 	default:
7329 		create = 0;
7330 		break;
7331 	}
7332 	if (create)
7333 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7334 		    "ext-model", CPI_MODEL_XTD(cpi));
7335 
7336 	/* generation */
7337 	switch (cpi->cpi_vendor) {
7338 	case X86_VENDOR_AMD:
7339 	case X86_VENDOR_HYGON:
7340 		/*
7341 		 * AMD K5 model 1 was the first part to support this
7342 		 */
7343 		create = cpi->cpi_xmaxeax >= 0x80000001;
7344 		break;
7345 	default:
7346 		create = 0;
7347 		break;
7348 	}
7349 	if (create)
7350 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7351 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7352 
7353 	/* brand-id */
7354 	switch (cpi->cpi_vendor) {
7355 	case X86_VENDOR_Intel:
7356 		/*
7357 		 * brand id first appeared on Pentium III Xeon model 8,
7358 		 * and Celeron model 8 processors and Opteron
7359 		 */
7360 		create = cpi->cpi_family > 6 ||
7361 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7362 		break;
7363 	case X86_VENDOR_AMD:
7364 		create = cpi->cpi_family >= 0xf;
7365 		break;
7366 	case X86_VENDOR_HYGON:
7367 		create = 1;
7368 		break;
7369 	default:
7370 		create = 0;
7371 		break;
7372 	}
7373 	if (create && cpi->cpi_brandid != 0) {
7374 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7375 		    "brand-id", cpi->cpi_brandid);
7376 	}
7377 
7378 	/* chunks, and apic-id */
7379 	switch (cpi->cpi_vendor) {
7380 		/*
7381 		 * first available on Pentium IV and Opteron (K8)
7382 		 */
7383 	case X86_VENDOR_Intel:
7384 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7385 		break;
7386 	case X86_VENDOR_AMD:
7387 		create = cpi->cpi_family >= 0xf;
7388 		break;
7389 	case X86_VENDOR_HYGON:
7390 		create = 1;
7391 		break;
7392 	default:
7393 		create = 0;
7394 		break;
7395 	}
7396 	if (create) {
7397 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7398 		    "chunks", CPI_CHUNKS(cpi));
7399 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7400 		    "apic-id", cpi->cpi_apicid);
7401 		if (cpi->cpi_chipid >= 0) {
7402 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7403 			    "chip#", cpi->cpi_chipid);
7404 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7405 			    "clog#", cpi->cpi_clogid);
7406 		}
7407 	}
7408 
7409 	/* cpuid-features */
7410 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7411 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7412 
7413 
7414 	/* cpuid-features-ecx */
7415 	switch (cpi->cpi_vendor) {
7416 	case X86_VENDOR_Intel:
7417 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7418 		break;
7419 	case X86_VENDOR_AMD:
7420 		create = cpi->cpi_family >= 0xf;
7421 		break;
7422 	case X86_VENDOR_HYGON:
7423 		create = 1;
7424 		break;
7425 	default:
7426 		create = 0;
7427 		break;
7428 	}
7429 	if (create)
7430 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7431 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7432 
7433 	/* ext-cpuid-features */
7434 	switch (cpi->cpi_vendor) {
7435 	case X86_VENDOR_Intel:
7436 	case X86_VENDOR_AMD:
7437 	case X86_VENDOR_HYGON:
7438 	case X86_VENDOR_Cyrix:
7439 	case X86_VENDOR_TM:
7440 	case X86_VENDOR_Centaur:
7441 		create = cpi->cpi_xmaxeax >= 0x80000001;
7442 		break;
7443 	default:
7444 		create = 0;
7445 		break;
7446 	}
7447 	if (create) {
7448 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7449 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7450 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7451 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7452 	}
7453 
7454 	/*
7455 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7456 	 * model 1, and Cyrix GXm.  On earlier models we try and
7457 	 * simulate something similar .. so this string should always
7458 	 * say -something- about the processor, however lame.
7459 	 */
7460 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7461 	    "brand-string", cpi->cpi_brandstr);
7462 
7463 	/*
7464 	 * Finally, cache and tlb information
7465 	 */
7466 	switch (x86_which_cacheinfo(cpi)) {
7467 	case X86_VENDOR_Intel:
7468 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7469 		break;
7470 	case X86_VENDOR_Cyrix:
7471 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7472 		break;
7473 	case X86_VENDOR_AMD:
7474 		amd_cache_info(cpi, cpu_devi);
7475 		break;
7476 	default:
7477 		break;
7478 	}
7479 }
7480 
7481 struct l2info {
7482 	int *l2i_csz;
7483 	int *l2i_lsz;
7484 	int *l2i_assoc;
7485 	int l2i_ret;
7486 };
7487 
7488 /*
7489  * A cacheinfo walker that fetches the size, line-size and associativity
7490  * of the L2 cache
7491  */
7492 static int
7493 intel_l2cinfo(void *arg, const struct cachetab *ct)
7494 {
7495 	struct l2info *l2i = arg;
7496 	int *ip;
7497 
7498 	if (ct->ct_label != l2_cache_str &&
7499 	    ct->ct_label != sl2_cache_str)
7500 		return (0);	/* not an L2 -- keep walking */
7501 
7502 	if ((ip = l2i->l2i_csz) != NULL)
7503 		*ip = ct->ct_size;
7504 	if ((ip = l2i->l2i_lsz) != NULL)
7505 		*ip = ct->ct_line_size;
7506 	if ((ip = l2i->l2i_assoc) != NULL)
7507 		*ip = ct->ct_assoc;
7508 	l2i->l2i_ret = ct->ct_size;
7509 	return (1);		/* was an L2 -- terminate walk */
7510 }
7511 
7512 /*
7513  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7514  *
7515  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7516  *	value is the associativity, the associativity for the L2 cache and
7517  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7518  *	an index into the amd_afd[] array to determine the associativity.
7519  *	-1 is undefined. 0 is fully associative.
7520  */
7521 
7522 static int amd_afd[] =
7523 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
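
/*
 * For example, an encoded associativity field of 0x6 maps to
 * amd_afd[6] == 8 (8-way), 0xa maps to amd_afd[10] == 32 (32-way), and
 * 0xf maps to amd_afd[15] == 0, i.e. fully associative.
 */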
7524 
7525 static void
7526 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7527 {
7528 	struct cpuid_regs *cp;
7529 	uint_t size, assoc;
7530 	int i;
7531 	int *ip;
7532 
7533 	if (cpi->cpi_xmaxeax < 0x80000006)
7534 		return;
7535 	cp = &cpi->cpi_extd[6];
7536 
7537 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7538 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7539 		uint_t cachesz = size * 1024;
7540 		assoc = amd_afd[i];
7541 
7542 		ASSERT(assoc != -1);
7543 
7544 		if ((ip = l2i->l2i_csz) != NULL)
7545 			*ip = cachesz;
7546 		if ((ip = l2i->l2i_lsz) != NULL)
7547 			*ip = BITX(cp->cp_ecx, 7, 0);
7548 		if ((ip = l2i->l2i_assoc) != NULL)
7549 			*ip = assoc;
7550 		l2i->l2i_ret = cachesz;
7551 	}
7552 }
7553 
7554 int
7555 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7556 {
7557 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7558 	struct l2info __l2info, *l2i = &__l2info;
7559 
7560 	l2i->l2i_csz = csz;
7561 	l2i->l2i_lsz = lsz;
7562 	l2i->l2i_assoc = assoc;
7563 	l2i->l2i_ret = -1;
7564 
7565 	switch (x86_which_cacheinfo(cpi)) {
7566 	case X86_VENDOR_Intel:
7567 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7568 		break;
7569 	case X86_VENDOR_Cyrix:
7570 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7571 		break;
7572 	case X86_VENDOR_AMD:
7573 		amd_l2cacheinfo(cpi, l2i);
7574 		break;
7575 	default:
7576 		break;
7577 	}
7578 	return (l2i->l2i_ret);
7579 }
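
/*
 * Example (sketch): a caller that only needs the L2 line size can pass
 * NULL for the other out parameters and treat a non-positive return as
 * "no L2 information available" (the fallback value of 64 below is
 * illustrative only):
 *
 *	int lsz;
 *
 *	if (getl2cacheinfo(CPU, NULL, &lsz, NULL) <= 0)
 *		lsz = 64;
 */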
7580 
7581 #if !defined(__xpv)
7582 
7583 uint32_t *
7584 cpuid_mwait_alloc(cpu_t *cpu)
7585 {
7586 	uint32_t	*ret;
7587 	size_t		mwait_size;
7588 
7589 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7590 
7591 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7592 	if (mwait_size == 0)
7593 		return (NULL);
7594 
7595 	/*
7596 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7597 	 * allocations.  mwait_size is currently cache line sized.  Neither
7598 	 * of these implementation details is guaranteed to be true in the
7599 	 * future.
7600 	 *
7601 	 * First try allocating mwait_size as kmem_alloc() currently returns
7602 	 * correctly aligned memory.  If kmem_alloc() does not return
7603 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7604 	 *
7605 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7606 	 * decide to free this memory.
7607 	 */
7608 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7609 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7610 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7611 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7612 		*ret = MWAIT_RUNNING;
7613 		return (ret);
7614 	} else {
7615 		kmem_free(ret, mwait_size);
7616 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7617 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7618 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7619 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7620 		*ret = MWAIT_RUNNING;
7621 		return (ret);
7622 	}
7623 }
7624 
7625 void
7626 cpuid_mwait_free(cpu_t *cpu)
7627 {
7628 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7629 		return;
7630 	}
7631 
7632 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7633 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7634 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7635 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7636 	}
7637 
7638 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7639 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7640 }
7641 
7642 void
7643 patch_tsc_read(int flag)
7644 {
7645 	size_t cnt;
7646 
7647 	switch (flag) {
7648 	case TSC_NONE:
7649 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7650 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7651 		break;
7652 	case TSC_RDTSC_LFENCE:
7653 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7654 		(void) memcpy((void *)tsc_read,
7655 		    (void *)&_tsc_lfence_start, cnt);
7656 		break;
7657 	case TSC_TSCP:
7658 		cnt = &_tscp_end - &_tscp_start;
7659 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7660 		break;
7661 	default:
7662 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7663 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7664 		break;
7665 	}
7666 	tsc_type = flag;
7667 }
7668 
7669 int
7670 cpuid_deep_cstates_supported(void)
7671 {
7672 	struct cpuid_info *cpi;
7673 	struct cpuid_regs regs;
7674 
7675 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7676 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7677 
7678 	cpi = CPU->cpu_m.mcpu_cpi;
7679 
7680 	switch (cpi->cpi_vendor) {
7681 	case X86_VENDOR_Intel:
7682 		if (cpi->cpi_xmaxeax < 0x80000007)
7683 			return (0);
7684 
7685 		/*
7686 		 * Does TSC run at a constant rate in all C-states?
7687 		 */
7688 		regs.cp_eax = 0x80000007;
7689 		(void) __cpuid_insn(&regs);
7690 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7691 
7692 	default:
7693 		return (0);
7694 	}
7695 }
7696 
7697 #endif	/* !__xpv */
7698 
7699 void
7700 post_startup_cpu_fixups(void)
7701 {
7702 #ifndef __xpv
7703 	/*
7704 	 * Some AMD processors support C1E state. Entering this state will
7705 	 * cause the local APIC timer to stop, which we can't deal with at
7706 	 * this time.
7707 	 */
7708 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7709 		on_trap_data_t otd;
7710 		uint64_t reg;
7711 
7712 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7713 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7714 			/* Disable C1E state if it is enabled by BIOS */
7715 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7716 			    AMD_ACTONCMPHALT_MASK) {
7717 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7718 				    AMD_ACTONCMPHALT_SHIFT);
7719 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7720 			}
7721 		}
7722 		no_trap();
7723 	}
7724 #endif	/* !__xpv */
7725 }
7726 
7727 void
7728 enable_pcid(void)
7729 {
7730 	if (x86_use_pcid == -1)
7731 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7732 
7733 	if (x86_use_invpcid == -1) {
7734 		x86_use_invpcid = is_x86_feature(x86_featureset,
7735 		    X86FSET_INVPCID);
7736 	}
7737 
7738 	if (!x86_use_pcid)
7739 		return;
7740 
7741 	/*
7742 	 * Intel says that upon setting PCIDE, the CPU immediately starts using
7743 	 * the PCID bits; better make sure there's nothing there.
7744 	 */
7745 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7746 
7747 	setcr4(getcr4() | CR4_PCIDE);
7748 }
7749 
7750 /*
7751  * Setup necessary registers to enable XSAVE feature on this processor.
7752  * This function needs to be called early enough, so that no xsave/xrstor
7753  * ops will execute on the processor before the MSRs are properly set up.
7754  *
7755  * Current implementation has the following assumption:
7756  * - cpuid_pass_basic() is done, so that X86 features are known.
7757  * - fpu_probe() is done, so that fp_save_mech is chosen.
7758  */
7759 void
7760 xsave_setup_msr(cpu_t *cpu)
7761 {
7762 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7763 	ASSERT(fp_save_mech == FP_XSAVE);
7764 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7765 
7766 	/* Enable OSXSAVE in CR4. */
7767 	setcr4(getcr4() | CR4_OSXSAVE);
7768 	/*
7769 	 * Update the SW copy of ECX so that /dev/cpu/self/cpuid will report
7770 	 * the correct value.
7771 	 */
7772 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7773 	setup_xfem();
7774 }
7775 
7776 /*
7777  * Starting with the Westmere processor, the local
7778  * APIC timer continues running in all C-states,
7779  * including the deepest ones.
7780  */
7781 int
7782 cpuid_arat_supported(void)
7783 {
7784 	struct cpuid_info *cpi;
7785 	struct cpuid_regs regs;
7786 
7787 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7788 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7789 
7790 	cpi = CPU->cpu_m.mcpu_cpi;
7791 
7792 	switch (cpi->cpi_vendor) {
7793 	case X86_VENDOR_Intel:
7794 		/*
7795 		 * Always-running Local APIC Timer is
7796 		 * indicated by CPUID.6.EAX[2].
7797 		 */
7798 		if (cpi->cpi_maxeax >= 6) {
7799 			regs.cp_eax = 6;
7800 			(void) cpuid_insn(NULL, &regs);
7801 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7802 		} else {
7803 			return (0);
7804 		}
7805 	default:
7806 		return (0);
7807 	}
7808 }
7809 
7810 /*
7811  * Check support for Intel ENERGY_PERF_BIAS feature
7812  */
7813 int
7814 cpuid_iepb_supported(struct cpu *cp)
7815 {
7816 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7817 	struct cpuid_regs regs;
7818 
7819 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7820 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7821 
7822 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7823 		return (0);
7824 	}
7825 
7826 	/*
7827 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7828 	 * capability bit CPUID.6.ECX.3
7829 	 */
7830 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7831 		return (0);
7832 
7833 	regs.cp_eax = 0x6;
7834 	(void) cpuid_insn(NULL, &regs);
7835 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7836 }
7837 
7838 /*
7839  * Check support for TSC deadline timer
7840  *
7841  * The TSC deadline timer provides a superior software programming
7842  * model to the local APIC timer, one that eliminates "time drift".
7843  * Instead of specifying a relative time, software specifies an
7844  * absolute time as the target at which the processor should
7845  * generate a timer event.
7846  */
7847 int
7848 cpuid_deadline_tsc_supported(void)
7849 {
7850 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7851 	struct cpuid_regs regs;
7852 
7853 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7854 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7855 
7856 	switch (cpi->cpi_vendor) {
7857 	case X86_VENDOR_Intel:
7858 		if (cpi->cpi_maxeax >= 1) {
7859 			regs.cp_eax = 1;
7860 			(void) cpuid_insn(NULL, &regs);
7861 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7862 		} else {
7863 			return (0);
7864 		}
7865 	default:
7866 		return (0);
7867 	}
7868 }
7869 
7870 #if !defined(__xpv)
7871 /*
7872  * Patch in versions of bcopy for high performance Intel Nehalem processors
7873  * and later...
7874  */
7875 void
7876 patch_memops(uint_t vendor)
7877 {
7878 	size_t cnt, i;
7879 	caddr_t to, from;
7880 
7881 	if ((vendor == X86_VENDOR_Intel) &&
7882 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7883 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7884 		to = &bcopy_ck_size;
7885 		from = &bcopy_patch_start;
7886 		for (i = 0; i < cnt; i++) {
7887 			*to++ = *from++;
7888 		}
7889 	}
7890 }
7891 #endif  /*  !__xpv */
7892 
7893 /*
7894  * We're being asked to tell the system how many bits are required to represent
7895  * the various core and strand IDs. While it's tempting to derive this based
7896  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7897  * correct. Instead, this needs to be based on the number of bits that the APIC
7898  * allows for these different configurations. We only update these to a larger
7899  * value if we find one.
7900  */
7901 void
7902 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7903 {
7904 	struct cpuid_info *cpi;
7905 
7906 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7907 	cpi = cpu->cpu_m.mcpu_cpi;
7908 
7909 	if (cpi->cpi_ncore_bits > *core_nbits) {
7910 		*core_nbits = cpi->cpi_ncore_bits;
7911 	}
7912 
7913 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7914 		*strand_nbits = cpi->cpi_nthread_bits;
7915 	}
7916 }
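
/*
 * Illustrative example: if a chip reports cpi_nthread_bits == 1 and
 * cpi_ncore_bits == 3, a strand's APIC ID decomposes as the strand ID in bit
 * [0], the core ID in bits [3:1], and the package ID in the bits above that.
 * Consumers combine the maximum widths seen across all CPUs when carving up
 * APIC IDs this way.
 */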
7917 
7918 void
7919 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7920 {
7921 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7922 	struct cpuid_regs cp;
7923 
7924 	/*
7925 	 * Reread the CPUID portions that we need for various security
7926 	 * information.
7927 	 */
7928 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7929 		/*
7930 		 * Check if we now have leaf 7 available to us.
7931 		 */
7932 		if (cpi->cpi_maxeax < 7) {
7933 			bzero(&cp, sizeof (cp));
7934 			cp.cp_eax = 0;
7935 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7936 			if (cpi->cpi_maxeax < 7)
7937 				return;
7938 		}
7939 
7940 		bzero(&cp, sizeof (cp));
7941 		cp.cp_eax = 7;
7942 		cp.cp_ecx = 0;
7943 		(void) __cpuid_insn(&cp);
7944 		cpi->cpi_std[7] = cp;
7945 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7946 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7947 		/* No xcpuid support */
7948 		if (cpi->cpi_family < 5 ||
7949 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7950 			return;
7951 
7952 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7953 			bzero(&cp, sizeof (cp));
7954 			cp.cp_eax = CPUID_LEAF_EXT_0;
7955 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7956 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7957 				return;
7958 			}
7959 		}
7960 
7961 		/*
7962 		 * Most AMD features are in extended leaf 8. Automatic IBRS was
7963 		 * added in extended leaf 0x21, so we also check that.
7964 		 */
7965 		bzero(&cp, sizeof (cp));
7966 		cp.cp_eax = CPUID_LEAF_EXT_8;
7967 		(void) __cpuid_insn(&cp);
7968 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7969 		cpi->cpi_extd[8] = cp;
7970 
7971 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7972 			return;
7973 		}
7974 
7975 		bzero(&cp, sizeof (cp));
7976 		cp.cp_eax = CPUID_LEAF_EXT_21;
7977 		(void) __cpuid_insn(&cp);
7978 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7979 		cpi->cpi_extd[0x21] = cp;
7980 	} else {
7981 		/*
7982 		 * Nothing to do here. Return an empty set which has already
7983 		 * been zeroed for us.
7984 		 */
7985 		return;
7986 	}
7987 	cpuid_scan_security(cpu, fset);
7988 }
7989 
7990 /* ARGSUSED */
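/*
 * Cross-call handler used by cpuid_post_ucodeadm() below.  arg0 is the base
 * of a per-CPU array with one sizeof (x86_featureset) slot per cpu_id, and
 * arg1 selects whether this is the first pass (boot CPU only) or the second
 * pass (every other CPU).
 */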
7991 static int
7992 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7993 {
7994 	uchar_t *fset;
7995 	boolean_t first_pass = (boolean_t)arg1;
7996 
7997 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7998 	if (first_pass && CPU->cpu_id != 0)
7999 		return (0);
8000 	if (!first_pass && CPU->cpu_id == 0)
8001 		return (0);
8002 	cpuid_pass_ucode(CPU, fset);
8003 
8004 	return (0);
8005 }
8006 
8007 /*
8008  * After a microcode update where the version has changed, we need to
8009  * rescan CPUID. To do this we check every CPU to make sure that it has the
8010  * same microcode. Then we perform a cross call to all such CPUs. It's the
8011  * caller's job to make sure that no one else can end up doing an update while
8012  * this is going on.
8013  *
8014  * We assume that the system is microcode capable if we're called.
8015  */
8016 void
8017 cpuid_post_ucodeadm(void)
8018 {
8019 	uint32_t rev;
8020 	int i;
8021 	struct cpu *cpu;
8022 	cpuset_t cpuset;
8023 	void *argdata;
8024 	uchar_t *f0;
8025 
8026 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
8027 
8028 	mutex_enter(&cpu_lock);
8029 	cpu = cpu_get(0);
8030 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
8031 	CPUSET_ONLY(cpuset, 0);
8032 	for (i = 1; i < max_ncpus; i++) {
8033 		if ((cpu = cpu_get(i)) == NULL)
8034 			continue;
8035 
8036 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
8037 			panic("post microcode update CPU %d has differing "
8038 			    "microcode revision (%u) from CPU 0 (%u)",
8039 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
8040 		}
8041 		CPUSET_ADD(cpuset, i);
8042 	}
8043 
8044 	/*
8045 	 * We do the cross calls in two passes. The first pass is only for the
8046 	 * boot CPU. The second pass is for all of the other CPUs. This allows
8047 	 * the boot CPU to go through and change behavior related to patching or
8048 	 * whether or not Enhanced IBRS needs to be enabled, and then allows all
8049 	 * other CPUs to follow suit.
8050 	 */
8051 	kpreempt_disable();
8052 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
8053 	    cpuid_post_ucodeadm_xc);
8054 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
8055 	    cpuid_post_ucodeadm_xc);
8056 	kpreempt_enable();
8057 
8058 	/*
8059 	 * OK, now look at each CPU and see if their feature sets are equal.
8060 	 */
8061 	f0 = argdata;
8062 	for (i = 1; i < max_ncpus; i++) {
8063 		uchar_t *fset;
8064 		if (!CPU_IN_SET(cpuset, i))
8065 			continue;
8066 
8067 		fset = (uchar_t *)((uintptr_t)argdata +
8068 		    sizeof (x86_featureset) * i);
8069 
8070 		if (!compare_x86_featureset(f0, fset)) {
8071 			panic("Post microcode update CPU %d has "
8072 			    "differing security feature (%p) set from CPU 0 "
8073 			    "(%p), not appending to feature set", i,
8074 			    (void *)fset, (void *)f0);
8075 		}
8076 	}
8077 
8078 	mutex_exit(&cpu_lock);
8079 
8080 	for (i = 0; i < NUM_X86_FEATURES; i++) {
8081 		if (!is_x86_feature(f0, i))
8082 			continue;
8083 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8084 		    x86_feature_names[i]);
8085 		add_x86_feature(x86_featureset, i);
8086 	}
8087 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8088 }
8089 
8090 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8091 
8092 typedef struct cpuid_pass_def {
8093 	cpuid_pass_t cpd_pass;
8094 	cpuid_pass_f cpd_func;
8095 } cpuid_pass_def_t;
8096 
8097 /*
8098  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8099  * normal sense and should not appear here.
8100  */
8101 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8102 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8103 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
8104 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8105 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8106 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8107 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8108 };
8109 
8110 void
8111 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8112 {
8113 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8114 
8115 	if (cp == NULL)
8116 		cp = CPU;
8117 
8118 	/*
8119 	 * Space is statically allocated for the BSP; ensure the pointer is set.
8120 	 */
8121 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8122 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8123 
8124 	ASSERT(cpuid_checkpass(cp, pass - 1));
8125 
8126 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8127 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8128 			cpuid_pass_defs[i].cpd_func(cp, arg);
8129 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8130 			return;
8131 		}
8132 	}
8133 
8134 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8135 	    pass, cp->cpu_id);
8136 }
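
/*
 * Callers advance a CPU through the passes in the order listed in
 * cpuid_pass_defs[]; the ASSERT above enforces that the prior pass has
 * already completed on that CPU.  A rough, illustrative sketch (the real
 * call sites live in the boot and mp_startup paths and the arg varies by
 * pass):
 *
 *	cpuid_execpass(cp, CPUID_PASS_PRELUDE, featureset);
 *	cpuid_execpass(cp, CPUID_PASS_IDENT, NULL);
 *	cpuid_execpass(cp, CPUID_PASS_BASIC, featureset);
 */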
8137 
8138 /*
8139  * Extract the processor family from a chiprev.  Processor families are not the
8140  * same as cpuid families; see comments above and in x86_archext.h.
8141  */
8142 x86_processor_family_t
8143 chiprev_family(const x86_chiprev_t cr)
8144 {
8145 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8146 }
8147 
8148 /*
8149  * A chiprev matches its template if the vendor and family are identical and the
8150  * revision of the chiprev matches one of the bits set in the template.  Callers
8151  * may bitwise-OR together chiprevs of the same vendor and family to form the
8152  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8153  * multiple vendors or processor families with a single call.  Note that this
8154  * function operates on processor families, not cpuid families.
8155  */
8156 boolean_t
8157 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8158 {
8159 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8160 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8161 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8162 }
8163 
8164 /*
8165  * A chiprev is at least min if the vendor and family are identical and the
8166  * revision of the chiprev is at least as recent as that of min.  Processor
8167  * families are considered unordered and cannot be compared using this function.
8168  * Note that this function operates on processor families, not cpuid families.
8169  * Use of the _ANY chiprev variant with this function is not useful; it will
8170  * always return B_FALSE if the _ANY variant is supplied as the minimum
8171  * revision.  To determine only whether a chiprev is of a given processor
8172  * family, test the return value of chiprev_family() instead.
8173  */
8174 boolean_t
8175 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8176 {
8177 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8178 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8179 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8180 }
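
/*
 * Illustrative usage (the template name below is hypothetical; the real
 * X86_CHIPREV_* constants are defined in x86_archext.h):
 *
 *	if (chiprev_matches(cpuid_getchiprev(CPU), X86_CHIPREV_AMD_ROME_ANY))
 *		... apply a Rome-specific workaround ...
 *
 * chiprev_at_least() is the analogous check for "this revision or newer"
 * within a single vendor and processor family.
 */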
8181 
8182 /*
8183  * The uarch functions operate in a manner similar to the chiprev functions
8184  * above.  While it is tempting to allow these to operate on microarchitectures
8185  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8186  * than ZEN2), we elect not to do so because a manufacturer may supply
8187  * processors of multiple different microarchitecture families, each of which
8188  * may be internally ordered but unordered with respect to the others.
8189  */
8190 x86_uarch_t
8191 uarchrev_uarch(const x86_uarchrev_t ur)
8192 {
8193 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8194 }
8195 
8196 boolean_t
8197 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8198 {
8199 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8200 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8201 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8202 }
8203 
8204 boolean_t
8205 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8206 {
8207 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8208 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8209 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8210 }
8211 
8212 /*
8213  * Topology cache related information. This is yet another cache interface
8214  * that we're exposing, intended to be used when we have either Intel Leaf 4
8215  * or AMD extended leaf 0x1D (introduced with Zen 1).
8216  */
8217 static boolean_t
8218 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8219 {
8220 	switch (cpi->cpi_vendor) {
8221 	case X86_VENDOR_Intel:
8222 		if (cpi->cpi_maxeax >= 4) {
8223 			return (B_TRUE);
8224 		}
8225 		break;
8226 	case X86_VENDOR_AMD:
8227 	case X86_VENDOR_HYGON:
8228 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8229 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8230 			return (B_TRUE);
8231 		}
8232 		break;
8233 	default:
8234 		break;
8235 	}
8236 
8237 	return (B_FALSE);
8238 }
8239 
8240 int
8241 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8242 {
8243 	const struct cpuid_info *cpi;
8244 
8245 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8246 	cpi = cpu->cpu_m.mcpu_cpi;
8247 
8248 	if (!cpuid_cache_topo_sup(cpi)) {
8249 		return (ENOTSUP);
8250 	}
8251 
8252 	*ncache = cpi->cpi_cache_leaf_size;
8253 	return (0);
8254 }
8255 
8256 int
8257 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8258 {
8259 	const struct cpuid_info *cpi;
8260 	const struct cpuid_regs *cp;
8261 
8262 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8263 	cpi = cpu->cpu_m.mcpu_cpi;
8264 
8265 	if (!cpuid_cache_topo_sup(cpi)) {
8266 		return (ENOTSUP);
8267 	}
8268 
8269 	if (cno >= cpi->cpi_cache_leaf_size) {
8270 		return (EINVAL);
8271 	}
8272 
8273 	bzero(cache, sizeof (*cache));
8274 	cp = cpi->cpi_cache_leaves[cno];
8275 	switch (CPI_CACHE_TYPE(cp)) {
8276 	case CPI_CACHE_TYPE_DATA:
8277 		cache->xc_type = X86_CACHE_TYPE_DATA;
8278 		break;
8279 	case CPI_CACHE_TYPE_INSTR:
8280 		cache->xc_type = X86_CACHE_TYPE_INST;
8281 		break;
8282 	case CPI_CACHE_TYPE_UNIFIED:
8283 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8284 		break;
8285 	case CPI_CACHE_TYPE_DONE:
8286 	default:
8287 		return (EINVAL);
8288 	}
8289 	cache->xc_level = CPI_CACHE_LVL(cp);
8290 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8291 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8292 	}
8293 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8294 	/*
8295 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8296 	 * associative, whereas it is considered valid on Intel.
8297 	 */
8298 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8299 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8300 		cache->xc_nsets = 1;
8301 	} else {
8302 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8303 	}
8304 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8305 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8306 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8307 	    cache->xc_line_size;
8308 	/*
8309 	 * We're looking for the number of bits needed to cover the number of
8310 	 * CPUs sharing this cache. Normally we would use the count minus one,
8311 	 * but the CPUID field is already encoded as the actual count minus
8312 	 * one, so we pass it to highbit() unmodified.
8313 	 */
8314 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8315 
8316 	/*
8317 	 * To construct a unique ID we build a uint64_t laid out as
8318 	 * follows:
8319 	 *
8320 	 * [47:40] cache level
8321 	 * [39:32] CPUID cache type
8322 	 * [31:00] shifted APIC ID
8323 	 *
8324 	 * The shifted APIC ID guarantees that a given cache entry is unique
8325 	 * among its peers. The level and type fields ensure that different
8326 	 * caches within a single CPU get distinct IDs: if we had only the
8327 	 * APIC ID shifted over by the indicated number of bits, we'd end up
8328 	 * with an ID of zero for the L1I, L1D, L2, and L3.
8329 	 *
8330 	 * The format of this ID is private to the system and can change across
8331 	 * a reboot for the time being.
8332 	 */
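	/*
	 * For example, two strands sharing an L2 differ only in APIC ID bits
	 * below that cache's xc_apic_shift, so they compute the same xc_id
	 * for it, while a core's L1I, L1D, and L2 are kept distinct from one
	 * another by the level and type bits above bit 31.
	 */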
8333 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8334 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8335 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8336 
8337 	return (0);
8338 }
8339