xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 873fefb258ee57f8d210b4709f2a8d8f034af869)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2025 Oxide Computer Company
28  * Copyright 2024 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports symmetric
54  *    multi-processing (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to provide, in a
62  * programmatic fashion, information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into units called 'leaves'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To query a leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
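 *
 * As an illustration only, here is a minimal userland sketch of issuing the
 * instruction and discovering the maximum leaves. It uses GCC-style inline
 * assembly rather than the kernel's own wrappers, and the helper name
 * my_cpuid() is purely hypothetical:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	// Execute cpuid for the given leaf and sub-leaf and capture all
 *	// four output registers.
 *	static void
 *	my_cpuid(uint32_t leaf, uint32_t subleaf, uint32_t *eax,
 *	    uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 *	{
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		uint32_t eax, ebx, ecx, edx;
 *
 *		// Leaf 0: %eax holds the maximum valid basic leaf.
 *		my_cpuid(0, 0, &eax, &ebx, &ecx, &edx);
 *		printf("max basic leaf: 0x%x\n", eax);
 *
 *		// Leaf 0x80000000: %eax holds the maximum valid extended
 *		// leaf.
 *		my_cpuid(0x80000000U, 0, &eax, &ebx, &ecx, &edx);
 *		printf("max extended leaf: 0x%x\n", eax);
 *		return (0);
 *	}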
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
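 *
 * Reusing the hypothetical my_cpuid() helper and register variables from the
 * sketch above, walking the sub-leaves of topology leaf 0xb might look
 * something like this (a sketch, assuming the documented leaf 0xb layout:
 * level type in %ecx bits 15:8, CPU count in %ebx bits 15:0, and the APIC ID
 * shift in %eax bits 4:0):
 *
 *	// Walk leaf 0xb sub-leaves until an invalid level (type 0) is hit.
 *	for (uint32_t level = 0; ; level++) {
 *		my_cpuid(0xb, level, &eax, &ebx, &ecx, &edx);
 *		if (((ecx >> 8) & 0xff) == 0)
 *			break;
 *		printf("level %u: %u logical CPUs, APIC ID shift %u\n",
 *		    level, ebx & 0xffff, eax & 0x1f);
 *	}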
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of each register are always set to zero so that the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12-character string across %ebx, %edx, and %ecx (in that order).
141  * On AMD it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
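 *
 * For illustration, assembling that string with the hypothetical my_cpuid()
 * helper from the earlier sketch (plus <string.h>) could look like this; the
 * twelve bytes are laid out across %ebx, %edx, and %ecx, in that order:
 *
 *	char vendor[13];
 *
 *	my_cpuid(0, 0, &eax, &ebx, &ecx, &edx);
 *	(void) memcpy(&vendor[0], &ebx, sizeof (ebx));
 *	(void) memcpy(&vendor[4], &edx, sizeof (edx));
 *	(void) memcpy(&vendor[8], &ecx, sizeof (ecx));
 *	vendor[12] = '\0';
 *	printf("vendor: %s\n", vendor);	// e.g. GenuineIntel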
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
161  * family, the model number is used to identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. When the base family is
179  * 0xf, the extended family field (made up of previously reserved bits) is
180  * added to it; the extended model is likewise combined with the base model
181  * (when the base family is 0x6 or 0xf) to allow for a larger number of models.
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
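 *
 * As a sketch of the arithmetic only (the authoritative, per-vendor handling
 * lives in the code below; the helper names here are hypothetical), the
 * common Intel/AMD convention for combining the fields of leaf 1, %eax is:
 *
 *	// Base family is bits 11:8, extended family bits 27:20.
 *	static uint32_t
 *	full_family(uint32_t eax)
 *	{
 *		uint32_t family = (eax >> 8) & 0xf;
 *
 *		if (family == 0xf)
 *			family += (eax >> 20) & 0xff;
 *		return (family);
 *	}
 *
 *	// Base model is bits 7:4, extended model bits 19:16. The extended
 *	// model only applies when the base family is 0x6 or 0xf.
 *	static uint32_t
 *	full_model(uint32_t eax)
 *	{
 *		uint32_t family = (eax >> 8) & 0xf;
 *		uint32_t model = (eax >> 4) & 0xf;
 *
 *		if (family == 0x6 || family == 0xf)
 *			model += ((eax >> 16) & 0xf) << 4;
 *		return (model);
 *	}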
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up are when we have to work around errata in
198  * the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-dependent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determining the
292  *			appropriate set of workarounds.
293  *
294  *			This pass on the boot CPU occurs before KMDB is started.
295  *
296  *	EXTENDED	This pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		This pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
326  *
327  * ---------
328  * Microcode
329  * ---------
330  *
331  * Microcode updates may be applied by the firmware (BIOS/UEFI) and/or by the
332  * operating system and may result in architecturally visible changes (e.g.,
333  * changed MSR or CPUID bits). As such, we want to apply any updates as early
334  * as possible during the boot process -- right after the IDENT pass.
335  *
336  * Microcode may also be updated at runtime via ucodeadm(8), after which we do
337  * a selective rescan of the cpuid leaves to determine what features have
338  * changed. Microcode updates can provide more details about security related
339  * features to deal with issues like Spectre and L1TF. On occasion, vendors have
340  * violated their contract and removed bits. However, we don't try to detect
341  * that because that puts us in a situation that we really can't deal with. As
342  * such, the only thing we rescan are security related features today. See
343  * cpuid_pass_ucode(). This is not a pass in the same sense as the others and
344  * is run on demand, via cpuid_post_ucodeadm().
345  *
346  *
347  * All of the passes are run on all CPUs. However, for the most part we only
348  * care about what the boot CPU says about this information and use the other
349  * CPUs as a rough guide to sanity check that we have the same feature set.
350  *
351  * We do not support running multiple logical CPUs with different, let alone
352  * disjoint, feature sets.
353  *
354  * ------------------
355  * Processor Topology
356  * ------------------
357  *
358  * One of the important things that we need to do is to understand the topology
359  * of the underlying processor. When we say topology in this case, we're trying
360  * to understand the relationship between the logical CPUs that the operating
361  * system sees and the underlying physical layout. Different logical CPUs may
362  * share different resources which can have important consequences for the
363  * performance of the system. For example, they may share caches, execution
364  * units, and more.
365  *
366  * The topology of the processor changes from generation to generation and
367  * vendor to vendor.  Along with that, different vendors use different
368  * terminology, and the operating system itself uses occasionally overlapping
369  * terminology. It's important to understand what this topology looks like so
370  * one can understand the different things that we try to calculate and
371  * determine.
372  *
373  * To get started, let's talk about a little bit of terminology that we've used
374  * so far, is used throughout this file, and is fairly generic across multiple
375  * vendors:
376  *
377  * CPU
378  *	A central processing unit (CPU) refers to a logical and/or virtual
379  *	entity that the operating system can execute instructions on. The
380  *	underlying resources for this CPU may be shared between multiple
381  *	entities; however, to the operating system it is a discrete unit.
382  *
383  * PROCESSOR and PACKAGE
384  *
385  *	Generally, when we use the term 'processor' on its own, we are referring
386  *	to the physical entity that one buys and plugs into a board. However,
387  *	because processor has been overloaded and one might see it used to mean
388  *	multiple different levels, we will instead use the term 'package' for
389  *	the rest of this file. The term package comes from the electrical
390  *	engineering side and refers to the physical entity that encloses the
391  *	electronics inside. Strictly speaking the package can contain more than
392  *	just the CPU, for example, on many processors it may also have what's
393  *	called an 'integrated graphical processing unit (GPU)'. Because the
394  *	package can encapsulate multiple units, it is the largest physical unit
395  *	that we refer to.
396  *
397  * SOCKET
398  *
399  *	A socket refers to a unit on a system board (generally the motherboard)
400  *	that can receive a package. A single package, or processor, is plugged
401  *	into a single socket. A system may have multiple sockets. Oftentimes,
402  *	the term socket is used interchangeably with package and refers to the
403  *	electrical component that has been plugged in, and not the receptacle itself.
404  *
405  * CORE
406  *
407  *	A core refers to the physical instantiation of a CPU, generally, with a
408  *	full set of hardware resources available to it. A package may contain
409  *	multiple cores inside of it or it may just have a single one. A
410  *	processor with more than one core is often referred to as 'multi-core'.
411  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
412  *	that has 'multi-core' processors.
413  *
414  *	A core may expose a single logical CPU to the operating system, or it
415  *	may expose multiple CPUs, which we call threads, defined below.
416  *
417  *	Some resources may still be shared by cores in the same package. For
418  *	example, many processors will share the level 3 cache between cores.
419  *	Some AMD generations share hardware resources between cores. For more
420  *	information on that see the section 'AMD Topology'.
421  *
422  * THREAD and STRAND
423  *
424  *	In this file, generally a thread refers to a hardware resource and not
425  *	the operating system's logical abstraction. A thread is always exposed
426  *	as an independent logical CPU to the operating system. A thread belongs
427  *	to a specific core. A core may have more than one thread. When that is
428  *	the case, the threads that are part of the same core are often referred
429  *	to as 'siblings'.
430  *
431  *	When multiple threads exist, this is generally referred to as
432  *	simultaneous multi-threading (SMT). When Intel introduced this in their
433  *	processors they called it hyper-threading (HT). When multiple threads
434  *	are active in a core, they split the resources of the core. For example,
435  *	two threads may share the same set of hardware execution units.
436  *
437  *	The operating system often uses the term 'strand' to refer to a thread.
438  *	This helps disambiguate it from the software concept.
439  *
440  * CHIP
441  *
442  *	Unfortunately, the term 'chip' is dramatically overloaded. In its most
443  *	basic meaning, it is used to refer to a single integrated circuit, which
444  *	may or may not be the only thing in the package. In illumos, when you
445  *	see the term 'chip' it is almost always referring to the same thing as
446  *	the 'package'. However, many vendors may use chip to refer to one of
447  *	many integrated circuits that have been placed in the package. As an
448  *	example, see the subsequent definition.
449  *
450  *	To try and keep things consistent, we will only use chip when referring
451  *	to the entire integrated circuit package, with the exception of the
452  *	definition of multi-chip module (because it is in the name) and use the
453  *	term 'die' when we want the more general, potential sub-component
454  *	definition.
455  *
456  * DIE
457  *
458  *	A die refers to an integrated circuit. Inside of the package there may
459  *	be a single die or multiple dies. This is sometimes called a 'chip' in
460  *	vendors' parlance, but in this file, we use the term die to refer to a
461  *	subcomponent.
462  *
463  * MULTI-CHIP MODULE
464  *
465  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
466  *	are connected together in the same package. When a multi-chip design is
467  *	used, generally each chip is manufactured independently and then joined
468  *	together in the package. For example, on AMD's Zen microarchitecture
469  *	(family 0x17), the package contains several dies (the second meaning of
470  *	chip from above) that are connected together.
471  *
472  * CACHE
473  *
474  *	A cache is a part of the processor that maintains copies of recently
475  *	accessed memory. Caches are split into levels and then into types.
476  *	Commonly there are one to three levels, called level one, two, and
477  *	three. The lower the level, the smaller it is, the closer it is to the
478  *	execution units of the CPU, and the faster it is to access. The layout
479  *	and design of the cache come in many different flavors, consult other
480  *	resources for a discussion of those.
481  *
482  *	Caches are generally split into two types, the instruction and data
483  *	cache. The caches contain what their names suggest, the instruction
484  *	cache has executable program text, while the data cache has all other
485  *	memory that the processor accesses. As of this writing, data is kept
486  *	coherent between all of the caches on x86, so if one modifies program
487  *	text before it is executed, that will be in the data cache, and the
488  *	instruction cache will be synchronized with that change when the
489  *	processor actually executes those instructions. This coherency also
490  *	covers the fact that data could show up in multiple caches.
491  *
492  *	Generally, the lowest level caches are specific to a core. However, the
493  *	last level cache is shared between some number of cores. The number of
494  *	CPUs sharing this last level cache is important. This has implications
495  *	for the choices that the scheduler makes, as accessing memory that might
496  *	be in a remote cache after thread migration can be quite expensive.
497  *
498  *	Sometimes, the word cache is abbreviated with a '$', because in US
499  *	English the word cache is pronounced the same as cash. So L1D$ refers to
500  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
501  *	in the rest of this theory statement for clarity.
502  *
503  * MEMORY CONTROLLER
504  *
505  *	The memory controller is a component that provides access to DRAM. Each
506  *	memory controller can access a set number of DRAM channels. Each channel
507  *	can have a number of DIMMs (sticks of memory) associated with it. A
508  *	given package may have more than one memory controller. The association
509  *	of the memory controller to a group of cores is important as it is
510  *	cheaper to access memory on the controller that you are associated with.
511  *
512  * NUMA
513  *
514  *	NUMA, or non-uniform memory access, describes a way that systems are
515  *	built. On x86, any processor core can address all of the memory in the
516  *	system. However, when using multiple sockets or possibly within a
517  *	multi-chip module, some of that memory is physically closer and some of
518  *	it is further. Memory that is further away is more expensive to access.
519  *	Consider the following image of multiple sockets with memory:
520  *
521  *	+--------+                                                +--------+
522  *	| DIMM A |         +----------+      +----------+         | DIMM D |
523  *	+--------+-+       |          |      |          |       +-+------+-+
524  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
525  *	  +--------+-+     |          |      |          |     +-+------+-+
526  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
527  *	    +--------+                                        +--------+
528  *
529  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
530  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
531  *	access DIMMs A-C and more expensive to access D-F as it has to go
532  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
533  *	D-F are cheaper than A-C. While the socket form is the most common, when
534  *	using multi-chip modules, this can also sometimes occur. For another
535  *	example of this that's more involved, see the AMD topology section.
536  *
537  *
538  * Intel Topology
539  * --------------
540  *
541  * Most Intel processors since Nehalem (as of this writing the current gen
542  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
543  * the package is a single monolithic die. MCMs currently aren't used. Most
544  * parts have three levels of caches, with the L3 cache being shared between
545  * all of the cores on the package. The L1/L2 cache is generally specific to
546  * an individual core. The following image shows at a simplified level what
547  * this looks like. The memory controller is commonly part of something called
548  * the 'Uncore', which used to be a separate physical chip that was not a part
549  * of the package, but is now part of the same chip.
550  *
551  *  +-----------------------------------------------------------------------+
552  *  | Package                                                               |
553  *  |  +-------------------+  +-------------------+  +-------------------+  |
554  *  |  | Core              |  | Core              |  | Core              |  |
555  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
556  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
557  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
558  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
559  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
560  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
561  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
562  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
563  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
564  *  |  +-------------------+  +-------------------+  +-------------------+  |
565  *  | +-------------------------------------------------------------------+ |
566  *  | |                         Shared L3 Cache                           | |
567  *  | +-------------------------------------------------------------------+ |
568  *  | +-------------------------------------------------------------------+ |
569  *  | |                        Memory Controller                          | |
570  *  | +-------------------------------------------------------------------+ |
571  *  +-----------------------------------------------------------------------+
572  *
573  * A side effect of this current architecture is that what we care about from a
574  * scheduling and topology perspective is simplified. In general, we care about
575  * understanding which logical CPUs are part of the same core and socket.
576  *
577  * To determine the relationship between threads and cores, Intel initially used
578  * the identifier in the advanced programmable interrupt controller (APIC). They
579  * also added cpuid leaf 4 to give additional information about the number of
580  * threads and CPUs in the processor. With the addition of x2apic (which
581  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
582  * additional cpuid topology leaf 0xB was added.
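 *
 * As a hedged sketch (reusing the hypothetical my_cpuid() helper from the
 * sketch earlier in this comment), level 0 of leaf 0xb reports how many low
 * bits of the x2APIC ID select the thread within a core, which is enough to
 * group SMT siblings:
 *
 *	uint32_t smt_shift, apicid, coreid;
 *
 *	// Sub-leaf 0 is the SMT level; %eax bits 4:0 give the shift and
 *	// %edx gives this CPU's full 32-bit x2APIC ID.
 *	my_cpuid(0xb, 0, &eax, &ebx, &ecx, &edx);
 *	smt_shift = eax & 0x1f;
 *	apicid = edx;
 *	coreid = apicid >> smt_shift;	// identical for SMT siblings
 *	printf("apicid %u is part of core id %u\n", apicid, coreid);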
583  *
584  * AMD Topology
585  * ------------
586  *
587  * When discussing AMD topology, we want to break this into three distinct
588  * generations of topology. There's the basic topology that has been used in
589  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
590  * with family 0x15 (Bulldozer), and there's the topology that was introduced
591  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
592  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
593  * additional terminology that's worth talking about.
594  *
595  * Until the introduction of family 0x17 (Zen), AMD did not implement something
596  * that they considered SMT. Whether or not the AMD processors have SMT
597  * influences many things including scheduling and reliability, availability,
598  * and serviceability (RAS) features.
599  *
600  * NODE
601  *
602  *	AMD uses the term node to refer to a die that contains a number of cores
603  *	and I/O resources. Depending on the processor family and model, more
604  *	than one node can be present in the package. When there is more than one
605  *	node this indicates a multi-chip module. Usually each node has its own
606  *	access to memory and I/O devices. This is important and generally
607  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
608  *	result, we track this relationship in the operating system.
609  *
610  *	In processors with an L3 cache, the L3 cache is generally shared across
611  *	the entire node, though the way this is carved up varies from generation
612  *	to generation.
613  *
614  * BULLDOZER
615  *
616  *	Starting with the Bulldozer family (0x15) and continuing until the
617  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
618  *	compute unit. In a compute unit, two traditional cores share a number of
619  *	hardware resources. Critically, they share the FPU, L1 instruction
620  *	cache, and the L2 cache. Several compute units were then combined inside
621  *	of a single node.  Because the integer execution units, L1 data cache,
622  *	and some other resources were not shared between the cores, AMD never
623  *	considered this to be SMT.
624  *
625  * ZEN
626  *
627  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
628  *	is called Zeppelin. These modules are similar to the idea of nodes used
629  *	previously. Each of these nodes has two DRAM channels which all of the
630  *	cores in the node can access uniformly. These nodes are linked together
631  *	in the package, creating a NUMA environment.
632  *
633  *	The Zeppelin die itself contains two different 'core complexes'. Each
634  *	core complex consists of four cores which each have two threads, for a
635  *	total of 8 logical CPUs per complex. Unlike other generations,
636  *	where all the logical CPUs in a given node share the L3 cache, here each
637  *	core complex has its own shared L3 cache.
638  *
639  *	A further thing that we need to consider is that in some configurations,
640  *	particularly with the Threadripper line of processors, not every die
641  *	actually has its memory controllers wired up to actual memory channels.
642  *	This means that some cores have memory attached to them and others
643  *	don't.
644  *
645  *	To put Zen in perspective, consider the following images:
646  *
647  *      +--------------------------------------------------------+
648  *      | Core Complex                                           |
649  *      | +-------------------+    +-------------------+  +---+  |
650  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
651  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
652  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
653  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
654  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
655  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
656  *      | +-------------------+    +-------------------+  | C |  |
657  *      | +-------------------+    +-------------------+  | a |  |
658  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
659  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
660  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
661  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
662  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
663  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
664  *      | +-------------------+    +-------------------+  +---+  |
665  *      |                                                        |
666  *	+--------------------------------------------------------+
667  *
668  *  This first image represents a single Zen core complex that consists of four
669  *  cores.
670  *
671  *
672  *	+--------------------------------------------------------+
673  *	| Zeppelin Die                                           |
674  *	|  +--------------------------------------------------+  |
675  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
676  *	|  +--------------------------------------------------+  |
677  *      |                           HH                           |
678  *	|          +-----------+    HH    +-----------+          |
679  *	|          |           |    HH    |           |          |
680  *	|          |    Core   |==========|    Core   |          |
681  *	|          |  Complex  |==========|  Complex  |          |
682  *	|          |           |    HH    |           |          |
683  *	|          +-----------+    HH    +-----------+          |
684  *      |                           HH                           |
685  *	|  +--------------------------------------------------+  |
686  *	|  |                Memory Controller                 |  |
687  *	|  +--------------------------------------------------+  |
688  *      |                                                        |
689  *	+--------------------------------------------------------+
690  *
691  *  This image represents a single Zeppelin Die. Note how both core complexes
692  *  are connected to the same memory controller and I/O units. While each core
693  *  complex has its own L3 cache as seen in the first image, they both have
694  *  uniform access to memory.
695  *
696  *
697  *                      PP                     PP
698  *                      PP                     PP
699  *           +----------PP---------------------PP---------+
700  *           |          PP                     PP         |
701  *           |    +-----------+          +-----------+    |
702  *           |    |           |          |           |    |
703  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
704  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
705  *           |    |           |          |           |    |
706  *           |    +-----------+ooo    ...+-----------+    |
707  *           |          HH      ooo  ...       HH         |
708  *           |          HH        oo..         HH         |
709  *           |          HH        ..oo         HH         |
710  *           |          HH      ...  ooo       HH         |
711  *           |    +-----------+...    ooo+-----------+    |
712  *           |    |           |          |           |    |
713  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
714  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
715  *           |    |           |          |           |    |
716  *           |    +-----------+          +-----------+    |
717  *           |          PP                     PP         |
718  *           +----------PP---------------------PP---------+
719  *                      PP                     PP
720  *                      PP                     PP
721  *
722  *  This image represents a single Zen package. In this example, it has four
723  *  Zeppelin dies, though some configurations only have a single one. In this
724  *  example, each die is directly connected to the next. Also, each die is
725  *  represented as being connected to memory by the 'M' character and connected
726  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
727  *  die is made up of two core complexes, we have multiple different NUMA
728  *  domains that we care about for these systems.
729  *
730  * ZEN 2
731  *
732  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
733  *	each Zeppelin die integrated its own memory and I/O, that has moved out
734  *	to a separate I/O die in Zen 2. The core complex looks pretty similar, but
735  *	now the die actually looks much simpler:
736  *
737  *      +--------------------------------------------------------+
738  *      | Zen 2 Core Complex Die    HH                           |
739  *      |                           HH                           |
740  *      |          +-----------+    HH    +-----------+          |
741  *      |          |           |    HH    |           |          |
742  *      |          |    Core   |==========|    Core   |          |
743  *      |          |  Complex  |==========|  Complex  |          |
744  *      |          |           |    HH    |           |          |
745  *      |          +-----------+    HH    +-----------+          |
746  *      |                           HH                           |
747  *      |                           HH                           |
748  *      +--------------------------------------------------------+
749  *
750  *	From here, when we add the central I/O die, this changes things a bit.
751  *	Each die is connected to the I/O die, rather than trying to interconnect
752  *	them directly. The following image takes the same Zen 1 image that we
753  *	had earlier and shows what it looks like with the I/O die instead:
754  *
755  *                                 PP    PP
756  *                                 PP    PP
757  *           +---------------------PP----PP---------------------+
758  *           |                     PP    PP                     |
759  *           |  +-----------+      PP    PP      +-----------+  |
760  *           |  |           |      PP    PP      |           |  |
761  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
762  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
763  *           |  |         |o|oooo|          |oooo|o|         |  |
764  *           |  +-----------+    |          |    +-----------+  |
765  *           |                   |   I/O    |                   |
766  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
767  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
768  *           |                   |          |                   |
769  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
770  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
771  *           |                   |          |                   |
772  *           |  +-----------+    |          |    +-----------+  |
773  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
774  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
775  *           |  |    Die    |      PP    PP      |    Die    |  |
776  *           |  |           |      PP    PP      |           |  |
777  *           |  +-----------+      PP    PP      +-----------+  |
778  *           |                     PP    PP                     |
779  *           +---------------------PP----PP---------------------+
780  *                                 PP    PP
781  *                                 PP    PP
782  *
783  *	The above has four core complex dies installed, though the Zen 2 EPYC
784  *	and Threadripper parts allow for up to eight, while the Ryzen parts
785  *	generally only have one to two. The more notable difference here is how
786  *	everything communicates. Note that memory and PCIe come out of the
787  *	central die. This changes the way that one die accesses a resource. It
788  *	basically always has to go to the I/O die, whereas in Zen 1 the access may
789  *	have been satisfied locally. In general, this ends up being a better strategy
790  *	for most things, though it is still possible to treat everything as four
791  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
792  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
793  *	now there is only one 'node' present.
794  *
795  * ZEN 3
796  *
797  *	From an architectural perspective, Zen 3 is a much smaller change from
798  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
799  *	its microarchitectural changes. The biggest thing for us is how the die
800  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
801  *	cache. However, in Zen 3, the L3 is now shared between the entire core
802  *	complex die and is no longer partitioned between each core complex. This
803  *	means that all cores on the die can share the same L3 cache. Otherwise,
804  *	the general layout of the overall package with various core complexes
805  *	and an I/O die stays the same. Here's what the Core Complex Die looks
806  *	like in a bit more detail:
807  *
808  *               +-------------------------------------------------+
809  *               | Zen 3 Core Complex Die                          |
810  *               | +-------------------+    +-------------------+  |
811  *               | | Core       +----+ |    | Core       +----+ |  |
812  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
813  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
814  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
815  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
816  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
817  *               | +-------------------+    +-------------------+  |
818  *               | +-------------------+    +-------------------+  |
819  *               | | Core       +----+ |    | Core       +----+ |  |
820  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
821  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
822  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
823  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
824  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
825  *               | +-------------------+    +-------------------+  |
826  *               |                                                 |
827  *               | +--------------------------------------------+  |
828  *               | |                 L3 Cache                   |  |
829  *               | +--------------------------------------------+  |
830  *               |                                                 |
831  *               | +-------------------+    +-------------------+  |
832  *               | | Core       +----+ |    | Core       +----+ |  |
833  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
834  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
835  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
836  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
837  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
838  *               | +-------------------+    +-------------------+  |
839  *               | +-------------------+    +-------------------+  |
840  *               | | Core       +----+ |    | Core       +----+ |  |
841  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
842  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
843  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
844  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
845  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
846  *               | +-------------------+    +-------------------+  |
847  *               +-------------------------------------------------+
848  *
849  *	While it is not pictured, there are connections from the die to the
850  *	broader data fabric and additional functional blocks to support that
851  *	communication and coherency.
852  *
853  * CPUID LEAVES
854  *
855  * There are a few different CPUID leaves that we can use to try and understand
856  * the actual state of the world. As part of the introduction of family 0xf, AMD
857  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
858  * processors that are in the package. Because families before Zen didn't have
859  * SMT, this was always the number of cores in the package. However, it
860  * should always be thought of as the number of logical threads to be consistent
861  * between generations. In addition we also get the size of the APIC ID that is
862  * used to represent the number of logical processors. This is important for
863  * deriving topology information.
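 *
 * For illustration (again with the hypothetical my_cpuid() helper, and
 * assuming the documented %ecx layout of this leaf), decoding it might look
 * like:
 *
 *	uint32_t nthreads, apicid_size;
 *
 *	my_cpuid(0x80000008U, 0, &eax, &ebx, &ecx, &edx);
 *	nthreads = (ecx & 0xff) + 1;		// logical CPUs in the package
 *	apicid_size = (ecx >> 12) & 0xf;	// 0 means it must be derived
 *	printf("%u logical CPUs, APIC ID size field %u\n", nthreads,
 *	    apicid_size);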
864  *
865  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
866  * bit between Bulldozer and later families, but it is quite useful in
867  * determining the topology information. Because this information has changed
868  * across family generations, it's worth calling out what these mean
869  * explicitly. The registers have the following meanings:
870  *
871  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
872  *		APIC ID, even though on systems without x2apic support, it will
873  *		be limited to 8 bits.
874  *
875  *	%ebx	On Bulldozer-era systems this contains information about the
876  *		number of cores that are in a compute unit (cores that share
877  *		resources). It also contains a per-package compute unit ID that
878  *		identifies which compute unit the logical CPU is a part of.
879  *
880  *		On Zen-era systems this instead contains the number of threads
881  *		per core and the ID of the core that the logical CPU is a part
882  *		of. Note, this ID is unique only to the package, it is not
883  *		globally unique across the entire system.
884  *
885  *	%ecx	This contains the number of nodes that exist in the package. It
886  *		also contains an ID that identifies which node the logical CPU
887  *		is a part of.
888  *
889  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
890  * cache layout to determine which logical CPUs are sharing which caches.
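 *
 * For illustration, a hedged sketch of decoding leaf 0x8000001E on a Zen-era
 * part follows (field layouts differ on Bulldozer and the exact widths should
 * be checked against the relevant PPR; my_cpuid() is the hypothetical helper
 * from earlier in this comment):
 *
 *	uint32_t coreid, threads_per_core, nodeid, nodes_per_pkg;
 *
 *	my_cpuid(0x8000001eU, 0, &eax, &ebx, &ecx, &edx);
 *	coreid = ebx & 0xff;				// per-package core ID
 *	threads_per_core = ((ebx >> 8) & 0xff) + 1;
 *	nodeid = ecx & 0xff;
 *	nodes_per_pkg = ((ecx >> 8) & 0x7) + 1;
 *	printf("apic %u: core %u (%u threads/core), node %u of %u\n",
 *	    eax, coreid, threads_per_core, nodeid, nodes_per_pkg);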
891  *
892  * illumos Topology
893  * ----------------
894  *
895  * Based on the above we synthesize the information into several different
896  * variables that we store in the 'struct cpuid_info'. We'll go into the details
897  * of what each member is supposed to represent and their uniqueness. In
898  * general, there are two levels of uniqueness that we care about. We care about
899  * an ID that is globally unique. That means that it will be unique across all
900  * entities in the system. For example, the default logical CPU ID is globally
901  * unique. On the other hand, there is some information that we only care about
902  * being unique within the context of a single package / socket. Here are the
903  * variables that we keep track of and their meaning.
904  *
905  * Several of the values that represent an identifier, with the exception
906  * of cpi_apicid, are allowed to be synthetic.
907  *
908  *
909  * cpi_apicid
910  *
911  *	This is the value of the CPU's APIC id. This should be the full 32-bit
912  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
913  *	APIC ID. This value is globally unique between all logical CPUs across
914  *	all packages. This is usually required by the APIC.
915  *
916  * cpi_chipid
917  *
918  *	This value indicates the ID of the package that the logical CPU is a
919  *	part of. This value is allowed to be synthetic. It is usually derived by
920  *	taking the CPU's APIC ID and determining how many bits are used to
921  *	represent CPU cores in the package. All logical CPUs that are part of
922  *	the same package must have the same value.
923  *
924  * cpi_coreid
925  *
926  *	This represents the ID of a CPU core. Two logical CPUs should only have
927  *	the same cpi_coreid value if they are part of the same core. These
928  *	values may be synthetic. On systems that support SMT, this value is
929  *	usually derived from the APIC ID, otherwise it is often synthetic and
930  *	just set to the value of the cpu_id in the cpu_t.
931  *
932  * cpi_pkgcoreid
933  *
934  *	This is similar to the cpi_coreid in that logical CPUs that are part of
935  *	the same core should have the same ID. The main difference is that these
936  *	values are only required to be unique to a given socket.
937  *
938  * cpi_clogid
939  *
940  *	This represents the logical ID of a logical CPU. This value should be
941  *	unique within a given socket for each logical CPU. This is allowed to be
942  *	synthetic, though it is usually based off of the CPU's apic ID. The
943  *	broader system expects that logical CPUs that are part of the same core
944  *	have contiguous numbers. For example, with two threads per core, the
945  *	logical IDs of two siblings divided by two should be the same, with the
946  *	first's ID modulo two being zero and the second's being one. IDs 4 and 5
947  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
948  *	6 represent two logical CPUs that are part of different cores.
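 *
 *	As a tiny illustrative helper (hypothetical, not part of this file),
 *	the contiguity rule above means that with two threads per core two
 *	logical CPUs are siblings exactly when their logical IDs agree once
 *	the low, thread-selecting bit is dropped:
 *
 *		static int
 *		clogid_siblings_2t(uint32_t a, uint32_t b)
 *		{
 *			return ((a >> 1) == (b >> 1));
 *		}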
949  *
950  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
951  *	from the same source, strictly speaking, they don't have to be and the
952  *	two values should be considered logically independent. One should not
953  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
954  *	some kind of relationship. While this is tempting, we've seen cases on
955  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
956  *
957  * cpi_ncpu_per_chip
958  *
959  *	This value indicates the total number of logical CPUs that exist in the
960  *	physical package. Critically, this is not the number of logical CPUs
961  *	that exist for just the single core.
962  *
963  *	This value should be the same for all logical CPUs in the same package.
964  *
965  * cpi_ncore_per_chip
966  *
967  *	This value indicates the total number of physical CPU cores that exist
968  *	in the package. The system compares this value with cpi_ncpu_per_chip to
969  *	determine if simultaneous multi-threading (SMT) is enabled. When
970  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
971  *	the X86FSET_HTT feature is not set. If this value is greater than one,
972  *	then we consider the processor to have the feature X86FSET_CMP, to
973  *	indicate that there is support for more than one core.
974  *
975  *	This value should be the same for all logical CPUs in the same package.
976  *
977  * cpi_procnodes_per_pkg
978  *
979  *	This value indicates the number of 'nodes' that exist in the package.
980  *	When the processor is actually a multi-chip module, this represents the
981  *	number of such modules that exist in the package. Currently, on Intel
982  *	based systems this member is always set to 1.
983  *
984  *	This value should be the same for all logical CPUs in the same package.
985  *
986  * cpi_procnodeid
987  *
988  *	This value indicates the ID of the node that the logical CPU is a part
989  *	of. All logical CPUs that are in the same node must have the same value
990  *	here. This value must be unique across all of the packages in the
991  *	system.  On Intel based systems, this is currently set to the value in
992  *	cpi_chipid because there is only one node.
993  *
994  * cpi_cores_per_compunit
995  *
996  *	This value indicates the number of cores that are part of a compute
997  *	unit. See the AMD topology section for this. This member only has real
998  *	meaning currently for AMD Bulldozer family processors. For all other
999  *	processors, this should currently be set to 1.
1000  *
1001  * cpi_compunitid
1002  *
1003  *	This indicates the compute unit that the logical CPU belongs to. For
1004  *	processors without AMD Bulldozer-style compute units this should be set
1005  *	to the value of cpi_coreid.
1006  *
1007  * cpi_ncpu_shr_last_cache
1008  *
1009  *	This indicates the number of logical CPUs that are sharing the same last
1010  *	level cache. This value should be the same for all CPUs that are sharing
1011  *	that cache. The last cache refers to the cache that is closest to memory
1012  *	and furthest away from the CPU.
1013  *
1014  * cpi_last_lvl_cacheid
1015  *
1016  *	This indicates the ID of the last cache that the logical CPU uses. This
1017  *	cache is often shared between multiple logical CPUs and is the cache
1018  *	that is closest to memory and furthest away from the CPU. This value
1019  *	should be the same for a group of logical CPUs only if they actually
1020  *	share the same last level cache. IDs should not overlap between
1021  *	packages.
1022  *
1023  * cpi_ncore_bits
1024  *
1025  *	This indicates the number of bits that are required to represent all of
1026  *	the cores in the system. As cores are derived based on their APIC IDs,
1027  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1028  *	this value to be larger than the actual number of IDs that are present
1029  *	in the system. This is used to size tables by the CMI framework. It is
1030  *	only filled in for Intel and AMD CPUs.
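 *
 *	For example, a package enumerating 12 cores would typically report a
 *	cpi_ncore_bits value of at least 4, since four bits are needed to
 *	cover core IDs 0 through 11.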
1031  *
1032  * cpi_nthread_bits
1033  *
1034  *	This indicates the number of bits required to represent all of the IDs
1035  *	that cover the logical CPUs that exist on a given core. It's OK for this
1036  *	value to be larger than the actual number of IDs that are present in the
1037  *	system.  This is used to size tables by the CMI framework. It is
1038  *	only filled in for Intel and AMD CPUs.
1039  *
1040  * -----------
1041  * Hypervisors
1042  * -----------
1043  *
1044  * If trying to manage the differences between vendors wasn't bad enough, it can
1045  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1046  * the ability to interpose on all cpuid instructions and change them to suit
1047  * their purposes. In general, this is necessary as the hypervisor may want to
1048  * present a more uniform set of features, or withhold knowledge of some
1049  * features from the guest operating system kernel, so that guests can be
1050  * more easily migrated between systems.
1051  *
1052  * When it comes to trying to determine topology information, this can be a
1053  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1054  * leaf, it'll often return all zeros. Because of that, you'll often see various
1055  * checks scattered about that verify fields are non-zero before we assume we
1056  * can use them.
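 *
 * As a hedged illustration of the kind of guard this produces (not any one
 * specific check in this file):
 *
 *	if (cpi->cpi_std[4].cp_eax != 0)
 *		... leaf 4 appears to be implemented, so use it ...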
1057  *
1058  * When it comes to topology information, the hypervisor is often incentivized
1059  * to lie to you about topology. This is because it doesn't always actually
1060  * guarantee that topology at all. The topology path we take in the system
1061  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1062  * or AMD CPU, then we basically do our normal path. However, when they don't
1063  * advertise an actual vendor, we usually end up enumerating multiple one-core
1064  * CPUs that are often on different sockets. The actual behavior
1065  * depends greatly on what the hypervisor actually exposes to us.
1066  *
1067  * --------------------
1068  * Exposing Information
1069  * --------------------
1070  *
1071  * We expose CPUID information in three different forms in the system.
1072  *
1073  * The first is through the x86_featureset variable. This is used in conjunction
1074  * with the is_x86_feature() function. This is queried by x86-specific functions
1075  * to determine which features are or aren't present in the system and to make
1076  * decisions based upon them. For example, users of this include everything from
1077  * parts of the system dedicated to reliability, availability, and
1078  * serviceability (RAS), to making decisions about how to handle security
1079  * mitigations, to various x86-specific drivers. General purpose or
1080  * architecture independent drivers should never be calling this function.
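 *
 * As an illustrative example (not a specific call site), an x86-specific
 * consumer might guard a code path with:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SMEP))
 *		... take the SMEP-dependent path ...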
1081  *
1082  * The second means is through the auxiliary vector. The auxiliary vector is a
1083  * series of tagged data that the kernel passes down to a user program when it
1084  * begins executing. This information is used to indicate to programs what
1085  * instruction set extensions are present. For example, information about the
1086  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1087  * since user programs cannot make use of it. However, things like the AVX
1088  * instruction sets are. Programs use this information to make run-time
1089  * decisions about what features they should use. As an example, the run-time
1090  * link-editor (rtld) can relocate different functions depending on the hardware
1091  * support available.
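 *
 * For example, a userland consumer can test these bits via getisax(3C)
 * (an illustrative sketch, not a specific program in the gate):
 *
 *	uint32_t ui;
 *
 *	(void) getisax(&ui, 1);
 *	if (ui & AV_386_AVX)
 *		... select the AVX implementation ...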
1092  *
1093  * The final form is through a series of accessor functions that all have the
1094  * form cpuid_get*. This is used by a number of different subsystems in the
1095  * kernel to determine more detailed information about what we're running on,
1096  * topology information, etc. Some of these subsystems include processor groups
1097  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1098  * microcode, and performance monitoring. These functions all ASSERT that the
1099  * CPU they're being called on has reached a certain cpuid pass. If the passes
1100  * are rearranged, then this needs to be adjusted.
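 *
 * As an illustration of that contract (the exact pass varies by accessor),
 * such functions typically begin with something like:
 *
 *	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));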
1101  *
1102  * -----------------------------------------------
1103  * Speculative Execution CPU Side Channel Security
1104  * -----------------------------------------------
1105  *
1106  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1107  * execution in the CPU to create side channels there have been a number of
1108  * different attacks and corresponding issues that the operating system needs to
1109  * mitigate against. The following list is a common, but not exhaustive,
1110  * set of issues that we know about, for which we have done some mitigation
1111  * work in the system or still need to do more:
1112  *
1113  *   - Spectre v1
1114  *   - swapgs (Spectre v1 variant)
1115  *   - Spectre v2
1116  *     - Branch History Injection (BHI).
1117  *   - Meltdown (Spectre v3)
1118  *   - Rogue Register Read (Spectre v3a)
1119  *   - Speculative Store Bypass (Spectre v4)
1120  *   - ret2spec, SpectreRSB
1121  *   - L1 Terminal Fault (L1TF)
1122  *   - Microarchitectural Data Sampling (MDS)
1123  *   - Register File Data Sampling (RFDS)
1124  *
1125  * Each of these requires different sets of mitigations and has different attack
1126  * surfaces. For the most part, this discussion is about protecting the kernel
1127  * from non-kernel executing environments such as user processes and hardware
1128  * virtual machines. Unfortunately, there are a number of user vs. user
1129  * scenarios that exist with these. The rest of this section will describe the
1130  * overall approach that the system has taken to address these as well as their
1131  * shortcomings. Unfortunately, not all of the above have been handled today.
1132  *
1133  * SPECTRE v2, ret2spec, SpectreRSB
1134  *
1135  * The second variant of the spectre attack focuses on performing branch target
1136  * injection. This generally impacts indirect call instructions in the system.
1137  * There are four different ways to mitigate this issue that are commonly
1138  * described today:
1139  *
1140  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1141  *  2. Using Retpolines and RSB Stuffing
1142  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1143  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1144  *
1145  * IBRS uses a feature added to microcode to restrict speculation, among other
1146  * things. This form of mitigation has not been used as it has been generally
1147  * seen as too expensive and requires reactivation upon various transitions in
1148  * the system.
1149  *
1150  * As a less impactful alternative to IBRS, retpolines were developed by
1151  * Google. These basically require one to replace indirect calls with a specific
1152  * trampoline that will cause speculation to fail and break the attack.
1153  * Retpolines require compiler support. We always build with retpolines in the
1154  * external thunk mode. This means that a traditional indirect call is replaced
1155  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1156  * of this is that all indirect function calls are performed through a register.
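 *
 * For example, an indirect call that would normally be emitted as:
 *
 *	call	*%rax
 *
 * is instead emitted by the compiler as:
 *
 *	call	__x86_indirect_thunk_rax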
1157  *
1158  * We have to use a common external location of the thunk and not inline it into
1159  * the callsite so that way we can have a single place to patch these functions.
1160  * As it turns out, we currently have two different forms of retpolines that
1161  * exist in the system:
1162  *
1163  *  1. A full retpoline
1164  *  2. A no-op version
1165  *
1166  * The first one is used in the general case. Historically, there was an
1167  * AMD-specific optimized retpoline variant that was based around using a
1168  * serializing lfence instruction; however, in March 2022 it was announced that
1169  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1170  * use it and it is no longer available in the system.
1171  *
1172  * The third form described above is the most curious. It turns out that the way
1173  * that retpolines are implemented is that they rely on how speculation is
1174  * performed on a 'ret' instruction. Intel has continued to optimize this
1175  * process (which is partly why we need to have return stack buffer stuffing,
1176  * but more on that in a bit) and in processors starting with Cascade Lake
1177  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1178  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1179  *
1180  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1181  * physical core. However, if this is the case, we don't want to use retpolines
1182  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1183  * function (called a thunk) into a jmp instruction. This means that we're still
1184  * paying the cost of an extra jump to the external thunk, but it gives us
1185  * flexibility and the ability to have a single kernel image that works across a
1186  * wide variety of systems and hardware features.
1187  *
1188  * Unfortunately, this alone is insufficient. First, Skylake systems have
1189  * additional speculation for the Return Stack Buffer (RSB) which is used to
1190  * return from call instructions which retpolines take advantage of. However,
1191  * this problem is not just limited to Skylake and is actually more pernicious.
1192  * The SpectreRSB paper introduces several more problems that can arise with
1193  * dealing with this. The RSB can be poisoned just like the indirect branch
1194  * predictor. This means that one needs to clear the RSB when transitioning
1195  * between two different privilege domains. Some examples include:
1196  *
1197  *  - Switching between two different user processes
1198  *  - Going between user land and the kernel
1199  *  - Returning to the kernel from a hardware virtual machine
1200  *
1201  * Mitigating this involves combining a couple of different things. The first is
1202  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1203  * Bridge. When an RSB entry refers to a user address and we're executing in the
1204  * kernel, speculation through it will be stopped when SMEP is enabled. This
1205  * protects against a number of the different cases that we would normally be
1206  * worried about such as when we enter the kernel from user land.
1207  *
1208  * To protect against additional manipulation of the RSB from other contexts
1209  * such as a non-root VMX context attacking the kernel we first look to
1210  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1211  * nothing else that we need to do to protect the kernel at this time.
1212  *
1213  * Unfortunately, not all eIBRS implementations are sufficient to guard
1214  * against RSB manipulations, so we still need to manually overwrite the
1215  * contents of the return stack buffer unless the hardware specifies we are
1216  * covered. We do this through the x86_rsb_stuff() function.  Currently this
1217  * is employed on context switch and vmx_exit. The x86_rsb_stuff() function is
1218  * disabled only when mitigations in general are, or if we have hardware
1219  * indicating no need for post-barrier RSB protections, either in one place
1220  * (old hardware), or on both (newer hardware).
1221  *
1222  * If SMEP is not present, then we would have to stuff the RSB every time we
1223  * transitioned from user mode to the kernel, which isn't very practical right
1224  * now.
1225  *
1226  * To fully protect user to user and vmx to vmx attacks from these classes of
1227  * issues, we would also need to allow them to opt into performing an Indirect
1228  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1229  *
1230  * The fourth form of mitigation here is specific to AMD and is called Automated
1231  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1232  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1233  * (extended feature enable register) MSR. This bit basically says that IBRS
1234  * acts as though it is always active when executing at CPL0 and when executing
1235  * in the 'host' context when SEV-SNP is enabled.
1236  *
1237  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1238  * therefore it is unnecessary. While this handles RSB stuffing attacks from SVM
1239  * to the kernel, we must still consider the remaining cases that exist, just
1240  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1241  * traditional technique to work, this is not true on all CPUs. While a write to
1242  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1243  * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1244  * guard page is present between user and kernel address spaces and SMEP is
1245  * enabled, then there is no need to clear the RSB at all.
1246  *
1247  * By default, the system will enable RSB stuffing and the required variant of
1248  * retpolines and store that information in the x86_spectrev2_mitigation value.
1249  * This will be evaluated after a microcode update as well, though it is
1250  * expected that microcode updates will not take away features. This may mean
1251  * that a late loaded microcode may not end up in the optimal configuration
1252  * (though this should be rare).
1253  *
1254  * Currently we do not build kmdb with retpolines or perform any additional side
1255  * channel security mitigations for it. One complication with kmdb is that it
1256  * requires its own retpoline thunks and it would need to adjust itself based on
1257  * what the kernel does. The threat model of kmdb is more limited and therefore
1258  * it may make more sense to investigate using prediction barriers as the whole
1259  * system is only executing a single instruction at a time while in kmdb.
1260  *
1261  * Branch History Injection (BHI)
1262  *
1263  * BHI is a specific form of SPECTREv2 where an attacker may manipulate branch
1264  * history before transitioning from user to supervisor mode (or from VMX
1265  * non-root/guest to root mode). The attacker can then exploit certain
1266  * compiler-generated code-sequences ("gadgets") to disclose information from
1267  * other contexts or domains.  Recent (late-2023/early-2024) research in
1268  * object code analysis discovered many more potential gadgets than what was
1269  * initially reported (which previously was confined to Linux use of
1270  * unprivileged eBPF).
1271  *
1272  * The BHI threat doesn't exist in processors that predate eIBRS, or in AMD
1273  * ones. Some eIBRS processors have the ability to disable branch history in
1274  * certain (but not all) cases using an MSR write. eIBRS processors that don't
1275  * have the ability to disable must use a software sequence to scrub the
1276  * branch history buffer.
1277  *
1278  * BHI_DIS_S (the aforementioned MSR) protects ring 0 from ring 3 (VMX guest
1279  * or VMX root). It does not protect different user processes from each other,
1280  * or ring 3 VMX guest from ring 3 VMX root or vice versa.
1281  *
1282  * The BHI clearing sequence prevents user code from exploiting kernel gadgets,
1283  * and prevents user A from exploiting user B's gadgets.
1284  *
1285  * SMEP and eIBRS are a continuing defense-in-depth measure protecting the
1286  * kernel.
1287  *
1288  * SPECTRE v1, v4
1289  *
1290  * The v1 and v4 variants of spectre are not currently mitigated in the
1291  * system and require other classes of changes to occur in the code.
1292  *
1293  * SPECTRE v1 (SWAPGS VARIANT)
1294  *
1295  * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1296  * can generally affect any branch-dependent code. The swapgs issue is one
1297  * variant of this. If we are coming in from userspace, we can have code like
1298  * this:
1299  *
1300  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1301  *	je	1f
1302  *	movq	$0, REGOFF_SAVFP(%rsp)
1303  *	swapgs
1304  *	1:
1305  *	movq	%gs:CPU_THREAD, %rax
1306  *
1307  * If an attacker can cause a mis-speculation of the branch here, we could skip
1308  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1309  * load. If subsequent code can act as the usual Spectre cache gadget, this
1310  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1311  * any use of the %gs override.
1312  *
1313  * The other case is also an issue: if we're coming into a trap from kernel
1314  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1315  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1316  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1317  * case, and the fix is the same in both cases (an lfence at the branch target
1318  * 1: in this example), we'll just do it unconditionally.
1319  *
1320  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1321  * harder for user-space to actually set a useful %gsbase value: although it's
1322  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1323  * mitigate anyway.
1324  *
1325  * MELTDOWN
1326  *
1327  * Meltdown, or spectre v3, allowed a user process to read any data in their
1328  * address space regardless of whether or not the page tables in question
1329  * allowed the user to have the ability to read them. The solution to meltdown
1330  * is kernel page table isolation. In this world, there are two page tables that
1331  * are used for a process, one in user land and one in the kernel. To implement
1332  * this we use per-CPU page tables and switch between the user and kernel
1333  * variants when entering and exiting the kernel.  For more information about
1334  * this process and how the trampolines work, please see the big theory
1335  * statements and additional comments in:
1336  *
1337  *  - uts/i86pc/ml/kpti_trampolines.s
1338  *  - uts/i86pc/vm/hat_i86.c
1339  *
1340  * While Meltdown only impacted Intel systems and there are also Intel systems
1341  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1342  * kernel page table isolation enabled. While this may at first seem weird, an
1343  * important thing to remember is that you can't speculatively read an address
1344  * if it's never in your page table at all. Having user processes without kernel
1345  * pages present provides us with an important layer of defense in the kernel
1346  * against any other side channel attacks that exist and have yet to be
1347  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1348  * default, no matter the x86 system.
1349  *
1350  * L1 TERMINAL FAULT
1351  *
1352  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1353  * execution uses page table entries. Effectively, it is two different problems.
1354  * The first is that it ignores the not present bit in the page table entries
1355  * when performing speculative execution. This means that something can
1356  * speculatively read the listed physical address if it's present in the L1
1357  * cache under certain conditions (see Intel's documentation for the full set of
1358  * conditions). Secondly, this can be used to bypass hardware virtualization
1359  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1360  * instructions.
1361  *
1362  * For the non-hardware virtualized case, this is relatively easy to deal with.
1363  * We must make sure that all unmapped pages have an address of zero. This
1364  * means that an attacker could at most read the first 4k of physical memory;
1365  * however, we never use that first page and always skip putting it in our
1366  * memory map, even if firmware tells us we can use it in our memory map. While
1367  * other systems try to put extra metadata in the address and reserved bits,
1368  * which led to this being problematic in those cases, we do not.
1369  *
1370  * For hardware virtual machines things are more complicated. Because they can
1371  * construct their own page tables, it isn't hard for them to perform this
1372  * attack against any physical address. The one wrinkle is that this physical
1373  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1374  * to flush the L1 data cache. We wrap this up in the function
1375  * spec_uarch_flush(). This function is also used in the mitigation of
1376  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1377  * hypervisors such as KVM or bhyve are responsible for performing this before
1378  * entering the guest.
1379  *
1380  * Because this attack takes place in the L1 cache, there's another wrinkle
1381  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1382  * designs. This means that when a thread enters a hardware virtualized context
1383  * and flushes the L1 data cache, the other thread on the processor may then go
1384  * ahead and put new data in it that can be potentially attacked. While one
1385  * solution is to disable SMT on the system, another option that is available is
1386  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1387  * goes through and makes sure that if a HVM is being scheduled on one thread,
1388  * then the thing on the other thread is from the same hardware virtual machine.
1389  * If an interrupt comes in or the guest exits to the broader system, then the
1390  * other SMT thread will be kicked out.
1391  *
1392  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1393  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1394  * perform L1TF related mitigations.
1395  *
1396  * MICROARCHITECTURAL DATA SAMPLING
1397  *
1398  * Microarchitectural data sampling (MDS) is a combination of four discrete
1399  * vulnerabilities that are similar issues affecting various parts of the CPU's
1400  * microarchitectural implementation around load, store, and fill buffers.
1401  * Specifically it is made up of the following subcomponents:
1402  *
1403  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1404  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1405  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1406  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1407  *
1408  * To begin addressing these, Intel has introduced another feature in microcode
1409  * called MD_CLEAR. This changes the verw instruction to operate in a different
1410  * way. This allows us to execute the verw instruction in a particular way to
1411  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1412  * updated when this microcode is present to flush this state.
1413  *
1414  * Primarily we need to flush this state whenever we transition from the kernel
1415  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1416  * little bit different. Here the structures are statically sized when a logical
1417  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1418  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1419  * mwait, or another ACPI method. To perform these flushes, we call
1420  * x86_md_clear() at all of these transition points.
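 *
 * The flush itself is built around the repurposed verw instruction. A minimal
 * sketch of the idea (not the actual x86_md_clear() implementation) is to
 * issue verw against a valid, writable data segment selector stored in
 * memory:
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp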
1421  *
1422  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1423  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1424  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1425  * a no-op.
1426  *
1427  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1428  * particular, everything we've discussed above is only valid for a single
1429  * thread executing on a core. In the case where you have hyper-threading
1430  * present, this attack can be performed between threads. The theoretical fix
1431  * for this is to ensure that both threads are always in the same security
1432  * domain. This means that they are executing in the same ring and mutually
1433  * trust each other. Practically speaking, this would mean that a system call
1434  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1435  * Rather than implement this, we recommend that one disables hyper-threading
1436  * through the use of psradm -aS.
1437  *
1438  * TSX ASYNCHRONOUS ABORT
1439  *
1440  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1441  * behaves like MDS, but leverages Intel's transactional instructions as another
1442  * vector. Effectively, when a transaction hits one of these cases (unmapped
1443  * page, various cache snoop activity, etc.) then the same data can be exposed
1444  * as in the case of MDS. This means that you can attack your twin.
1445  *
1446  * Intel has described that there are two different ways that we can mitigate
1447  * this problem on affected processors:
1448  *
1449  *   1) We can use the same techniques used to deal with MDS. Flushing the
1450  *      microarchitectural buffers and disabling hyperthreading will mitigate
1451  *      this in the same way.
1452  *
1453  *   2) Using microcode to disable TSX.
1454  *
1455  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1456  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1457  * That's OK as we're already doing all such mitigations. On the other hand,
1458  * processors with MDS_NO are all supposed to receive microcode updates that
1459  * enumerate support for disabling TSX. In general, we'd rather use this method
1460  * when available as it doesn't require disabling hyperthreading to be
1461  * effective. Currently we basically are relying on microcode for processors
1462  * that enumerate MDS_NO.
1463  *
1464  * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1465  * Sampling: RFDS. This allows an attacker to sample values that were in any
1466  * of integer, floating point, or vector registers. This was discovered by
1467  * Intel during internal validation work.  The existence of the RFDS_NO
1468  * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1469  * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1470  * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1471  * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1472  * MSR that L1D uses.
1473  *
1474  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1475  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1476  * different powers. The first allows us to cause all transactions to
1477  * immediately abort. The second gives us a means of disabling TSX completely,
1478  * which includes removing it from cpuid. If we have support for this in
1479  * microcode during the first cpuid pass, then we'll disable TSX completely such
1480  * that user land never has a chance to observe the bit. However, if we are late
1481  * loading the microcode, then we must use the functionality to cause
1482  * transactions to automatically abort. This is necessary for user land's sake.
1483  * Once a program sees a cpuid bit, it must not be taken away.
1484  *
1485  * We track whether or not we should do this based on what cpuid pass we're in.
1486  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1487  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1488  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1489  * second time after we do the initial microcode update.  As a result we need to
1490  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1491  * suitable microcode on the current CPU (which happens prior to
1492  * cpuid_pass_ucode()).
1493  *
1494  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1495  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1496  * unfortunate feature in a number of ways, and taking the opportunity to
1497  * finally be able to turn it off is likely to be of benefit in the future.
1498  *
1499  * SUMMARY
1500  *
1501  * The following table attempts to summarize the mitigations for various issues
1502  * and what's done in various places:
1503  *
1504  *  - Spectre v1: Not currently mitigated
1505  *  - swapgs: lfences after swapgs paths
1506  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1507  *  - Meltdown: Kernel Page Table Isolation
1508  *  - Spectre v3a: Updated CPU microcode
1509  *  - Spectre v4: Not currently mitigated
1510  *  - SpectreRSB: SMEP and RSB Stuffing
1511  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1512  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1513  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1514  *  - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1515  *  - BHI: software sequence, and use of BHI_DIS_S if microcode has it.
1516  *
1517  * The following table indicates the x86 feature set bits that indicate that a
1518  * given problem has been solved or a notable feature is present:
1519  *
1520  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1521  *  - MDS_NO: All forms of MDS
1522  *  - TAA_NO: TAA
1523  *  - RFDS_NO: RFDS
1524  *  - BHI_NO: BHI
1525  */
1526 
1527 #include <sys/types.h>
1528 #include <sys/archsystm.h>
1529 #include <sys/x86_archext.h>
1530 #include <sys/kmem.h>
1531 #include <sys/systm.h>
1532 #include <sys/cmn_err.h>
1533 #include <sys/sunddi.h>
1534 #include <sys/sunndi.h>
1535 #include <sys/cpuvar.h>
1536 #include <sys/processor.h>
1537 #include <sys/stdbool.h>
1538 #include <sys/sysmacros.h>
1539 #include <sys/pg.h>
1540 #include <sys/fp.h>
1541 #include <sys/controlregs.h>
1542 #include <sys/bitmap.h>
1543 #include <sys/auxv_386.h>
1544 #include <sys/memnode.h>
1545 #include <sys/pci_cfgspace.h>
1546 #include <sys/comm_page.h>
1547 #include <sys/mach_mmu.h>
1548 #include <sys/ucode.h>
1549 #include <sys/tsc.h>
1550 #include <sys/kobj.h>
1551 #include <sys/asm_misc.h>
1552 #include <sys/bitmap.h>
1553 
1554 #ifdef __xpv
1555 #include <sys/hypervisor.h>
1556 #else
1557 #include <sys/ontrap.h>
1558 #endif
1559 
1560 uint_t x86_vendor = X86_VENDOR_IntelClone;
1561 uint_t x86_type = X86_TYPE_OTHER;
1562 uint_t x86_clflush_size = 0;
1563 
1564 #if defined(__xpv)
1565 int x86_use_pcid = 0;
1566 int x86_use_invpcid = 0;
1567 #else
1568 int x86_use_pcid = -1;
1569 int x86_use_invpcid = -1;
1570 #endif
1571 
1572 typedef enum {
1573 	X86_SPECTREV2_RETPOLINE,
1574 	X86_SPECTREV2_ENHANCED_IBRS,
1575 	X86_SPECTREV2_AUTO_IBRS,
1576 	X86_SPECTREV2_DISABLED
1577 } x86_spectrev2_mitigation_t;
1578 
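/*
 * Setting x86_disable_spectrev2 to a non-zero value (e.g. via /etc/system)
 * requests that the Spectre v2 mitigations described above be disabled; the
 * mitigation actually selected is recorded in x86_spectrev2_mitigation.
 */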
1579 uint_t x86_disable_spectrev2 = 0;
1580 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1581     X86_SPECTREV2_RETPOLINE;
1582 
1583 /*
1584  * The mitigation status for TAA:
1585  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1586  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1587  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1588  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1589  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1590  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1591  */
1592 typedef enum {
1593 	X86_TAA_NOTHING,
1594 	X86_TAA_DISABLED,
1595 	X86_TAA_MD_CLEAR,
1596 	X86_TAA_TSX_FORCE_ABORT,
1597 	X86_TAA_TSX_DISABLE,
1598 	X86_TAA_HW_MITIGATED
1599 } x86_taa_mitigation_t;
1600 
1601 uint_t x86_disable_taa = 0;
1602 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1603 
1604 uint_t pentiumpro_bug4046376;
1605 
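/*
 * The set of x86 features (X86FSET_*) detected on the system; see the
 * "Exposing Information" section above and is_x86_feature().
 */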
1606 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1607 
1608 static char *x86_feature_names[NUM_X86_FEATURES] = {
1609 	"lgpg",
1610 	"tsc",
1611 	"msr",
1612 	"mtrr",
1613 	"pge",
1614 	"de",
1615 	"cmov",
1616 	"mmx",
1617 	"mca",
1618 	"pae",
1619 	"cv8",
1620 	"pat",
1621 	"sep",
1622 	"sse",
1623 	"sse2",
1624 	"htt",
1625 	"asysc",
1626 	"nx",
1627 	"sse3",
1628 	"cx16",
1629 	"cmp",
1630 	"tscp",
1631 	"mwait",
1632 	"sse4a",
1633 	"cpuid",
1634 	"ssse3",
1635 	"sse4_1",
1636 	"sse4_2",
1637 	"1gpg",
1638 	"clfsh",
1639 	"64",
1640 	"aes",
1641 	"pclmulqdq",
1642 	"xsave",
1643 	"avx",
1644 	"vmx",
1645 	"svm",
1646 	"topoext",
1647 	"f16c",
1648 	"rdrand",
1649 	"x2apic",
1650 	"avx2",
1651 	"bmi1",
1652 	"bmi2",
1653 	"fma",
1654 	"smep",
1655 	"smap",
1656 	"adx",
1657 	"rdseed",
1658 	"mpx",
1659 	"avx512f",
1660 	"avx512dq",
1661 	"avx512pf",
1662 	"avx512er",
1663 	"avx512cd",
1664 	"avx512bw",
1665 	"avx512vl",
1666 	"avx512fma",
1667 	"avx512vbmi",
1668 	"avx512_vpopcntdq",
1669 	"avx512_4vnniw",
1670 	"avx512_4fmaps",
1671 	"xsaveopt",
1672 	"xsavec",
1673 	"xsaves",
1674 	"sha",
1675 	"umip",
1676 	"pku",
1677 	"ospke",
1678 	"pcid",
1679 	"invpcid",
1680 	"ibrs",
1681 	"ibpb",
1682 	"stibp",
1683 	"ssbd",
1684 	"ssbd_virt",
1685 	"rdcl_no",
1686 	"ibrs_all",
1687 	"rsba",
1688 	"ssb_no",
1689 	"stibp_all",
1690 	"flush_cmd",
1691 	"l1d_vmentry_no",
1692 	"fsgsbase",
1693 	"clflushopt",
1694 	"clwb",
1695 	"monitorx",
1696 	"clzero",
1697 	"xop",
1698 	"fma4",
1699 	"tbm",
1700 	"avx512_vnni",
1701 	"amd_pcec",
1702 	"md_clear",
1703 	"mds_no",
1704 	"core_thermal",
1705 	"pkg_thermal",
1706 	"tsx_ctrl",
1707 	"taa_no",
1708 	"ppin",
1709 	"vaes",
1710 	"vpclmulqdq",
1711 	"lfence_serializing",
1712 	"gfni",
1713 	"avx512_vp2intersect",
1714 	"avx512_bitalg",
1715 	"avx512_vbmi2",
1716 	"avx512_bf16",
1717 	"auto_ibrs",
1718 	"rfds_no",
1719 	"rfds_clear",
1720 	"pbrsb_no",
1721 	"bhi_no",
1722 	"bhi_clear"
1723 };
1724 
1725 boolean_t
1726 is_x86_feature(void *featureset, uint_t feature)
1727 {
1728 	ASSERT(feature < NUM_X86_FEATURES);
1729 	return (BT_TEST((ulong_t *)featureset, feature));
1730 }
1731 
1732 void
1733 add_x86_feature(void *featureset, uint_t feature)
1734 {
1735 	ASSERT(feature < NUM_X86_FEATURES);
1736 	BT_SET((ulong_t *)featureset, feature);
1737 }
1738 
1739 void
1740 remove_x86_feature(void *featureset, uint_t feature)
1741 {
1742 	ASSERT(feature < NUM_X86_FEATURES);
1743 	BT_CLEAR((ulong_t *)featureset, feature);
1744 }
1745 
1746 boolean_t
1747 compare_x86_featureset(void *setA, void *setB)
1748 {
1749 	/*
1750 	 * We assume that the unused bits of the bitmap are always zero.
1751 	 */
1752 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1753 		return (B_TRUE);
1754 	} else {
1755 		return (B_FALSE);
1756 	}
1757 }
1758 
1759 void
1760 print_x86_featureset(void *featureset)
1761 {
1762 	uint_t i;
1763 
1764 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1765 		if (is_x86_feature(featureset, i)) {
1766 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1767 			    x86_feature_names[i]);
1768 		}
1769 	}
1770 }
1771 
1772 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1773 static size_t xsave_state_size = 0;
1774 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1775 boolean_t xsave_force_disable = B_FALSE;
1776 extern int disable_smap;
1777 
1778 /*
1779  * This is set to platform type we are running on.
1780  */
1781 static int platform_type = -1;
1782 
1783 #if !defined(__xpv)
1784 /*
1785  * Variable to patch if hypervisor platform detection needs to be
1786  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1787  */
1788 int enable_platform_detection = 1;
1789 #endif
1790 
1791 /*
1792  * monitor/mwait info.
1793  *
1794  * size_actual and buf_actual are the real address and size allocated to get
1795  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1796  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1797  * processor cache-line alignment, but this is not guaranteed in the future.
1798  */
1799 struct mwait_info {
1800 	size_t		mon_min;	/* min size to avoid missed wakeups */
1801 	size_t		mon_max;	/* size to avoid false wakeups */
1802 	size_t		size_actual;	/* size actually allocated */
1803 	void		*buf_actual;	/* memory actually allocated */
1804 	uint32_t	support;	/* processor support of monitor/mwait */
1805 };
1806 
1807 /*
1808  * xsave/xrestor info.
1809  *
1810  * This structure contains HW feature bits and the size of the xsave save area.
1811  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1812  * (xsave_state) to describe the xsave layout. However, at runtime the
1813  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1814  * xsave_state structure simply represents the legacy layout of the beginning
1815  * of the xsave area.
1816  */
1817 struct xsave_info {
1818 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1819 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1820 	size_t		xsav_max_size;  /* max size save area for HW features */
1821 	size_t		ymm_size;	/* AVX: size of ymm save area */
1822 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1823 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1824 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1825 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1826 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1827 	size_t		opmask_size;	/* AVX512: size of opmask save */
1828 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1829 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1830 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1831 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1832 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1833 };
1834 
1835 
1836 /*
1837  * These constants determine how many of the elements of the
1838  * cpuid we cache in the cpuid_info data structure; the
1839  * remaining elements are accessible via the cpuid instruction.
1840  */
1841 
1842 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1843 #define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
1844 #define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1845 
1846 /*
1847  * See the big theory statement for a more detailed explanation of what some of
1848  * these members mean.
1849  */
1850 struct cpuid_info {
1851 	uint_t cpi_pass;		/* last pass completed */
1852 	/*
1853 	 * standard function information
1854 	 */
1855 	uint_t cpi_maxeax;		/* fn 0: %eax */
1856 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1857 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1858 
1859 	uint_t cpi_family;		/* fn 1: extended family */
1860 	uint_t cpi_model;		/* fn 1: extended model */
1861 	uint_t cpi_step;		/* fn 1: stepping */
1862 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1863 					/*		AMD: package/socket # */
1864 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1865 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1866 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1867 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1868 	uint_t cpi_ncache;		/* fn 2: number of elements */
1869 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1870 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1871 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1872 					/* Intel fn: 4, AMD fn: 8000001d */
1873 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1874 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1875 	struct cpuid_regs cpi_sub7[2];	/* Leaf 7, sub-leaves 1-2 */
1876 	/*
1877 	 * extended function information
1878 	 */
1879 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1880 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1881 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1882 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1883 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1884 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1885 
1886 	id_t cpi_coreid;		/* same coreid => strands share core */
1887 	int cpi_pkgcoreid;		/* core number within single package */
1888 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1889 					/* Intel: fn 4: %eax[31-26] */
1890 
1891 	/*
1892 	 * These values represent the number of bits that are required to store
1893 	 * information about the number of cores and threads.
1894 	 */
1895 	uint_t cpi_ncore_bits;
1896 	uint_t cpi_nthread_bits;
1897 	/*
1898 	 * supported feature information
1899 	 */
1900 	uint32_t cpi_support[6];
1901 #define	STD_EDX_FEATURES	0
1902 #define	AMD_EDX_FEATURES	1
1903 #define	TM_EDX_FEATURES		2
1904 #define	STD_ECX_FEATURES	3
1905 #define	AMD_ECX_FEATURES	4
1906 #define	STD_EBX_FEATURES	5
1907 	/*
1908 	 * Synthesized information, where known.
1909 	 */
1910 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1911 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1912 	uint32_t cpi_socket;		/* Chip package/socket type */
1913 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1914 
1915 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1916 	uint32_t cpi_apicid;
1917 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1918 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1919 					/* Intel: 1 */
1920 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1921 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1922 
1923 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1924 
1925 	/*
1926 	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1927 	 * eventually leaf 0x1F (Intel).
1928 	 */
1929 	uint_t cpi_topo_nleaves;
1930 	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1931 };
1932 
1933 
1934 static struct cpuid_info cpuid_info0;
1935 
1936 /*
1937  * These bit fields are defined by the Intel Application Note AP-485
1938  * "Intel Processor Identification and the CPUID Instruction"
1939  */
1940 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1941 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1942 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1943 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1944 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1945 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1946 
1947 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1948 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1949 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1950 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1951 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1952 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1953 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1954 #define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1955 #define	CPI_FEATURES_7_2_EDX(cpi)	((cpi)->cpi_sub7[1].cp_edx)
1956 
1957 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1958 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1959 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1960 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1961 
1962 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1963 #define	CPI_XMAXEAX_MAX		0x80000100
1964 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1965 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1966 
1967 /*
1968  * Function 4 (Deterministic Cache Parameters) macros
1969  * Defined by Intel Application Note AP-485
1970  */
1971 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1972 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1973 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1974 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1975 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1976 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1977 #define	CPI_CACHE_TYPE_DONE	0
1978 #define	CPI_CACHE_TYPE_DATA	1
1979 #define	CPI_CACHE_TYPE_INSTR	2
1980 #define	CPI_CACHE_TYPE_UNIFIED	3
1981 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1982 
1983 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1984 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1985 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1986 
1987 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1988 
1989 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
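
/*
 * For a leaf 4 style cache descriptor, the cache size in bytes can be
 * computed (an illustrative use of the macros above) as:
 *
 *	(CPI_CACHE_WAYS(regs) + 1) * (CPI_CACHE_PARTS(regs) + 1) *
 *	    (CPI_CACHE_COH_LN_SZ(regs) + 1) * (CPI_CACHE_SETS(regs) + 1)
 */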
1990 
1991 
1992 /*
1993  * A couple of shorthand macros to identify "later" P6-family chips
1994  * like the Pentium M and Core.  First, the "older" P6-based stuff
1995  * (loosely defined as "pre-Pentium-4"):
1996  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1997  */
1998 #define	IS_LEGACY_P6(cpi) (			\
1999 	cpi->cpi_family == 6 &&			\
2000 		(cpi->cpi_model == 1 ||		\
2001 		cpi->cpi_model == 3 ||		\
2002 		cpi->cpi_model == 5 ||		\
2003 		cpi->cpi_model == 6 ||		\
2004 		cpi->cpi_model == 7 ||		\
2005 		cpi->cpi_model == 8 ||		\
2006 		cpi->cpi_model == 0xA ||	\
2007 		cpi->cpi_model == 0xB)		\
2008 )
2009 
2010 /* A "new F6" is everything with family 6 that's not the above */
2011 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
2012 
2013 /* Extended family/model support */
2014 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
2015 	cpi->cpi_family >= 0xf)
2016 
2017 /*
2018  * Info for monitor/mwait idle loop.
2019  *
2020  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
2021  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
2022  * 2006.
2023  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
2024  * Documentation Updates" #33633, Rev 2.05, December 2006.
2025  */
2026 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
2027 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extensions supported */
2028 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
2029 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
2030 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
2031 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
2032 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
2033 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
2034 /*
2035  * Number of sub-cstates for a given c-state.
2036  */
2037 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
2038 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
2039 
2040 /*
2041  * XSAVE leaf 0xD enumeration
2042  */
2043 #define	CPUID_LEAFD_2_YMM_OFFSET	576
2044 #define	CPUID_LEAFD_2_YMM_SIZE		256
2045 
2046 /*
2047  * Common extended leaf names to cut down on typos.
2048  */
2049 #define	CPUID_LEAF_EXT_0		0x80000000
2050 #define	CPUID_LEAF_EXT_8		0x80000008
2051 #define	CPUID_LEAF_EXT_1d		0x8000001d
2052 #define	CPUID_LEAF_EXT_1e		0x8000001e
2053 #define	CPUID_LEAF_EXT_21		0x80000021
2054 #define	CPUID_LEAF_EXT_26		0x80000026
2055 
2056 /*
2057  * Functions we consume from cpuid_subr.c;  don't publish these in a header
2058  * file to try and keep people using the expected cpuid_* interfaces.
2059  */
2060 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2061 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2062 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2063 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2064 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2065 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2066 
2067 /*
2068  * Apply various platform-dependent restrictions where the
2069  * underlying platform's constraints mean the CPU can be marked
2070  * as less capable than its cpuid instruction would imply.
2071  */
2072 #if defined(__xpv)
2073 static void
2074 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2075 {
2076 	switch (eax) {
2077 	case 1: {
2078 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2079 		    0 : CPUID_INTC_EDX_MCA;
2080 		cp->cp_edx &=
2081 		    ~(mcamask |
2082 		    CPUID_INTC_EDX_PSE |
2083 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2084 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2085 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2086 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2087 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2088 		break;
2089 	}
2090 
2091 	case 0x80000001:
2092 		cp->cp_edx &=
2093 		    ~(CPUID_AMD_EDX_PSE |
2094 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2095 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2096 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2097 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2098 		    CPUID_AMD_EDX_TSCP);
2099 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2100 		break;
2101 	default:
2102 		break;
2103 	}
2104 
2105 	switch (vendor) {
2106 	case X86_VENDOR_Intel:
2107 		switch (eax) {
2108 		case 4:
2109 			/*
2110 			 * Zero out the (ncores-per-chip - 1) field
2111 			 */
2112 			cp->cp_eax &= 0x03fffffff;
2113 			break;
2114 		default:
2115 			break;
2116 		}
2117 		break;
2118 	case X86_VENDOR_AMD:
2119 	case X86_VENDOR_HYGON:
2120 		switch (eax) {
2121 
2122 		case 0x80000001:
2123 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2124 			break;
2125 
2126 		case CPUID_LEAF_EXT_8:
2127 			/*
2128 			 * Zero out the (ncores-per-chip - 1) field
2129 			 */
2130 			cp->cp_ecx &= 0xffffff00;
2131 			break;
2132 		default:
2133 			break;
2134 		}
2135 		break;
2136 	default:
2137 		break;
2138 	}
2139 }
2140 #else
2141 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2142 #endif
2143 
2144 /*
2145  *  Some undocumented ways of patching the results of the cpuid
2146  *  instruction to permit running Solaris 10 on future cpus that
2147  *  we don't currently support.  Could be set to non-zero values
2148  *  via settings in eeprom.
2149  */
2150 
2151 uint32_t cpuid_feature_ecx_include;
2152 uint32_t cpuid_feature_ecx_exclude;
2153 uint32_t cpuid_feature_edx_include;
2154 uint32_t cpuid_feature_edx_exclude;
2155 
2156 /*
2157  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2158  */
2159 void
2160 cpuid_alloc_space(cpu_t *cpu)
2161 {
2162 	/*
2163 	 * By convention, cpu0 is the boot cpu, which is set up
2164 	 * before memory allocation is available.  All other cpus get
2165 	 * their cpuid_info struct allocated here.
2166 	 */
2167 	ASSERT(cpu->cpu_id != 0);
2168 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2169 	cpu->cpu_m.mcpu_cpi =
2170 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2171 }
2172 
2173 void
2174 cpuid_free_space(cpu_t *cpu)
2175 {
2176 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2177 	int i;
2178 
2179 	ASSERT(cpi != NULL);
2180 	ASSERT(cpi != &cpuid_info0);
2181 
2182 	/*
2183 	 * Free up any cache leaf related dynamic storage. The first entry was
2184 	 * cached from the standard cpuid storage, so we should not free it.
2185 	 */
2186 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2187 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2188 	if (cpi->cpi_cache_leaf_size > 0)
2189 		kmem_free(cpi->cpi_cache_leaves,
2190 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2191 
2192 	kmem_free(cpi, sizeof (*cpi));
2193 	cpu->cpu_m.mcpu_cpi = NULL;
2194 }
2195 
2196 #if !defined(__xpv)
2197 /*
2198  * Determine the type of the underlying platform. This is used to customize
2199  * initialization of various subsystems (e.g. TSC). determine_platform() must
2200  * only ever be called once to prevent two processors from seeing different
2201  * values of platform_type. Must be called before cpuid_pass_ident(), the
2202  * earliest consumer to execute; the identification pass will call
2203  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2204  */
2205 void
2206 determine_platform(void)
2207 {
2208 	struct cpuid_regs cp;
2209 	uint32_t base;
2210 	uint32_t regs[4];
2211 	char *hvstr = (char *)regs;
2212 
2213 	ASSERT(platform_type == -1);
2214 
2215 	platform_type = HW_NATIVE;
2216 
2217 	if (!enable_platform_detection)
2218 		return;
2219 
2220 	/*
2221 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2222 	 * vendor signature, and set platform type accordingly.
2223 	 *
2224 	 * References:
2225 	 * http://lkml.org/lkml/2008/10/1/246
2226 	 * http://kb.vmware.com/kb/1009458
2227 	 */
2228 	cp.cp_eax = 0x1;
2229 	(void) __cpuid_insn(&cp);
2230 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2231 		cp.cp_eax = 0x40000000;
2232 		(void) __cpuid_insn(&cp);
2233 		regs[0] = cp.cp_ebx;
2234 		regs[1] = cp.cp_ecx;
2235 		regs[2] = cp.cp_edx;
2236 		regs[3] = 0;
2237 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2238 			platform_type = HW_XEN_HVM;
2239 			return;
2240 		}
2241 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2242 			platform_type = HW_VMWARE;
2243 			return;
2244 		}
2245 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2246 			platform_type = HW_KVM;
2247 			return;
2248 		}
2249 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2250 			platform_type = HW_BHYVE;
2251 			return;
2252 		}
2253 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2254 			platform_type = HW_MICROSOFT;
2255 			return;
2256 		}
2257 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2258 			platform_type = HW_QEMU_TCG;
2259 			return;
2260 		}
2261 		if (strcmp(hvstr, HVSIG_VIRTUALBOX) == 0) {
2262 			platform_type = HW_VIRTUALBOX;
2263 			return;
2264 		}
2265 		if (strcmp(hvstr, HVSIG_ACRN) == 0) {
2266 			platform_type = HW_ACRN;
2267 			return;
2268 		}
2269 	} else {
2270 		/*
2271 		 * Check older VMware hardware versions. VMware hypervisor is
2272 		 * detected by performing an IN operation to VMware hypervisor
2273 		 * port and checking that value returned in %ebx is VMware
2274 		 * hypervisor magic value.
2275 		 *
2276 		 * References: http://kb.vmware.com/kb/1009458
2277 		 */
2278 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2279 		if (regs[1] == VMWARE_HVMAGIC) {
2280 			platform_type = HW_VMWARE;
2281 			return;
2282 		}
2283 	}
2284 
2285 	/*
2286 	 * Check Xen hypervisor. In a fully virtualized domain,
2287 	 * Xen's pseudo-cpuid function returns a string representing the
2288 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2289 	 * supported cpuid function. We need at least a (base + 2) leaf value
2290 	 * to do what we want to do. Try different base values, since the
2291 	 * hypervisor might use a different one depending on whether Hyper-V
2292 	 * emulation is switched on by default or not.
2293 	 */
2294 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2295 		cp.cp_eax = base;
2296 		(void) __cpuid_insn(&cp);
2297 		regs[0] = cp.cp_ebx;
2298 		regs[1] = cp.cp_ecx;
2299 		regs[2] = cp.cp_edx;
2300 		regs[3] = 0;
2301 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2302 		    cp.cp_eax >= (base + 2)) {
2303 			platform_type &= ~HW_NATIVE;
2304 			platform_type |= HW_XEN_HVM;
2305 			return;
2306 		}
2307 	}
2308 }
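
/*
 * Illustrative note (not part of the original source): the 12-byte hypervisor
 * vendor signature is returned packed across %ebx, %ecx, and %edx of leaf
 * 0x40000000, which is why the code above copies those registers into a
 * contiguous array and NUL-terminates it before comparing. For example, the
 * Xen HVM signature "XenVMMXenVMM" comes back as:
 *
 *	%ebx = 'X','e','n','V'  %ecx = 'M','M','X','e'  %edx = 'n','V','M','M'
 *
 * so strcmp() against HVSIG_XEN_HVM works on the assembled string. The same
 * layout applies to the other signatures (e.g. "VMwareVMware", "KVMKVMKVM").
 */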
2309 
2310 int
2311 get_hwenv(void)
2312 {
2313 	ASSERT(platform_type != -1);
2314 	return (platform_type);
2315 }
2316 
2317 int
2318 is_controldom(void)
2319 {
2320 	return (0);
2321 }
2322 
2323 #else
2324 
2325 int
2326 get_hwenv(void)
2327 {
2328 	return (HW_XEN_PV);
2329 }
2330 
2331 int
2332 is_controldom(void)
2333 {
2334 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2335 }
2336 
2337 #endif	/* __xpv */
2338 
2339 /*
2340  * Gather the extended topology information. This should be the same for both
2341  * AMD leaf 0x80000026 and Intel leaf 0x1F (though the data interpretation varies).
2342  */
2343 static void
2344 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2345 {
2346 	uint_t i;
2347 
2348 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2349 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2350 
2351 		bzero(regs, sizeof (struct cpuid_regs));
2352 		regs->cp_eax = leaf;
2353 		regs->cp_ecx = i;
2354 
2355 		(void) __cpuid_insn(regs);
2356 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2357 		    CPUID_AMD_8X26_TYPE_DONE) {
2358 			break;
2359 		}
2360 	}
2361 
2362 	cpi->cpi_topo_nleaves = i;
2363 }
2364 
2365 /*
2366  * Make sure that we have gathered all of the CPUID leaves that we might need to
2367  * determine topology. We assume that the standard leaf 1 has already been done
2368  * and that xmaxeax has already been calculated.
2369  */
2370 static void
2371 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2372 {
2373 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2374 
2375 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2376 		struct cpuid_regs *cp;
2377 
2378 		cp = &cpi->cpi_extd[8];
2379 		cp->cp_eax = CPUID_LEAF_EXT_8;
2380 		(void) __cpuid_insn(cp);
2381 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2382 	}
2383 
2384 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2385 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2386 		struct cpuid_regs *cp;
2387 
2388 		cp = &cpi->cpi_extd[0x1e];
2389 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2390 		(void) __cpuid_insn(cp);
2391 	}
2392 
2393 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2394 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2395 	}
2396 }
2397 
2398 /*
2399  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2400  * it to everything else. If not, and we're on an AMD system where 8000001e is
2401  * valid, then we use that. Otherwise, we fall back to the default value for the
2402  * APIC ID in leaf 1.
2403  */
2404 static uint32_t
2405 cpuid_gather_apicid(struct cpuid_info *cpi)
2406 {
2407 	/*
2408 	 * Leaf B changes based on the arguments to it. Because we don't cache
2409 	 * it, we need to gather it again.
2410 	 */
2411 	if (cpi->cpi_maxeax >= 0xB) {
2412 		struct cpuid_regs regs;
2413 		struct cpuid_regs *cp;
2414 
2415 		cp = &regs;
2416 		cp->cp_eax = 0xB;
2417 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2418 		(void) __cpuid_insn(cp);
2419 
2420 		if (cp->cp_ebx != 0) {
2421 			return (cp->cp_edx);
2422 		}
2423 	}
2424 
2425 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2426 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2427 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2428 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2429 		return (cpi->cpi_extd[0x1e].cp_eax);
2430 	}
2431 
2432 	return (CPI_APIC_ID(cpi));
2433 }
2434 
2435 /*
2436  * For AMD processors, attempt to calculate the number of chips and cores that
2437  * exist. The way that we do this varies based on the generation, because the
2438  * generations themselves have changed dramatically.
2439  *
2440  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2441  * However, with the advent of family 17h (Zen) it actually tells us the number
2442  * of threads, so we need to look at leaf 0x8000001e if available to determine
2443  * its value. Otherwise, for all prior families, the number of enabled cores is
2444  * the same as threads.
2445  *
2446  * If we do not have leaf 0x80000008, then we assume that this processor does
2447  * not have anything. AMD's older CPUID specification says there's no reason to
2448  * fall back to leaf 1.
2449  *
2450  * In some virtualization cases we will not have leaf 8000001e or it will be
2451  * zero. When that happens we assume the number of threads is one.
2452  */
2453 static void
2454 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2455 {
2456 	uint_t nthreads, nthread_per_core;
2457 
2458 	nthreads = nthread_per_core = 1;
2459 
2460 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2461 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2462 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2463 		nthreads = CPI_CPU_COUNT(cpi);
2464 	}
2465 
2466 	/*
2467 	 * For us to have threads, and know about it, we have to be at least at
2468 	 * family 17h and have the cpuid bit that says we have extended
2469 	 * topology.
2470 	 */
2471 	if (cpi->cpi_family >= 0x17 &&
2472 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2473 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2474 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2475 	}
2476 
2477 	*ncpus = nthreads;
2478 	*ncores = nthreads / nthread_per_core;
2479 }
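
/*
 * Worked example (hypothetical values, not taken from any specific part): on
 * a Zen system with 8 cores and SMT enabled, leaf 0x80000008 %ecx[7:0] would
 * be 15, so nthreads = 16; leaf 0x8000001e %ebx[15:8] would be 1, so
 * nthread_per_core = 2, giving *ncpus = 16 and *ncores = 16 / 2 = 8. On a
 * pre-17h part the 0x8000001e path is skipped and *ncores equals *ncpus.
 */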
2480 
2481 /*
2482  * Seed the initial values for the cores and threads for an Intel based
2483  * processor. These values will be overwritten if we detect that the processor
2484  * supports CPUID leaf 0xb.
2485  */
2486 static void
2487 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2488 {
2489 	/*
2490 	 * Only seed the number of physical cores from the first level leaf 4
2491 	 * information. The number of threads there indicate how many share the
2492 	 * information. The number of threads there indicates how many share the
2493 	 * logical CPUs per core.
2494 	 */
2495 	if (cpi->cpi_maxeax >= 4) {
2496 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2497 	} else {
2498 		*ncores = 1;
2499 	}
2500 
2501 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2502 		*ncpus = CPI_CPU_COUNT(cpi);
2503 	} else {
2504 		*ncpus = *ncores;
2505 	}
2506 }
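
/*
 * Worked example (hypothetical values): with leaf 4 %eax[31:26] = 7, this
 * seeds *ncores = 8, and with HTT set in leaf 1 and CPI_CPU_COUNT() (leaf 1
 * %ebx[23:16]) = 16, it seeds *ncpus = 16. Leaf 0xB, when present, later
 * overwrites these seeds with more accurate values.
 */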
2507 
2508 static boolean_t
2509 cpuid_leafB_getids(cpu_t *cpu)
2510 {
2511 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2512 	struct cpuid_regs regs;
2513 	struct cpuid_regs *cp;
2514 
2515 	if (cpi->cpi_maxeax < 0xB)
2516 		return (B_FALSE);
2517 
2518 	cp = &regs;
2519 	cp->cp_eax = 0xB;
2520 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2521 
2522 	(void) __cpuid_insn(cp);
2523 
2524 	/*
2525 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2526 	 * indicates that the extended topology enumeration leaf is
2527 	 * available.
2528 	 */
2529 	if (cp->cp_ebx != 0) {
2530 		uint32_t x2apic_id = 0;
2531 		uint_t coreid_shift = 0;
2532 		uint_t ncpu_per_core = 1;
2533 		uint_t chipid_shift = 0;
2534 		uint_t ncpu_per_chip = 1;
2535 		uint_t i;
2536 		uint_t level;
2537 
2538 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2539 			cp->cp_eax = 0xB;
2540 			cp->cp_ecx = i;
2541 
2542 			(void) __cpuid_insn(cp);
2543 			level = CPI_CPU_LEVEL_TYPE(cp);
2544 
2545 			if (level == 1) {
2546 				x2apic_id = cp->cp_edx;
2547 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2548 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2549 			} else if (level == 2) {
2550 				x2apic_id = cp->cp_edx;
2551 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2552 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2553 			}
2554 		}
2555 
2556 		/*
2557 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2558 		 */
2559 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2560 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2561 		    ncpu_per_core;
2562 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2563 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2564 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2565 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2566 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2567 		cpi->cpi_compunitid = cpi->cpi_coreid;
2568 
2569 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2570 			cpi->cpi_nthread_bits = coreid_shift;
2571 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2572 		}
2573 
2574 		return (B_TRUE);
2575 	} else {
2576 		return (B_FALSE);
2577 	}
2578 }
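
/*
 * Worked example of the leaf 0xB decomposition above (hypothetical values):
 * with coreid_shift = 1, chipid_shift = 4, ncpu_per_core = 2,
 * ncpu_per_chip = 16, and x2apic_id = 0x35:
 *
 *	cpi_ncore_per_chip = 16 / 2 = 8
 *	cpi_chipid	   = 0x35 >> 4 = 0x3
 *	cpi_clogid	   = 0x35 & 0xf = 0x5
 *	cpi_coreid	   = 0x35 >> 1 = 0x1a
 *	cpi_pkgcoreid	   = 0x5 >> 1 = 0x2
 */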
2579 
2580 static void
2581 cpuid_intel_getids(cpu_t *cpu, void *feature)
2582 {
2583 	uint_t i;
2584 	uint_t chipid_shift = 0;
2585 	uint_t coreid_shift = 0;
2586 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2587 
2588 	/*
2589 	 * There are no compute units or processor nodes currently on Intel.
2590 	 * Always set these to one.
2591 	 */
2592 	cpi->cpi_procnodes_per_pkg = 1;
2593 	cpi->cpi_cores_per_compunit = 1;
2594 
2595 	/*
2596 	 * If cpuid Leaf B is present, use that to try and get this information.
2597 	 * It will be the most accurate for Intel CPUs.
2598 	 */
2599 	if (cpuid_leafB_getids(cpu))
2600 		return;
2601 
2602 	/*
2603 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2604 	 * and ncore_per_chip. These represent the largest power of two values
2605 	 * that we need to cover all of the IDs in the system. Therefore, we use
2606 	 * those values to seed the number of bits needed to cover information
2607 	 * in the case when leaf B is not available. These values will probably
2608 	 * be larger than required, but that's OK.
2609 	 */
2610 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2611 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2612 
2613 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2614 		chipid_shift++;
2615 
2616 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2617 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2618 
2619 	if (is_x86_feature(feature, X86FSET_CMP)) {
2620 		/*
2621 		 * Multi-core (and possibly multi-threaded)
2622 		 * processors.
2623 		 */
2624 		uint_t ncpu_per_core = 0;
2625 
2626 		if (cpi->cpi_ncore_per_chip == 1)
2627 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2628 		else if (cpi->cpi_ncore_per_chip > 1)
2629 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2630 			    cpi->cpi_ncore_per_chip;
2631 		/*
2632 		 * 8bit APIC IDs on dual core Pentiums
2633 		 * look like this:
2634 		 *
2635 		 * +-----------------------+------+------+
2636 		 * | Physical Package ID   |  MC  |  HT  |
2637 		 * +-----------------------+------+------+
2638 		 * <------- chipid -------->
2639 		 * <------- coreid --------------->
2640 		 *			   <--- clogid -->
2641 		 *			   <------>
2642 		 *			   pkgcoreid
2643 		 *
2644 		 * Where the number of bits necessary to
2645 		 * represent MC and HT fields together is equal
2646 		 * to the minimum number of bits necessary to
2647 		 * store the value of cpi->cpi_ncpu_per_chip.
2648 		 * Of those bits, the MC part uses the number
2649 		 * of bits necessary to store the value of
2650 		 * cpi->cpi_ncore_per_chip.
2651 		 */
2652 		for (i = 1; i < ncpu_per_core; i <<= 1)
2653 			coreid_shift++;
2654 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2655 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2656 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2657 		/*
2658 		 * Single-core multi-threaded processors.
2659 		 */
2660 		cpi->cpi_coreid = cpi->cpi_chipid;
2661 		cpi->cpi_pkgcoreid = 0;
2662 	} else {
2663 		/*
2664 		 * Single-core single-thread processors.
2665 		 */
2666 		cpi->cpi_coreid = cpu->cpu_id;
2667 		cpi->cpi_pkgcoreid = 0;
2668 	}
2669 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2670 	cpi->cpi_compunitid = cpi->cpi_coreid;
2671 }
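
/*
 * Worked example of the legacy (no leaf 0xB) path above, using hypothetical
 * values: with cpi_ncpu_per_chip = 4, cpi_ncore_per_chip = 2, and
 * cpi_apicid = 0x6, we get chipid_shift = 2 and ncpu_per_core = 2, so
 * coreid_shift = 1. That yields cpi_chipid = 1, cpi_clogid = 2,
 * cpi_coreid = 3, and cpi_pkgcoreid = 1.
 */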
2672 
2673 /*
2674  * Historically, AMD has had CMP chips with only a single thread per core.
2675  * However, starting in family 17h (Zen), this has changed and they now have
2676  * multiple threads. Our internal core id needs to be a unique value.
2677  *
2678  * To determine the core id of an AMD system, if we're from a family before 17h,
2679  * then we just use the cpu id, as that gives us a good value that will be
2680  * unique for each core. If instead, we're on family 17h or later, then we need
2681  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2682  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2683  * We can't use the normal core id in that leaf as it's only unique within the
2684  * socket, which is perfect for cpi_pkgcoreid, but not us.
2685  */
2686 static id_t
2687 cpuid_amd_get_coreid(cpu_t *cpu)
2688 {
2689 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2690 
2691 	if (cpi->cpi_family >= 0x17 &&
2692 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2693 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2694 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2695 		if (nthreads > 1) {
2696 			VERIFY3U(nthreads, ==, 2);
2697 			return (cpi->cpi_apicid >> 1);
2698 		}
2699 	}
2700 
2701 	return (cpu->cpu_id);
2702 }
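
/*
 * For example (hypothetical values): on a family 0x19 part with SMT, leaf
 * 0x8000001e %ebx[15:8] = 1 gives nthreads = 2, so an APIC ID of 0xd maps to
 * core id 0xd >> 1 = 6. On a pre-17h part, or when the leaf is unavailable,
 * cpu->cpu_id is used directly.
 */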
2703 
2704 /*
2705  * Determining IDs on AMD is more challenging. This is notable because of the
2706  * following two facts:
2707  *
2708  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2709  *     also no way to get an actual unique core id from the system. As such, we
2710  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2711  *     however, guarantee that sibling cores of a chip will have sequential
2712  *     coreids starting at a multiple of the number of cores per chip - that is
2713  *     usually the case, but if the APIC IDs have been set up in a different
2714  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2715  *
2716  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2717  *     called compute units. These compute units share the L1I cache, L2 cache,
2718  *     and the FPU. To deal with this, a new topology leaf was added in
2719  *     0x8000001e. However, parts of this leaf have different meanings
2720  *     once we get to family 0x17.
2721  */
2722 
2723 static void
2724 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2725 {
2726 	int i, first_half, coreidsz;
2727 	uint32_t nb_caps_reg;
2728 	uint_t node2_1;
2729 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2730 	struct cpuid_regs *cp;
2731 
2732 	/*
2733 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2734 	 * hasn't been stripped by virtualization). We always set the compute
2735 	 * unit id to the same value. Also, initialize the default number of
2736 	 * cores per compute unit and nodes per package. This will be
2737 	 * overwritten when we know information about a particular family.
2738 	 */
2739 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2740 	cpi->cpi_compunitid = cpi->cpi_coreid;
2741 	cpi->cpi_cores_per_compunit = 1;
2742 	cpi->cpi_procnodes_per_pkg = 1;
2743 
2744 	/*
2745 	 * To construct the logical ID, we need to determine how many APIC IDs
2746 	 * are dedicated to the cores and threads. This is provided for us in
2747 	 * 0x80000008. However, if it's not present (say due to virtualization),
2748 	 * then we assume it's one. This should be present on all 64-bit AMD
2749 	 * processors.  It was added in family 0xf (Hammer).
2750 	 */
2751 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2752 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2753 
2754 		/*
2755 		 * In AMD parlance chip is really a node while illumos
2756 		 * uses chip as equivalent to socket/package.
2757 		 */
2758 		if (coreidsz == 0) {
2759 			/* Use legacy method */
2760 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2761 				coreidsz++;
2762 			if (coreidsz == 0)
2763 				coreidsz = 1;
2764 		}
2765 	} else {
2766 		/* Assume single-core part */
2767 		coreidsz = 1;
2768 	}
2769 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2770 
2771 	/*
2772 	 * The package core ID varies depending on the family. While it may be
2773 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2774 	 * this value is the core id in the given node. For non-virtualized
2775 	 * family 17h, we need to take the logical core id and shift off the
2776 	 * threads like we do when getting the core id.  Otherwise, we can use
2777 	 * the clogid as is. When family 17h is virtualized and the leaf does
2778 	 * not contain valid data, we won't think we have SMT, in which case
2779 	 * the cpi_clogid alone is sufficient and needs no further
2780 	 * adjustment.
2781 	 */
2782 	if (cpi->cpi_family >= 0x17 &&
2783 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2784 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2785 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2786 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2787 		if (nthreads > 1) {
2788 			VERIFY3U(nthreads, ==, 2);
2789 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2790 		} else {
2791 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2792 		}
2793 	} else {
2794 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2795 	}
2796 
2797 	/*
2798 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2799 	 * (bulldozer) or newer, then we can derive all of this from leaf
2800 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2801 	 */
2802 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2803 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2804 		cp = &cpi->cpi_extd[0x1e];
2805 
2806 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2807 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2808 
2809 		/*
2810 		 * For Bulldozer-era CPUs, recalculate the compute unit
2811 		 * information.
2812 		 */
2813 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2814 			cpi->cpi_cores_per_compunit =
2815 			    BITX(cp->cp_ebx, 15, 8) + 1;
2816 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2817 			    (cpi->cpi_ncore_per_chip /
2818 			    cpi->cpi_cores_per_compunit) *
2819 			    (cpi->cpi_procnodeid /
2820 			    cpi->cpi_procnodes_per_pkg);
2821 		}
2822 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2823 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2824 	} else if (cpi->cpi_family == 0x10) {
2825 		/*
2826 		 * See if we are a multi-node processor.
2827 		 * All processors in the system have the same number of nodes
2828 		 */
2829 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2830 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2831 			/* Single-node */
2832 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2833 			    coreidsz);
2834 		} else {
2835 
2836 			/*
2837 			 * Multi-node revision D (2 nodes per package
2838 			 * are supported)
2839 			 */
2840 			cpi->cpi_procnodes_per_pkg = 2;
2841 
2842 			first_half = (cpi->cpi_pkgcoreid <=
2843 			    (cpi->cpi_ncore_per_chip/2 - 1));
2844 
2845 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2846 				/* We are BSP */
2847 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2848 			} else {
2849 
2850 				/* We are AP */
2851 				/* NodeId[2:1] bits to use for reading F3xe8 */
2852 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2853 
2854 				nb_caps_reg =
2855 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2856 
2857 				/*
2858 				 * Check IntNodeNum bit (31:30, but bit 31 is
2859 				 * always 0 on dual-node processors)
2860 				 */
2861 				if (BITX(nb_caps_reg, 30, 30) == 0)
2862 					cpi->cpi_procnodeid = node2_1 +
2863 					    !first_half;
2864 				else
2865 					cpi->cpi_procnodeid = node2_1 +
2866 					    first_half;
2867 			}
2868 		}
2869 	} else {
2870 		cpi->cpi_procnodeid = 0;
2871 	}
2872 
2873 	cpi->cpi_chipid =
2874 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2875 
2876 	cpi->cpi_ncore_bits = coreidsz;
2877 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2878 	    cpi->cpi_ncore_per_chip);
2879 }
2880 
2881 static void
2882 spec_uarch_flush_noop(void)
2883 {
2884 }
2885 
2886 /*
2887  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2888  * MDS-related micro-architectural state that would normally happen by calling
2889  * x86_md_clear().
2890  */
2891 static void
2892 spec_uarch_flush_msr(void)
2893 {
2894 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2895 }
2896 
2897 /*
2898  * This function points to a function that will flush certain
2899  * micro-architectural state on the processor. This flush is used to mitigate
2900  * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2901  * This function can point to one of three functions:
2902  *
2903  * - A noop, which is used either because we are vulnerable but do not have
2904  *   microcode available to help deal with a fix, or because we are not
2905  *   vulnerable at all.
2906  *
2907  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2908  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2909  *   however, it only flushes the MDS related micro-architectural state on the
2910  *   current hyperthread, it does not do anything for the twin.
2911  *
2912  * - x86_md_clear which will flush the MDS related state. This is done when we
2913  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2914  *   (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2915  *   can clear it (RFDS_CLEAR is set).
2916  */
2917 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
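
/*
 * A hedged sketch of how this is consumed (the call sites themselves live
 * elsewhere): code that is about to drop to a lesser privilege level or halt
 * the CPU simply makes an indirect call through the pointer, e.g.
 *
 *	spec_uarch_flush();
 *
 * so the mitigation choice made below amounts to a single pointer assignment
 * and the callers need no per-CPU branching.
 */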
2918 
2919 static void
2920 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2921 {
2922 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2923 
2924 	/* Non-Intel doesn't concern us here. */
2925 	if (cpi->cpi_vendor != X86_VENDOR_Intel)
2926 		return;
2927 
2928 	/*
2929 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2930 	 * has been fixed in hardware, it doesn't cover everything related to
2931 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2932 	 * need to mitigate this.
2933 	 *
2934 	 * We must ALSO check the case of RFDS_NO and if RFDS_CLEAR is set,
2935 	 * because of the small cases of RFDS.
2936 	 */
2937 
2938 	if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2939 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2940 	    (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2941 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2942 		const uint8_t nop = NOP_INSTR;
2943 		uint8_t *md = (uint8_t *)x86_md_clear;
2944 
2945 		*md = nop;
2946 	}
2947 
2948 	membar_producer();
2949 }
2950 
2951 static void
2952 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2953 {
2954 	boolean_t need_l1d, need_mds, need_rfds;
2955 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2956 
2957 	/*
2958 	 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2959 	 * in hardware, then there's nothing left for us to do for enabling
2960 	 * the flush. We can also go ahead and say that SMT exclusion is
2961 	 * unnecessary.
2962 	 */
2963 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2964 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2965 	    is_x86_feature(featureset, X86FSET_MDS_NO) &&
2966 	    is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2967 		extern int smt_exclusion;
2968 		smt_exclusion = 0;
2969 		spec_uarch_flush = spec_uarch_flush_noop;
2970 		membar_producer();
2971 		return;
2972 	}
2973 
2974 	/*
2975 	 * The locations where we need to perform an L1D flush are required both
2976 	 * for mitigating L1TF and MDS. When verw support is present in
2977 	 * microcode, then the L1D flush will take care of doing that as well.
2978 	 * However, if we have a system where RDCL_NO is present, but we don't
2979 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2980 	 * L1D flush.
2981 	 */
2982 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2983 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2984 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2985 		need_l1d = B_TRUE;
2986 	} else {
2987 		need_l1d = B_FALSE;
2988 	}
2989 
2990 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2991 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2992 		need_mds = B_TRUE;
2993 	} else {
2994 		need_mds = B_FALSE;
2995 	}
2996 
2997 	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2998 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2999 		need_rfds = B_TRUE;
3000 	} else {
3001 		need_rfds = B_FALSE;
3002 	}
3003 
3004 	if (need_l1d) {
3005 		/*
3006 		 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
3007 		 * together. If the following VERIFY trips, we need to add
3008 		 * further fixes here.
3009 		 */
3010 		VERIFY(!need_rfds);
3011 		spec_uarch_flush = spec_uarch_flush_msr;
3012 	} else if (need_mds || need_rfds) {
3013 		spec_uarch_flush = x86_md_clear;
3014 	} else {
3015 		/*
3016 		 * We have no hardware mitigations available to us.
3017 		 */
3018 		spec_uarch_flush = spec_uarch_flush_noop;
3019 	}
3020 	membar_producer();
3021 }
3022 
3023 /*
3024  * Branch History Injection (BHI) mitigations.
3025  *
3026  * Intel has provided a software sequence that will scrub the BHB. Like RSB
3027  * (below) we can scribble a return at the beginning to avoid it if the CPU
3028  * is modern enough. We can also scribble a return if the CPU is old enough
3029  * to not have an RSB (pre-eIBRS).
3030  */
3031 typedef enum {
3032 	X86_BHI_TOO_OLD_OR_DISABLED,	/* Pre-eIBRS or disabled */
3033 	X86_BHI_NEW_ENOUGH,		/* AMD, or Intel with BHI_NO set */
3034 	X86_BHI_DIS_S,			/* BHI_NO == 0, but BHI_DIS_S avail. */
3035 	/* NOTE: BHI_DIS_S above will still need the software sequence. */
3036 	X86_BHI_SOFTWARE_SEQUENCE,	/* Use software sequence */
3037 } x86_native_bhi_mitigation_t;
3038 
3039 x86_native_bhi_mitigation_t x86_bhi_mitigation = X86_BHI_SOFTWARE_SEQUENCE;
3040 
3041 static void
3042 cpuid_enable_bhi_dis_s(void)
3043 {
3044 	uint64_t val;
3045 
3046 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3047 	val |= IA32_SPEC_CTRL_BHI_DIS_S;
3048 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3049 }
3050 
3051 /*
3052  * This function scribbles RET into the first instruction of x86_bhb_clear()
3053  * if SPECTREV2 mitigations are disabled, the CPU is too old to have an RSB
3054  * (pre-eIBRS), or the CPU is new enough not to need it (which includes
3055  * non-Intel CPUs). With only BHI_DIS_S, the software sequence stays active.
3056  */
3057 static x86_native_bhi_mitigation_t
3058 cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit, cpu_t *cpu,
3059     uchar_t *featureset)
3060 {
3061 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3062 	const uint8_t ret = RET_INSTR;
3063 	uint8_t *bhb_clear = (uint8_t *)x86_bhb_clear;
3064 
3065 	ASSERT0(cpu->cpu_id);
3066 
3067 	/* First check for explicitly disabled... */
3068 	if (v2mit == X86_SPECTREV2_DISABLED) {
3069 		*bhb_clear = ret;
3070 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3071 	}
3072 
3073 	/*
3074 	 * Then check for BHI_NO, which means the CPU doesn't have this bug,
3075 	 * or if it's non-Intel, in which case this mitigation mechanism
3076 	 * doesn't apply.
3077 	 */
3078 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
3079 	    is_x86_feature(featureset, X86FSET_BHI_NO)) {
3080 		*bhb_clear = ret;
3081 		return (X86_BHI_NEW_ENOUGH);
3082 	}
3083 
3084 	/*
3085 	 * Now check for the BHI_CTRL MSR, and then set it if available.
3086 	 * We will still need to use the software sequence, however.
3087 	 */
3088 	if (is_x86_feature(featureset, X86FSET_BHI_CTRL)) {
3089 		cpuid_enable_bhi_dis_s();
3090 		return (X86_BHI_DIS_S);
3091 	}
3092 
3093 	/*
3094 	 * Finally, check if we are too old to bother with RSB:
3095 	 */
3096 	if (v2mit == X86_SPECTREV2_RETPOLINE) {
3097 		*bhb_clear = ret;
3098 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3099 	}
3100 
3101 	ASSERT(*bhb_clear != ret);
3102 	return (X86_BHI_SOFTWARE_SEQUENCE);
3103 }
3104 
3105 /*
3106  * We default to enabling Return Stack Buffer (RSB) mitigations.
3107  *
3108  * We used to skip RSB mitigations with Intel eIBRS, but developments around
3109  * post-barrier RSB (PBRSB) prediction suggest we should enable Intel RSB
3110  * mitigations always unless explicitly bypassed, or unless hardware indicates
3111  * the bug has been fixed.
3112  *
3113  * The current decisions for using, or ignoring, a RSB software stuffing
3114  * sequence are expressed by the following table:
3115  *
3116  * +-------+------------+-----------------+--------+
3117  * | eIBRS |  PBRSB_NO  |  context switch | vmexit |
3118  * +-------+------------+-----------------+--------+
3119  * |   Yes |     No     |  stuff          | stuff  |
3120  * |   Yes |     Yes    |  ignore         | ignore |
3121  * |   No  |     No     |  stuff          | ignore |
3122  * +-------+------------+-----------------+--------+
3123  *
3124  * Note that if an Intel CPU has no eIBRS, it will never enumerate PBRSB_NO,
3125  * because machines with no eIBRS do not have a problem with PBRSB overflow.
3126  * See the Intel document cited below for details.
3127  *
3128  * Also note that AMD AUTO_IBRS has no PBRSB problem, so it is not included in
3129  * the table above, and that there is no situation where vmexit stuffing is
3130  * needed, but context-switch stuffing isn't.
3131  */
3132 
3133 /* BEGIN CSTYLED */
3134 /*
3135  * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/post-barrier-return-stack-buffer-predictions.html
3136  */
3137 /* END CSTYLED */
3138 
3139 /*
3140  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
3141  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
3142  * also states that as long as SMEP is enabled and we maintain at least one
3143  * page between the kernel and user space (we have much more of a red zone),
3144  * then we do not need to clear the RSB. We constrain this to only when
3145  * Automatic IBRS is present.
3146  */
3147 static void
3148 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
3149 {
3150 	const uint8_t ret = RET_INSTR;
3151 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3152 	uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
3153 
3154 	switch (mit) {
3155 	case X86_SPECTREV2_AUTO_IBRS:
3156 	case X86_SPECTREV2_DISABLED:
3157 		/* Don't bother with any RSB stuffing! */
3158 		*stuff = ret;
3159 		*vmx_stuff = ret;
3160 		break;
3161 	case X86_SPECTREV2_RETPOLINE:
3162 		/*
3163 		 * The Intel document on Post-Barrier RSB says that processors
3164 		 * without eIBRS do not have PBRSB problems upon VMEXIT.
3165 		 */
3166 		VERIFY(!intel_pbrsb_no);
3167 		VERIFY3U(*stuff, !=, ret);
3168 		*vmx_stuff = ret;
3169 		break;
3170 	default:
3171 		/*
3172 		 * eIBRS is all that's left.  If CPU claims PBRSB is fixed,
3173 		 * don't use the RSB mitigation in either case.  Otherwise
3174 		 * both vmexit and context-switching require the software
3175 		 * mitigation.
3176 		 */
3177 		if (intel_pbrsb_no) {
3178 			/* CPU claims PBRSB problems are fixed. */
3179 			*stuff = ret;
3180 			*vmx_stuff = ret;
3181 		}
3182 		VERIFY3U(*stuff, ==, *vmx_stuff);
3183 		break;
3184 	}
3185 }
3186 
3187 static void
3188 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3189 {
3190 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3191 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3192 	    "_r14", "_r15" };
3193 	const uint_t nthunks = ARRAY_SIZE(thunks);
3194 	const char *type;
3195 	uint_t i;
3196 
3197 	if (mit == x86_spectrev2_mitigation)
3198 		return;
3199 
3200 	switch (mit) {
3201 	case X86_SPECTREV2_RETPOLINE:
3202 		type = "gen";
3203 		break;
3204 	case X86_SPECTREV2_AUTO_IBRS:
3205 	case X86_SPECTREV2_ENHANCED_IBRS:
3206 	case X86_SPECTREV2_DISABLED:
3207 		type = "jmp";
3208 		break;
3209 	default:
3210 		panic("asked to update retpoline state with unknown state!");
3211 	}
3212 
3213 	for (i = 0; i < nthunks; i++) {
3214 		uintptr_t source, dest;
3215 		int ssize, dsize;
3216 		char sourcebuf[64], destbuf[64];
3217 
3218 		(void) snprintf(destbuf, sizeof (destbuf),
3219 		    "__x86_indirect_thunk%s", thunks[i]);
3220 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
3221 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
3222 
3223 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3224 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3225 		VERIFY3U(source, !=, 0);
3226 		VERIFY3U(dest, !=, 0);
3227 		VERIFY3S(dsize, >=, ssize);
3228 		bcopy((void *)source, (void *)dest, ssize);
3229 	}
3230 }
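
/*
 * Illustrative example of the symbol names constructed above (the names
 * follow directly from the snprintf() formats): for the "_rax" thunk, the
 * patch destination is "__x86_indirect_thunk_rax" and the source is
 * "__x86_indirect_thunk_gen_rax" when retpolines are in use, or
 * "__x86_indirect_thunk_jmp_rax" when enhanced/automatic IBRS is enabled or
 * the mitigation is disabled.
 */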
3231 
3232 static void
3233 cpuid_enable_enhanced_ibrs(void)
3234 {
3235 	uint64_t val;
3236 
3237 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3238 	val |= IA32_SPEC_CTRL_IBRS;
3239 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3240 }
3241 
3242 static void
3243 cpuid_enable_auto_ibrs(void)
3244 {
3245 	uint64_t val;
3246 
3247 	val = rdmsr(MSR_AMD_EFER);
3248 	val |= AMD_EFER_AIBRSE;
3249 	wrmsr(MSR_AMD_EFER, val);
3250 }
3251 
3252 /*
3253  *  AMD Zen 5 processors have a bug where the 16- and 32-bit forms of the
3254  *  RDSEED instruction can frequently return 0 despite indicating success
3255  *  (CF=1) - See AMD-SB-7055 / CVE-2025-62626.
3256  */
3257 static void
3258 cpuid_evaluate_amd_rdseed(cpu_t *cpu, uchar_t *featureset)
3259 {
3260 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3261 	struct cpuid_regs *ecp = &cpi->cpi_std[7];
3262 	uint32_t rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
3263 	uint64_t val;
3264 
3265 	ASSERT3U(cpi->cpi_vendor, ==, X86_VENDOR_AMD);
3266 	ASSERT(ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED);
3267 
3268 	/* This erratum only applies to the Zen5 uarch */
3269 	if (uarchrev_uarch(cpi->cpi_uarchrev) != X86_UARCH_AMD_ZEN5)
3270 		return;
3271 
3272 	/*
3273 	 * AMD-SB-7055 specifies microcode versions that mitigate this issue on
3274 	 * BRH-C1 and BRHD-B0. If we're on one of those chips and the microcode
3275 	 * version is new enough we can leave RDSEED enabled.
3276 	 */
3277 	if (chiprev_matches(cpi->cpi_chiprev, X86_CHIPREV_AMD_TURIN_C1) &&
3278 	    rev >= 0x0b00215a) {
3279 		return;
3280 	}
3281 	if (chiprev_matches(cpi->cpi_chiprev, X86_CHIPREV_AMD_DENSE_TURIN_B0) &&
3282 	    rev >= 0x0b101054) {
3283 		return;
3284 	}
3285 
3286 	/*
3287 	 * Go ahead and disable RDSEED on this boot.
3288 	 * In addition to removing it from the feature set and cached value, we
3289 	 * also need to remove it from the features returned by CPUID7 so that
3290 	 * userland programs performing their own feature detection will
3291 	 * determine it is not available.
3292 	 */
3293 	if (cpu->cpu_id == 0)
3294 		cmn_err(CE_WARN, "Masking unreliable RDSEED on this hardware");
3295 
3296 	remove_x86_feature(featureset, X86FSET_RDSEED);
3297 	ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
3298 
3299 	val = rdmsr(MSR_AMD_CPUID7_FEATURES);
3300 	val &= ~MSR_AMD_CPUID7_FEATURES_RDSEED;
3301 	wrmsr(MSR_AMD_CPUID7_FEATURES, val);
3302 }
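
/*
 * For example (revision values other than the thresholds above are
 * hypothetical): a BRH-C1 (Turin C1) system running microcode revision
 * 0x0b002159 would have RDSEED masked here, while one running 0x0b00215a or
 * newer would keep RDSEED advertised.
 */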
3303 
3304 /*
3305  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3306  * we can disable TSX, we do so.
3307  *
3308  * This determination is done only on the boot CPU, potentially after loading
3309  * updated microcode.
3310  */
3311 static void
3312 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3313 {
3314 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3315 
3316 	VERIFY(cpu->cpu_id == 0);
3317 
3318 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3319 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3320 		return;
3321 	}
3322 
3323 	if (x86_disable_taa) {
3324 		x86_taa_mitigation = X86_TAA_DISABLED;
3325 		return;
3326 	}
3327 
3328 	/*
3329 	 * If we do not have the ability to disable TSX, then our only
3330 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3331 	 * MDS mitigation as described above.  The latter relies upon us having
3332 	 * configured MDS mitigations correctly! This includes disabling SMT if
3333 	 * we want to cross-CPU-thread protection.
3334 	 * we want cross-CPU-thread protection.
3335 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3336 		/*
3337 		 * It's not clear whether any parts will enumerate TAA_NO
3338 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3339 		 */
3340 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3341 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3342 			return;
3343 		}
3344 
3345 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3346 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3347 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3348 		} else {
3349 			x86_taa_mitigation = X86_TAA_NOTHING;
3350 		}
3351 		return;
3352 	}
3353 
3354 	/*
3355 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3356 	 * enough in boot.
3357 	 *
3358 	 * Otherwise, we'll fall back to causing transactions to abort as our
3359 	 * mitigation. TSX-using code will always take the fallback path.
3360 	 */
3361 	if (cpi->cpi_pass < 4) {
3362 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3363 	} else {
3364 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3365 	}
3366 }
3367 
3368 /*
3369  * As mentioned, we should only touch the MSR when we've got a suitable
3370  * microcode loaded on this CPU.
3371  */
3372 static void
3373 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3374 {
3375 	uint64_t val;
3376 
3377 	switch (taa) {
3378 	case X86_TAA_TSX_DISABLE:
3379 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3380 			return;
3381 		val = rdmsr(MSR_IA32_TSX_CTRL);
3382 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3383 		wrmsr(MSR_IA32_TSX_CTRL, val);
3384 		break;
3385 	case X86_TAA_TSX_FORCE_ABORT:
3386 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3387 			return;
3388 		val = rdmsr(MSR_IA32_TSX_CTRL);
3389 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3390 		wrmsr(MSR_IA32_TSX_CTRL, val);
3391 		break;
3392 	case X86_TAA_HW_MITIGATED:
3393 	case X86_TAA_MD_CLEAR:
3394 	case X86_TAA_DISABLED:
3395 	case X86_TAA_NOTHING:
3396 		break;
3397 	}
3398 }
3399 
3400 static void
3401 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3402 {
3403 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3404 	x86_spectrev2_mitigation_t v2mit;
3405 
3406 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3407 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3408 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3409 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3410 			add_x86_feature(featureset, X86FSET_IBPB);
3411 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3412 			add_x86_feature(featureset, X86FSET_IBRS);
3413 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3414 			add_x86_feature(featureset, X86FSET_STIBP);
3415 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3416 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3417 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3418 			add_x86_feature(featureset, X86FSET_SSBD);
3419 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3420 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3421 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3422 			add_x86_feature(featureset, X86FSET_SSB_NO);
3423 
3424 		/*
3425 		 * Rather than Enhanced IBRS, AMD has a different feature that
3426 		 * is a bit in EFER that can be enabled and will basically do
3427 		 * the right thing while executing in the kernel.
3428 		 */
3429 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3430 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3431 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3432 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3433 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3434 		}
3435 
3436 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3437 	    cpi->cpi_maxeax >= 7) {
3438 		struct cpuid_regs *ecp;
3439 		ecp = &cpi->cpi_std[7];
3440 
3441 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3442 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3443 		}
3444 
3445 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3446 			add_x86_feature(featureset, X86FSET_IBRS);
3447 			add_x86_feature(featureset, X86FSET_IBPB);
3448 		}
3449 
3450 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3451 			add_x86_feature(featureset, X86FSET_STIBP);
3452 		}
3453 
3454 		/*
3455 		 * Some prediction controls are enumerated by subleaf 2 of
3456 		 * leaf 7.
3457 		 */
3458 		if (CPI_FEATURES_7_2_EDX(cpi) & CPUID_INTC_EDX_7_2_BHI_CTRL) {
3459 			add_x86_feature(featureset, X86FSET_BHI_CTRL);
3460 		}
3461 
3462 		/*
3463 		 * Don't read the arch caps MSR on xpv where we lack the
3464 		 * on_trap().
3465 		 */
3466 #ifndef __xpv
3467 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3468 			on_trap_data_t otd;
3469 
3470 			/*
3471 			 * Be paranoid and assume we'll get a #GP.
3472 			 */
3473 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3474 				uint64_t reg;
3475 
3476 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3477 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3478 					add_x86_feature(featureset,
3479 					    X86FSET_RDCL_NO);
3480 				}
3481 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3482 					add_x86_feature(featureset,
3483 					    X86FSET_IBRS_ALL);
3484 				}
3485 				if (reg & IA32_ARCH_CAP_RSBA) {
3486 					add_x86_feature(featureset,
3487 					    X86FSET_RSBA);
3488 				}
3489 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3490 					add_x86_feature(featureset,
3491 					    X86FSET_L1D_VM_NO);
3492 				}
3493 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3494 					add_x86_feature(featureset,
3495 					    X86FSET_SSB_NO);
3496 				}
3497 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3498 					add_x86_feature(featureset,
3499 					    X86FSET_MDS_NO);
3500 				}
3501 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3502 					add_x86_feature(featureset,
3503 					    X86FSET_TSX_CTRL);
3504 				}
3505 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3506 					add_x86_feature(featureset,
3507 					    X86FSET_TAA_NO);
3508 				}
3509 				if (reg & IA32_ARCH_CAP_RFDS_NO) {
3510 					add_x86_feature(featureset,
3511 					    X86FSET_RFDS_NO);
3512 				}
3513 				if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3514 					add_x86_feature(featureset,
3515 					    X86FSET_RFDS_CLEAR);
3516 				}
3517 				if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3518 					add_x86_feature(featureset,
3519 					    X86FSET_PBRSB_NO);
3520 				}
3521 				if (reg & IA32_ARCH_CAP_BHI_NO) {
3522 					add_x86_feature(featureset,
3523 					    X86FSET_BHI_NO);
3524 				}
3525 			}
3526 			no_trap();
3527 		}
3528 #endif	/* !__xpv */
3529 
3530 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3531 			add_x86_feature(featureset, X86FSET_SSBD);
3532 
3533 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3534 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3535 	}
3536 
3537 	/*
3538 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3539 	 * will have already run this function and determined what we need to
3540 	 * do. This gives us a hook for per-HW thread mitigations such as
3541 	 * enhanced IBRS, or disabling TSX.
3542 	 */
3543 	if (cpu->cpu_id != 0) {
3544 		switch (x86_spectrev2_mitigation) {
3545 		case X86_SPECTREV2_ENHANCED_IBRS:
3546 			cpuid_enable_enhanced_ibrs();
3547 			break;
3548 		case X86_SPECTREV2_AUTO_IBRS:
3549 			cpuid_enable_auto_ibrs();
3550 			break;
3551 		default:
3552 			break;
3553 		}
3554 
3555 		/* If we're committed to BHI_DIS_S, set it for this core. */
3556 		if (x86_bhi_mitigation == X86_BHI_DIS_S)
3557 			cpuid_enable_bhi_dis_s();
3558 
3559 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3560 		return;
3561 	}
3562 
3563 	/*
3564 	 * Go through and initialize various security mechanisms that we should
3565 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3566 	 * TAA.
3567 	 */
3568 
3569 	/*
3570 	 * By default we've come in with retpolines enabled. Check whether we
3571 	 * should disable them or enable enhanced or automatic IBRS.
3572 	 *
3573 	 * Note, we do not allow the use of AMD optimized retpolines as it was
3574 	 * disclosed by AMD in March 2022 that they were still
3575 	 * vulnerable. Prior to that point, we used them.
3576 	 */
3577 	if (x86_disable_spectrev2 != 0) {
3578 		v2mit = X86_SPECTREV2_DISABLED;
3579 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3580 		cpuid_enable_auto_ibrs();
3581 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3582 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3583 		cpuid_enable_enhanced_ibrs();
3584 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3585 	} else {
3586 		v2mit = X86_SPECTREV2_RETPOLINE;
3587 	}
3588 
3589 	cpuid_patch_retpolines(v2mit);
3590 	cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3591 	x86_bhi_mitigation = cpuid_learn_and_patch_bhi(v2mit, cpu, featureset);
3592 	x86_spectrev2_mitigation = v2mit;
3593 	membar_producer();
3594 
3595 	/*
3596 	 * We need to determine what changes are required for mitigating L1TF
3597 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3598 	 * is required.
3599 	 *
3600 	 * If any of these are present, then we need to flush u-arch state at
3601 	 * various points. For MDS, we need to do so whenever we change to a
3602 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3603 	 * flush the L1D cache at VM entry. When we have microcode that handles
3604 	 * MDS, the L1D flush also clears the other u-arch state that the
3605 	 * md_clear does.
3606 	 */
3607 
3608 	/*
3609 	 * Update whether or not we need to be taking explicit action against
3610 	 * MDS or RFDS.
3611 	 */
3612 	cpuid_update_md_clear(cpu, featureset);
3613 
3614 	/*
3615 	 * Determine whether SMT exclusion is required and whether or not we
3616 	 * need to perform an l1d flush.
3617 	 */
3618 	cpuid_update_l1d_flush(cpu, featureset);
3619 
3620 	/*
3621 	 * Determine what our mitigation strategy should be for TAA and then
3622 	 * also apply TAA mitigations.
3623 	 */
3624 	cpuid_update_tsx(cpu, featureset);
3625 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3626 }
3627 
3628 /*
3629  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3630  */
3631 void
3632 setup_xfem(void)
3633 {
3634 	uint64_t flags = XFEATURE_LEGACY_FP;
3635 
3636 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3637 
3638 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3639 		flags |= XFEATURE_SSE;
3640 
3641 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3642 		flags |= XFEATURE_AVX;
3643 
3644 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3645 		flags |= XFEATURE_AVX512;
3646 
3647 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3648 
3649 	xsave_bv_all = flags;
3650 }
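
/*
 * Worked example (assuming the XFEATURE_* constants follow the architectural
 * XCR0 bit layout: x87 = bit 0, SSE = bit 1, AVX = bit 2, and the three
 * AVX-512 state components = bits 5-7): on a CPU with SSE and AVX but no
 * AVX-512, XCR0 is set to 0x7; with AVX512F as well it becomes 0xe7.
 */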
3651 
3652 static void
3653 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3654 {
3655 	struct cpuid_info *cpi;
3656 
3657 	cpi = cpu->cpu_m.mcpu_cpi;
3658 
3659 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3660 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3661 		cpuid_gather_amd_topology_leaves(cpu);
3662 	}
3663 
3664 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3665 
3666 	/*
3667 	 * Before we can calculate the IDs that we should assign to this
3668 	 * processor, we need to understand how many cores and threads it has.
3669 	 */
3670 	switch (cpi->cpi_vendor) {
3671 	case X86_VENDOR_Intel:
3672 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3673 		    &cpi->cpi_ncore_per_chip);
3674 		break;
3675 	case X86_VENDOR_AMD:
3676 	case X86_VENDOR_HYGON:
3677 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3678 		    &cpi->cpi_ncore_per_chip);
3679 		break;
3680 	default:
3681 		/*
3682 		 * If we have some other x86 compatible chip, it's not clear how
3683 		 * they would behave. The most common case is virtualization
3684 		 * today, though there are also 64-bit VIA chips. Assume that
3685 		 * all we can get is the basic Leaf 1 HTT information.
3686 		 */
3687 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3688 			cpi->cpi_ncore_per_chip = 1;
3689 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3690 		}
3691 		break;
3692 	}
3693 
3694 	/*
3695 	 * Based on the calculated number of threads and cores, potentially
3696 	 * assign the HTT and CMP features.
3697 	 */
3698 	if (cpi->cpi_ncore_per_chip > 1) {
3699 		add_x86_feature(featureset, X86FSET_CMP);
3700 	}
3701 
3702 	if (cpi->cpi_ncpu_per_chip > 1 &&
3703 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3704 		add_x86_feature(featureset, X86FSET_HTT);
3705 	}
3706 
3707 	/*
3708 	 * Now that has been set up, we need to go through and calculate all of
3709 	 * the rest of the parameters that exist. If we think the CPU doesn't
3710 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3711 	 * up information in some way. The most likely case for this is
3712 	 * virtualization where we have a lot of partial topology information.
3713 	 */
3714 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3715 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3716 		/*
3717 		 * This is a single core, single-threaded processor.
3718 		 */
3719 		cpi->cpi_procnodes_per_pkg = 1;
3720 		cpi->cpi_cores_per_compunit = 1;
3721 		cpi->cpi_compunitid = 0;
3722 		cpi->cpi_chipid = -1;
3723 		cpi->cpi_clogid = 0;
3724 		cpi->cpi_coreid = cpu->cpu_id;
3725 		cpi->cpi_pkgcoreid = 0;
3726 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3727 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3728 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3729 		} else {
3730 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3731 		}
3732 	} else {
3733 		switch (cpi->cpi_vendor) {
3734 		case X86_VENDOR_Intel:
3735 			cpuid_intel_getids(cpu, featureset);
3736 			break;
3737 		case X86_VENDOR_AMD:
3738 		case X86_VENDOR_HYGON:
3739 			cpuid_amd_getids(cpu, featureset);
3740 			break;
3741 		default:
3742 			/*
3743 			 * In this case, it's hard to say what we should do.
3744 			 * We're going to model them to the OS as single core
3745 			 * threads. We don't have a good identifier for them, so
3746 			 * we're just going to use the cpu id all on a single
3747 			 * chip.
3748 			 *
3749 			 * This case has historically been different from the
3750 			 * case above where we don't have HTT or CMP. While they
3751 			 * could be combined, we've opted to keep it separate to
3752 			 * minimize the risk of topology changes in weird cases.
3753 			 */
3754 			cpi->cpi_procnodes_per_pkg = 1;
3755 			cpi->cpi_cores_per_compunit = 1;
3756 			cpi->cpi_chipid = 0;
3757 			cpi->cpi_coreid = cpu->cpu_id;
3758 			cpi->cpi_clogid = cpu->cpu_id;
3759 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3760 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3761 			cpi->cpi_compunitid = cpi->cpi_coreid;
3762 			break;
3763 		}
3764 	}
3765 }
3766 
3767 /*
3768  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3769  * always gather leaf 6 if it's supported; however, we only look for features on
3770  * Intel systems as AMD does not currently define any of the features we look
3771  * for below.
3772  */
3773 static void
3774 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3775 {
3776 	struct cpuid_regs *cp;
3777 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3778 
3779 	if (cpi->cpi_maxeax < 6) {
3780 		return;
3781 	}
3782 
3783 	cp = &cpi->cpi_std[6];
3784 	cp->cp_eax = 6;
3785 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3786 	(void) __cpuid_insn(cp);
3787 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3788 
3789 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3790 		return;
3791 	}
3792 
3793 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3794 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3795 	}
3796 
3797 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3798 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3799 	}
3800 }
3801 
3802 /*
3803  * This is used when we discover that we have AVX support in cpuid. This
3804  * proceeds to scan for the rest of the AVX derived features.
3805  */
3806 static void
3807 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3808 {
3809 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3810 
3811 	/*
3812 	 * If we don't have AVX, don't bother with most of this.
3813 	 */
3814 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3815 		return;
3816 
3817 	add_x86_feature(featureset, X86FSET_AVX);
3818 
3819 	/*
3820 	 * Intel says we can't check these without also
3821 	 * checking AVX.
3822 	 */
3823 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3824 		add_x86_feature(featureset, X86FSET_F16C);
3825 
3826 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3827 		add_x86_feature(featureset, X86FSET_FMA);
3828 
3829 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3830 		add_x86_feature(featureset, X86FSET_BMI1);
3831 
3832 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3833 		add_x86_feature(featureset, X86FSET_BMI2);
3834 
3835 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3836 		add_x86_feature(featureset, X86FSET_AVX2);
3837 
3838 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3839 		add_x86_feature(featureset, X86FSET_VAES);
3840 
3841 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3842 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3843 
3844 	/*
3845 	 * The rest of the AVX features require AVX512. Do not check them unless
3846 	 * it is present.
3847 	 */
3848 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3849 		return;
3850 	add_x86_feature(featureset, X86FSET_AVX512F);
3851 
3852 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3853 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3854 
3855 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3856 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3857 
3858 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3859 		add_x86_feature(featureset, X86FSET_AVX512PF);
3860 
3861 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3862 		add_x86_feature(featureset, X86FSET_AVX512ER);
3863 
3864 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3865 		add_x86_feature(featureset, X86FSET_AVX512CD);
3866 
3867 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3868 		add_x86_feature(featureset, X86FSET_AVX512BW);
3869 
3870 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3871 		add_x86_feature(featureset, X86FSET_AVX512VL);
3872 
3873 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3874 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3875 
3876 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3877 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3878 
3879 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3880 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3881 
3882 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3883 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3884 
3885 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3886 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3887 
3888 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3889 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3890 
3891 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3892 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3893 
3894 	/*
3895 	 * More features are in leaf 7, subleaf 1. Don't bother checking it
3896 	 * if the CPU doesn't report that subleaf.
3897 	 */
3898 	if (cpi->cpi_std[7].cp_eax < 1)
3899 		return;
3900 
3901 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3902 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3903 }
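/*
 * For illustration: the checks above deliberately form a dependency chain
 * (XSAVE -> AVX -> AVX512F) so that a dependent bit is only believed when its
 * prerequisites are present. Consumers elsewhere in the kernel then simply
 * test the derived feature set, e.g.
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_AVX2))
 *		... use the VEX-encoded code path ...
 *
 * rather than re-deriving the chain from raw cpuid state.
 */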
3904 
3905 /*
3906  * PPIN is the protected processor inventory number. On AMD this is an actual
3907  * feature bit. However, on Intel systems we need to read the platform
3908  * information MSR if we're on a specific model.
3909  */
3910 #if !defined(__xpv)
3911 static void
3912 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3913 {
3914 	on_trap_data_t otd;
3915 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3916 
3917 	switch (cpi->cpi_vendor) {
3918 	case X86_VENDOR_AMD:
3919 		/*
3920 		 * This leaf will have already been gathered in the topology
3921 		 * functions.
3922 		 */
3923 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3924 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3925 				add_x86_feature(featureset, X86FSET_PPIN);
3926 			}
3927 		}
3928 		break;
3929 	case X86_VENDOR_Intel:
3930 		if (cpi->cpi_family != 6)
3931 			break;
3932 		switch (cpi->cpi_model) {
3933 		case INTC_MODEL_IVYBRIDGE_XEON:
3934 		case INTC_MODEL_HASWELL_XEON:
3935 		case INTC_MODEL_BROADWELL_XEON:
3936 		case INTC_MODEL_BROADWELL_XEON_D:
3937 		case INTC_MODEL_SKYLAKE_XEON:
3938 		case INTC_MODEL_ICELAKE_XEON:
3939 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3940 				uint64_t value;
3941 
3942 				value = rdmsr(MSR_PLATFORM_INFO);
3943 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3944 					add_x86_feature(featureset,
3945 					    X86FSET_PPIN);
3946 				}
3947 			}
3948 			no_trap();
3949 			break;
3950 		default:
3951 			break;
3952 		}
3953 		break;
3954 	default:
3955 		break;
3956 	}
3957 }
3958 #endif	/* ! __xpv */
3959 
3960 static void
3961 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3962 {
3963 	uchar_t *featureset = (uchar_t *)arg;
3964 
3965 	/*
3966 	 * We don't run on any processor that lacks cpuid; we could not possibly
3967 	 * have arrived here otherwise.
3968 	 */
3969 	add_x86_feature(featureset, X86FSET_CPUID);
3970 }
3971 
3972 static void
3973 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3974 {
3975 	struct cpuid_info *cpi;
3976 	struct cpuid_regs *cp;
3977 
3978 	/*
3979 	 * We require that virtual/native detection be complete and that PCI
3980 	 * config space access has been set up; at present there is no reliable
3981 	 * way to determine the latter.
3982 	 */
3983 #if !defined(__xpv)
3984 	ASSERT3S(platform_type, !=, -1);
3985 #endif	/* !__xpv */
3986 
3987 	cpi = cpu->cpu_m.mcpu_cpi;
3988 	ASSERT(cpi != NULL);
3989 
3990 	cp = &cpi->cpi_std[0];
3991 	cp->cp_eax = 0;
3992 	cpi->cpi_maxeax = __cpuid_insn(cp);
3993 	{
3994 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3995 		*iptr++ = cp->cp_ebx;
3996 		*iptr++ = cp->cp_edx;
3997 		*iptr++ = cp->cp_ecx;
3998 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3999 	}
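	/*
	 * Note the register order above: leaf 0 returns the 12-byte vendor
	 * string split across %ebx, %edx and %ecx, in that order. On Intel
	 * parts, for example, %ebx = "Genu", %edx = "ineI" and %ecx = "ntel",
	 * yielding "GenuineIntel".
	 */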
4000 
4001 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
4002 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
4003 
4004 	/*
4005 	 * Limit the range in case of weird hardware
4006 	 */
4007 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
4008 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
4009 	if (cpi->cpi_maxeax < 1)
4010 		return;
4011 
4012 	cp = &cpi->cpi_std[1];
4013 	cp->cp_eax = 1;
4014 	(void) __cpuid_insn(cp);
4015 
4016 	/*
4017 	 * Extract identifying constants for easy access.
4018 	 */
4019 	cpi->cpi_model = CPI_MODEL(cpi);
4020 	cpi->cpi_family = CPI_FAMILY(cpi);
4021 
4022 	if (cpi->cpi_family == 0xf)
4023 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
4024 
4025 	/*
4026 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
4027 	 * Intel, and presumably everyone else, uses model == 0xf, as
4028 	 * one would expect (max value means possible overflow).  Sigh.
4029 	 */
4030 
4031 	switch (cpi->cpi_vendor) {
4032 	case X86_VENDOR_Intel:
4033 		if (IS_EXTENDED_MODEL_INTEL(cpi))
4034 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4035 		break;
4036 	case X86_VENDOR_AMD:
4037 		if (CPI_FAMILY(cpi) == 0xf)
4038 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4039 		break;
4040 	case X86_VENDOR_HYGON:
4041 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4042 		break;
4043 	default:
4044 		if (cpi->cpi_model == 0xf)
4045 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4046 		break;
4047 	}
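	/*
	 * As a worked example of the adjustment above: an Intel family 6 part
	 * with base model 0xa and extended model 0x5 ends up with cpi_model
	 * 0x5a, matching the "display model" convention in the vendor manuals.
	 */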
4048 
4049 	cpi->cpi_step = CPI_STEP(cpi);
4050 	cpi->cpi_brandid = CPI_BRANDID(cpi);
4051 
4052 	/*
4053 	 * Synthesize chip "revision" and socket type
4054 	 */
4055 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4056 	    cpi->cpi_model, cpi->cpi_step);
4057 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4058 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4059 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4060 	    cpi->cpi_model, cpi->cpi_step);
4061 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
4062 	    cpi->cpi_model, cpi->cpi_step);
4063 }
4064 
4065 static void
4066 cpuid_pass_basic(cpu_t *cpu, void *arg)
4067 {
4068 	uchar_t *featureset = (uchar_t *)arg;
4069 	uint32_t mask_ecx, mask_edx;
4070 	struct cpuid_info *cpi;
4071 	struct cpuid_regs *cp;
4072 	int xcpuid;
4073 #if !defined(__xpv)
4074 	extern int idle_cpu_prefer_mwait;
4075 #endif
4076 
4077 	cpi = cpu->cpu_m.mcpu_cpi;
4078 	ASSERT(cpi != NULL);
4079 
4080 	if (cpi->cpi_maxeax < 1)
4081 		return;
4082 
4083 	/*
4084 	 * This was filled during the identification pass.
4085 	 */
4086 	cp = &cpi->cpi_std[1];
4087 
4088 	/*
4089 	 * *default* assumptions:
4090 	 * - believe %edx feature word
4091 	 * - ignore %ecx feature word
4092 	 * - 32-bit virtual and physical addressing
4093 	 */
4094 	mask_edx = 0xffffffff;
4095 	mask_ecx = 0;
4096 
4097 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
4098 
4099 	switch (cpi->cpi_vendor) {
4100 	case X86_VENDOR_Intel:
4101 		if (cpi->cpi_family == 5)
4102 			x86_type = X86_TYPE_P5;
4103 		else if (IS_LEGACY_P6(cpi)) {
4104 			x86_type = X86_TYPE_P6;
4105 			pentiumpro_bug4046376 = 1;
4106 			/*
4107 			 * Clear the SEP bit when it was set erroneously
4108 			 */
4109 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
4110 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
4111 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
4112 			x86_type = X86_TYPE_P4;
4113 			/*
4114 			 * We don't currently depend on any of the %ecx
4115 			 * features until Prescott, so we'll only check
4116 			 * this from P4 onwards.  We might want to revisit
4117 			 * that idea later.
4118 			 */
4119 			mask_ecx = 0xffffffff;
4120 		} else if (cpi->cpi_family > 0xf)
4121 			mask_ecx = 0xffffffff;
4122 		/*
4123 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4124 		 * to obtain the monitor linesize.
4125 		 */
4126 		if (cpi->cpi_maxeax < 5)
4127 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4128 		break;
4129 	case X86_VENDOR_IntelClone:
4130 	default:
4131 		break;
4132 	case X86_VENDOR_AMD:
4133 #if defined(OPTERON_ERRATUM_108)
4134 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
4135 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
4136 			cpi->cpi_model = 0xc;
4137 		} else
4138 #endif
4139 		if (cpi->cpi_family == 5) {
4140 			/*
4141 			 * AMD K5 and K6
4142 			 *
4143 			 * These CPUs have an incomplete implementation
4144 			 * of MCA/MCE which we mask away.
4145 			 */
4146 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
4147 
4148 			/*
4149 			 * Model 0 uses the wrong (APIC) bit
4150 			 * to indicate PGE.  Fix it here.
4151 			 */
4152 			if (cpi->cpi_model == 0) {
4153 				if (cp->cp_edx & 0x200) {
4154 					cp->cp_edx &= ~0x200;
4155 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
4156 				}
4157 			}
4158 
4159 			/*
4160 			 * Early models had problems w/ MMX; disable.
4161 			 */
4162 			if (cpi->cpi_model < 6)
4163 				mask_edx &= ~CPUID_INTC_EDX_MMX;
4164 		}
4165 
4166 		/*
4167 		 * For newer families, SSE3 and CX16, at least, are valid;
4168 		 * enable all
4169 		 */
4170 		if (cpi->cpi_family >= 0xf)
4171 			mask_ecx = 0xffffffff;
4172 		/*
4173 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4174 		 * to obtain the monitor linesize.
4175 		 */
4176 		if (cpi->cpi_maxeax < 5)
4177 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4178 
4179 #if !defined(__xpv)
4180 		/*
4181 		 * AMD has not historically used MWAIT in the CPU's idle loop.
4182 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
4183 		 * know for certain that in at least family 17h, per AMD, mwait
4184 		 * is preferred. Families in-between are less certain.
4185 		 */
4186 		if (cpi->cpi_family < 0x17) {
4187 			idle_cpu_prefer_mwait = 0;
4188 		}
4189 #endif
4190 
4191 		break;
4192 	case X86_VENDOR_HYGON:
4193 		/* Enable all for Hygon Dhyana CPU */
4194 		mask_ecx = 0xffffffff;
4195 		break;
4196 	case X86_VENDOR_TM:
4197 		/*
4198 		 * workaround the NT workaround in CMS 4.1
4199 		 */
4200 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
4201 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
4202 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4203 		break;
4204 	case X86_VENDOR_Centaur:
4205 		/*
4206 		 * workaround the NT workarounds again
4207 		 */
4208 		if (cpi->cpi_family == 6)
4209 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4210 		break;
4211 	case X86_VENDOR_Cyrix:
4212 		/*
4213 		 * We rely heavily on the probing in locore
4214 		 * to actually figure out what parts, if any,
4215 		 * of the Cyrix cpuid instruction to believe.
4216 		 */
4217 		switch (x86_type) {
4218 		case X86_TYPE_CYRIX_486:
4219 			mask_edx = 0;
4220 			break;
4221 		case X86_TYPE_CYRIX_6x86:
4222 			mask_edx = 0;
4223 			break;
4224 		case X86_TYPE_CYRIX_6x86L:
4225 			mask_edx =
4226 			    CPUID_INTC_EDX_DE |
4227 			    CPUID_INTC_EDX_CX8;
4228 			break;
4229 		case X86_TYPE_CYRIX_6x86MX:
4230 			mask_edx =
4231 			    CPUID_INTC_EDX_DE |
4232 			    CPUID_INTC_EDX_MSR |
4233 			    CPUID_INTC_EDX_CX8 |
4234 			    CPUID_INTC_EDX_PGE |
4235 			    CPUID_INTC_EDX_CMOV |
4236 			    CPUID_INTC_EDX_MMX;
4237 			break;
4238 		case X86_TYPE_CYRIX_GXm:
4239 			mask_edx =
4240 			    CPUID_INTC_EDX_MSR |
4241 			    CPUID_INTC_EDX_CX8 |
4242 			    CPUID_INTC_EDX_CMOV |
4243 			    CPUID_INTC_EDX_MMX;
4244 			break;
4245 		case X86_TYPE_CYRIX_MediaGX:
4246 			break;
4247 		case X86_TYPE_CYRIX_MII:
4248 		case X86_TYPE_VIA_CYRIX_III:
4249 			mask_edx =
4250 			    CPUID_INTC_EDX_DE |
4251 			    CPUID_INTC_EDX_TSC |
4252 			    CPUID_INTC_EDX_MSR |
4253 			    CPUID_INTC_EDX_CX8 |
4254 			    CPUID_INTC_EDX_PGE |
4255 			    CPUID_INTC_EDX_CMOV |
4256 			    CPUID_INTC_EDX_MMX;
4257 			break;
4258 		default:
4259 			break;
4260 		}
4261 		break;
4262 	}
4263 
4264 #if defined(__xpv)
4265 	/*
4266 	 * Do not support MONITOR/MWAIT under a hypervisor
4267 	 */
4268 	mask_ecx &= ~CPUID_INTC_ECX_MON;
4269 	/*
4270 	 * Do not support XSAVE under a hypervisor for now
4271 	 */
4272 	xsave_force_disable = B_TRUE;
4273 
4274 #endif	/* __xpv */
4275 
4276 	if (xsave_force_disable) {
4277 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4278 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
4279 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
4280 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
4281 	}
4282 
4283 	/*
4284 	 * Now that we've figured out the masks that determine which
4285 	 * bits we choose to believe, apply them to the feature
4286 	 * words, then map the hardware's feature words into the
4287 	 * kernel's feature set.
4288 	 */
4289 	cp->cp_edx &= mask_edx;
4290 	cp->cp_ecx &= mask_ecx;
4291 
4292 	/*
4293 	 * apply any platform restrictions (we don't call this
4294 	 * immediately after __cpuid_insn here, because we need the
4295 	 * workarounds applied above first)
4296 	 */
4297 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4298 
4299 	/*
4300 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4301 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4302 	 * 7 has sub-leaves determined by ecx.
4303 	 */
4304 	if (cpi->cpi_maxeax >= 7) {
4305 		struct cpuid_regs *ecp;
4306 		ecp = &cpi->cpi_std[7];
4307 		ecp->cp_eax = 7;
4308 		ecp->cp_ecx = 0;
4309 		(void) __cpuid_insn(ecp);
4310 
4311 		/*
4312 		 * If XSAVE has been disabled, just ignore all of the
4313 		 * extended-save-area dependent flags here. Removing most of
4314 		 * the leaf 7, sub-leaf 0 flags ensures that we don't end up
4315 		 * looking at additional XSAVE-dependent leaves right
4316 		 * now.
4317 		 */
4318 		if (xsave_force_disable) {
4319 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4320 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4321 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4322 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4323 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4324 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4325 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4326 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4327 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4328 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4329 		}
4330 
4331 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4332 			add_x86_feature(featureset, X86FSET_SMEP);
4333 
4334 		/*
4335 		 * We check disable_smap here in addition to in startup_smap()
4336 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4337 		 * include it in the feature set and thus generate a mismatched
4338 		 * x86 feature set across CPUs.
4339 		 */
4340 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4341 		    disable_smap == 0)
4342 			add_x86_feature(featureset, X86FSET_SMAP);
4343 
4344 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) {
4345 			add_x86_feature(featureset, X86FSET_RDSEED);
4346 			if (cpi->cpi_vendor == X86_VENDOR_AMD)
4347 				cpuid_evaluate_amd_rdseed(cpu, featureset);
4348 		}
4349 
4350 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4351 			add_x86_feature(featureset, X86FSET_ADX);
4352 
4353 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4354 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4355 
4356 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4357 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4358 
4359 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4360 			add_x86_feature(featureset, X86FSET_INVPCID);
4361 
4362 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4363 			add_x86_feature(featureset, X86FSET_UMIP);
4364 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4365 			add_x86_feature(featureset, X86FSET_PKU);
4366 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4367 			add_x86_feature(featureset, X86FSET_OSPKE);
4368 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4369 			add_x86_feature(featureset, X86FSET_GFNI);
4370 
4371 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4372 			add_x86_feature(featureset, X86FSET_CLWB);
4373 
4374 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4375 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4376 				add_x86_feature(featureset, X86FSET_MPX);
4377 		}
4378 
4379 		/*
4380 		 * If subleaf 1 or 2 is available, grab and store it. These
4381 		 * subleaves provide more AVX and related features.
4382 		 */
4383 		if (ecp->cp_eax >= 1) {
4384 			struct cpuid_regs *c71;
4385 			c71 = &cpi->cpi_sub7[0];
4386 			c71->cp_eax = 7;
4387 			c71->cp_ecx = 1;
4388 			(void) __cpuid_insn(c71);
4389 		}
4390 
4391 		/* Subleaf 2 has certain security indicators in it. */
4392 		if (ecp->cp_eax >= 2) {
4393 			struct cpuid_regs *c72;
4394 			c72 = &cpi->cpi_sub7[1];
4395 			c72->cp_eax = 7;
4396 			c72->cp_ecx = 2;
4397 			(void) __cpuid_insn(c72);
4398 		}
4399 	}
4400 
4401 	/*
4402 	 * fold in overrides from the "eeprom" mechanism
4403 	 */
4404 	cp->cp_edx |= cpuid_feature_edx_include;
4405 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4406 
4407 	cp->cp_ecx |= cpuid_feature_ecx_include;
4408 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4409 
4410 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4411 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4412 	}
4413 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4414 		add_x86_feature(featureset, X86FSET_TSC);
4415 	}
4416 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4417 		add_x86_feature(featureset, X86FSET_MSR);
4418 	}
4419 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4420 		add_x86_feature(featureset, X86FSET_MTRR);
4421 	}
4422 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4423 		add_x86_feature(featureset, X86FSET_PGE);
4424 	}
4425 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4426 		add_x86_feature(featureset, X86FSET_CMOV);
4427 	}
4428 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4429 		add_x86_feature(featureset, X86FSET_MMX);
4430 	}
4431 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4432 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4433 		add_x86_feature(featureset, X86FSET_MCA);
4434 	}
4435 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4436 		add_x86_feature(featureset, X86FSET_PAE);
4437 	}
4438 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4439 		add_x86_feature(featureset, X86FSET_CX8);
4440 	}
4441 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4442 		add_x86_feature(featureset, X86FSET_CX16);
4443 	}
4444 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4445 		add_x86_feature(featureset, X86FSET_PAT);
4446 	}
4447 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4448 		add_x86_feature(featureset, X86FSET_SEP);
4449 	}
4450 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4451 		/*
4452 		 * In our implementation, fxsave/fxrstor
4453 		 * are prerequisites before we'll even
4454 		 * try to do SSE things.
4455 		 */
4456 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4457 			add_x86_feature(featureset, X86FSET_SSE);
4458 		}
4459 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4460 			add_x86_feature(featureset, X86FSET_SSE2);
4461 		}
4462 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4463 			add_x86_feature(featureset, X86FSET_SSE3);
4464 		}
4465 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4466 			add_x86_feature(featureset, X86FSET_SSSE3);
4467 		}
4468 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4469 			add_x86_feature(featureset, X86FSET_SSE4_1);
4470 		}
4471 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4472 			add_x86_feature(featureset, X86FSET_SSE4_2);
4473 		}
4474 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4475 			add_x86_feature(featureset, X86FSET_AES);
4476 		}
4477 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4478 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4479 		}
4480 
4481 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4482 			add_x86_feature(featureset, X86FSET_SHA);
4483 
4484 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4485 			add_x86_feature(featureset, X86FSET_XSAVE);
4486 
4487 			/* We only test AVX & AVX512 when there is XSAVE */
4488 			cpuid_basic_avx(cpu, featureset);
4489 		}
4490 	}
4491 
4492 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4493 		add_x86_feature(featureset, X86FSET_PCID);
4494 	}
4495 
4496 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4497 		add_x86_feature(featureset, X86FSET_X2APIC);
4498 	}
4499 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4500 		add_x86_feature(featureset, X86FSET_DE);
4501 	}
4502 #if !defined(__xpv)
4503 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4504 
4505 		/*
4506 		 * We require the CLFLUSH instruction for the erratum
4507 		 * workaround before we'll use MONITOR/MWAIT.
4508 		 */
4509 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4510 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4511 			add_x86_feature(featureset, X86FSET_MWAIT);
4512 		} else {
4513 			extern int idle_cpu_assert_cflush_monitor;
4514 
4515 			/*
4516 			 * All processors we are aware of which have
4517 			 * MONITOR/MWAIT also have CLFLUSH.
4518 			 */
4519 			if (idle_cpu_assert_cflush_monitor) {
4520 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4521 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4522 			}
4523 		}
4524 	}
4525 #endif	/* __xpv */
4526 
4527 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4528 		add_x86_feature(featureset, X86FSET_VMX);
4529 	}
4530 
4531 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4532 		add_x86_feature(featureset, X86FSET_RDRAND);
4533 
4534 	/*
4535 	 * We only need this the first time; the rest of the CPUs follow suit.
4536 	 * We only capture this for the boot CPU.
4537 	 */
4538 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4539 		add_x86_feature(featureset, X86FSET_CLFSH);
4540 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4541 	}
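	/*
	 * CPUID.1 %ebx[15:8] encodes the CLFLUSH line size in 8-byte units,
	 * so the common encoding of 8 yields x86_clflush_size == 64.
	 */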
4542 	if (is_x86_feature(featureset, X86FSET_PAE))
4543 		cpi->cpi_pabits = 36;
4544 
4545 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4546 		struct cpuid_regs r, *ecp;
4547 
4548 		ecp = &r;
4549 		ecp->cp_eax = 0xD;
4550 		ecp->cp_ecx = 1;
4551 		ecp->cp_edx = ecp->cp_ebx = 0;
4552 		(void) __cpuid_insn(ecp);
4553 
4554 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4555 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4556 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4557 			add_x86_feature(featureset, X86FSET_XSAVEC);
4558 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4559 			add_x86_feature(featureset, X86FSET_XSAVES);
4560 
4561 		/*
4562 		 * Zen 2 family processors suffer from erratum 1386 that causes
4563 		 * xsaves to not function correctly in some circumstances. There
4564 		 * are no supervisor states in Zen 2 and earlier. Practically
4565 		 * speaking this has no impact for us as we currently do not
4566 		 * leverage compressed xsave formats. To safeguard against
4567 		 * issues in the future where we may opt to use it, we remove
4568 		 * it from the feature set now. While Matisse has a microcode
4569 		 * update available with a fix, not all Zen 2 CPUs do, so it's
4570 		 * simpler for the moment to unconditionally remove it.
4571 		 */
4572 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4573 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4574 			remove_x86_feature(featureset, X86FSET_XSAVES);
4575 		}
4576 	}
4577 
4578 	/*
4579 	 * Work on the "extended" feature information, doing
4580 	 * some basic initialization to be used in the extended pass.
4581 	 */
4582 	xcpuid = 0;
4583 	switch (cpi->cpi_vendor) {
4584 	case X86_VENDOR_Intel:
4585 		/*
4586 		 * On KVM we know we will have proper support for extended
4587 		 * cpuid.
4588 		 */
4589 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4590 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4591 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4592 			xcpuid++;
4593 		break;
4594 	case X86_VENDOR_AMD:
4595 		if (cpi->cpi_family > 5 ||
4596 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4597 			xcpuid++;
4598 		break;
4599 	case X86_VENDOR_Cyrix:
4600 		/*
4601 		 * Only these Cyrix CPUs are -known- to support
4602 		 * extended cpuid operations.
4603 		 */
4604 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4605 		    x86_type == X86_TYPE_CYRIX_GXm)
4606 			xcpuid++;
4607 		break;
4608 	case X86_VENDOR_HYGON:
4609 	case X86_VENDOR_Centaur:
4610 	case X86_VENDOR_TM:
4611 	default:
4612 		xcpuid++;
4613 		break;
4614 	}
4615 
4616 	if (xcpuid) {
4617 		cp = &cpi->cpi_extd[0];
4618 		cp->cp_eax = CPUID_LEAF_EXT_0;
4619 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4620 	}
4621 
4622 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4623 
4624 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4625 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4626 
4627 		switch (cpi->cpi_vendor) {
4628 		case X86_VENDOR_Intel:
4629 		case X86_VENDOR_AMD:
4630 		case X86_VENDOR_HYGON:
4631 			if (cpi->cpi_xmaxeax < 0x80000001)
4632 				break;
4633 			cp = &cpi->cpi_extd[1];
4634 			cp->cp_eax = 0x80000001;
4635 			(void) __cpuid_insn(cp);
4636 
4637 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4638 			    cpi->cpi_family == 5 &&
4639 			    cpi->cpi_model == 6 &&
4640 			    cpi->cpi_step == 6) {
4641 				/*
4642 				 * K6 model 6 uses bit 10 to indicate SYSC;
4643 				 * later models use bit 11. Fix it here.
4644 				 */
4645 				if (cp->cp_edx & 0x400) {
4646 					cp->cp_edx &= ~0x400;
4647 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4648 				}
4649 			}
4650 
4651 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4652 
4653 			/*
4654 			 * Compute the additions to the kernel's feature word.
4655 			 */
4656 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4657 				add_x86_feature(featureset, X86FSET_NX);
4658 			}
4659 
4660 			/*
4661 			 * Regardless of whether or not we boot 64-bit,
4662 			 * we should have a way to identify whether
4663 			 * the CPU is capable of running 64-bit.
4664 			 */
4665 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4666 				add_x86_feature(featureset, X86FSET_64);
4667 			}
4668 
4669 			/* 1 GB large page - enable only for 64 bit kernel */
4670 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4671 				add_x86_feature(featureset, X86FSET_1GPG);
4672 			}
4673 
4674 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4675 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4676 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4677 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4678 				add_x86_feature(featureset, X86FSET_SSE4A);
4679 			}
4680 
4681 			/*
4682 			 * It's really tricky to support syscall/sysret in
4683 			 * the i386 kernel; we rely on sysenter/sysexit
4684 			 * instead.  In the amd64 kernel, things are -way-
4685 			 * better.
4686 			 */
4687 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4688 				add_x86_feature(featureset, X86FSET_ASYSC);
4689 			}
4690 
4691 			/*
4692 			 * While we're thinking about system calls, note
4693 			 * that AMD processors don't support sysenter
4694 			 * in long mode at all, so don't try to program them.
4695 			 */
4696 			if (x86_vendor == X86_VENDOR_AMD ||
4697 			    x86_vendor == X86_VENDOR_HYGON) {
4698 				remove_x86_feature(featureset, X86FSET_SEP);
4699 			}
4700 
4701 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4702 				add_x86_feature(featureset, X86FSET_TSCP);
4703 			}
4704 
4705 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4706 				add_x86_feature(featureset, X86FSET_SVM);
4707 			}
4708 
4709 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4710 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4711 			}
4712 
4713 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4714 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4715 			}
4716 
4717 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4718 				add_x86_feature(featureset, X86FSET_XOP);
4719 			}
4720 
4721 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4722 				add_x86_feature(featureset, X86FSET_FMA4);
4723 			}
4724 
4725 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4726 				add_x86_feature(featureset, X86FSET_TBM);
4727 			}
4728 
4729 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4730 				add_x86_feature(featureset, X86FSET_MONITORX);
4731 			}
4732 			break;
4733 		default:
4734 			break;
4735 		}
4736 
4737 		/*
4738 		 * Get CPUID data about processor cores and hyperthreads.
4739 		 */
4740 		switch (cpi->cpi_vendor) {
4741 		case X86_VENDOR_Intel:
4742 			if (cpi->cpi_maxeax >= 4) {
4743 				cp = &cpi->cpi_std[4];
4744 				cp->cp_eax = 4;
4745 				cp->cp_ecx = 0;
4746 				(void) __cpuid_insn(cp);
4747 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4748 			}
4749 			/*FALLTHROUGH*/
4750 		case X86_VENDOR_AMD:
4751 		case X86_VENDOR_HYGON:
4752 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4753 				break;
4754 			cp = &cpi->cpi_extd[8];
4755 			cp->cp_eax = CPUID_LEAF_EXT_8;
4756 			(void) __cpuid_insn(cp);
4757 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4758 			    cp);
4759 
4760 			/*
4761 			 * AMD uses ebx for some extended functions.
4762 			 */
4763 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4764 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4765 				/*
4766 				 * While we're here, check for the AMD "Error
4767 				 * Pointer Zero/Restore" feature. This can be
4768 				 * used to set up the FP save handlers
4769 				 * appropriately.
4770 				 */
4771 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4772 					cpi->cpi_fp_amd_save = 0;
4773 				} else {
4774 					cpi->cpi_fp_amd_save = 1;
4775 				}
4776 
4777 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4778 					add_x86_feature(featureset,
4779 					    X86FSET_CLZERO);
4780 				}
4781 			}
4782 
4783 			/*
4784 			 * Virtual and physical address limits from
4785 			 * cpuid override previously guessed values.
4786 			 */
4787 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4788 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4789 			break;
4790 		default:
4791 			break;
4792 		}
4793 
4794 		/*
4795 		 * Get CPUID data about TSC Invariance in Deep C-State.
4796 		 */
4797 		switch (cpi->cpi_vendor) {
4798 		case X86_VENDOR_Intel:
4799 		case X86_VENDOR_AMD:
4800 		case X86_VENDOR_HYGON:
4801 			if (cpi->cpi_maxeax >= 7) {
4802 				cp = &cpi->cpi_extd[7];
4803 				cp->cp_eax = 0x80000007;
4804 				cp->cp_ecx = 0;
4805 				(void) __cpuid_insn(cp);
4806 			}
4807 			break;
4808 		default:
4809 			break;
4810 		}
4811 	}
4812 
4813 	/*
4814 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4815 	 * run and thus gathered some of its dependent leaves.
4816 	 */
4817 	cpuid_basic_topology(cpu, featureset);
4818 	cpuid_basic_thermal(cpu, featureset);
4819 #if !defined(__xpv)
4820 	cpuid_basic_ppin(cpu, featureset);
4821 #endif
4822 
4823 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4824 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4825 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4826 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4827 			/* Special handling for AMD FP not necessary. */
4828 			cpi->cpi_fp_amd_save = 0;
4829 		} else {
4830 			cpi->cpi_fp_amd_save = 1;
4831 		}
4832 	}
4833 
4834 	/*
4835 	 * Check (and potentially set) whether lfence is serializing.
4836 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4837 	 */
4838 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4839 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4840 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4841 		/*
4842 		 * The AMD white paper Software Techniques For Managing
4843 		 * Speculation on AMD Processors details circumstances for when
4844 		 * lfence instructions are serializing.
4845 		 *
4846 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4847 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4848 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4849 		 * committed to supporting that MSR on all later CPUs.
4850 		 */
4851 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4852 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4853 		} else if (cpi->cpi_family >= 0x10) {
4854 #if !defined(__xpv)
4855 			uint64_t val;
4856 
4857 			/*
4858 			 * Be careful when attempting to enable the bit, and
4859 			 * verify that it was actually set in case we are
4860 			 * running in a hypervisor which is less than faithful
4861 			 * about its emulation of this feature.
4862 			 */
4863 			on_trap_data_t otd;
4864 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4865 				val = rdmsr(MSR_AMD_DE_CFG);
4866 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4867 				wrmsr(MSR_AMD_DE_CFG, val);
4868 				val = rdmsr(MSR_AMD_DE_CFG);
4869 			} else {
4870 				val = 0;
4871 			}
4872 			no_trap();
4873 
4874 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4875 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4876 			}
4877 #endif
4878 		}
4879 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4880 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4881 		/*
4882 		 * Documentation and other OSes indicate that lfence is always
4883 		 * serializing on Intel CPUs.
4884 		 */
4885 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4886 	}
4887 
4888 
4889 	/*
4890 	 * Check the processor leaves that are used for security features. Grab
4891 	 * any additional processor-specific leaves that we may not have yet.
4892 	 */
4893 	switch (cpi->cpi_vendor) {
4894 	case X86_VENDOR_AMD:
4895 	case X86_VENDOR_HYGON:
4896 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4897 			cp = &cpi->cpi_extd[0x21];
4898 			cp->cp_eax = CPUID_LEAF_EXT_21;
4899 			cp->cp_ecx = 0;
4900 			(void) __cpuid_insn(cp);
4901 		}
4902 		break;
4903 	default:
4904 		break;
4905 	}
4906 
4907 	cpuid_scan_security(cpu, featureset);
4908 }
4909 
4910 /*
4911  * Make copies of the cpuid table entries we depend on, in
4912  * part for ease of parsing now, in part so that we have only
4913  * one place to correct any of it, in part for ease of
4914  * later export to userland, and in part so we can look at
4915  * this stuff in a crash dump.
4916  */
4917 
4918 static void
4919 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4920 {
4921 	uint_t n, nmax;
4922 	int i;
4923 	struct cpuid_regs *cp;
4924 	uint8_t *dp;
4925 	uint32_t *iptr;
4926 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4927 
4928 	if (cpi->cpi_maxeax < 1)
4929 		return;
4930 
4931 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4932 		nmax = NMAX_CPI_STD;
4933 	/*
4934 	 * (We already handled n == 0 and n == 1 in the basic pass)
4935 	 */
4936 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4937 		/*
4938 		 * leaves 6 and 7 were handled in the basic pass
4939 		 */
4940 		if (n == 6 || n == 7)
4941 			continue;
4942 
4943 		cp->cp_eax = n;
4944 
4945 		/*
4946 		 * CPUID function 4 expects %ecx to be initialized
4947 		 * with an index which indicates which cache to return
4948 		 * information about. The OS is expected to call function 4
4949 		 * with %ecx set to 0, 1, 2, ... until it returns with
4950 		 * EAX[4:0] set to 0, which indicates there are no more
4951 		 * caches.
4952 		 *
4953 		 * Here, populate cpi_std[4] with the information returned by
4954 		 * function 4 when %ecx == 0, and do the rest in a later pass
4955 		 * when dynamic memory allocation becomes available.
4956 		 *
4957 		 * Note: we need to explicitly initialize %ecx here, since
4958 		 * function 4 may have been previously invoked.
4959 		 */
4960 		if (n == 4)
4961 			cp->cp_ecx = 0;
4962 
4963 		(void) __cpuid_insn(cp);
4964 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4965 		switch (n) {
4966 		case 2:
4967 			/*
4968 			 * "the lower 8 bits of the %eax register
4969 			 * contain a value that identifies the number
4970 			 * of times the cpuid [instruction] has to be
4971 			 * executed to obtain a complete image of the
4972 			 * processor's caching systems."
4973 			 *
4974 			 * How *do* they make this stuff up?
4975 			 */
4976 			cpi->cpi_ncache = sizeof (*cp) *
4977 			    BITX(cp->cp_eax, 7, 0);
4978 			if (cpi->cpi_ncache == 0)
4979 				break;
4980 			cpi->cpi_ncache--;	/* skip count byte */
4981 
4982 			/*
4983 			 * Well, for now, rather than attempt to implement
4984 			 * this slightly dubious algorithm, we just look
4985 			 * at the first 15 ..
4986 			 */
4987 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4988 				cpi->cpi_ncache = sizeof (*cp) - 1;
4989 
4990 			dp = cpi->cpi_cacheinfo;
4991 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4992 				uint8_t *p = (void *)&cp->cp_eax;
4993 				for (i = 1; i < 4; i++)
4994 					if (p[i] != 0)
4995 						*dp++ = p[i];
4996 			}
4997 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4998 				uint8_t *p = (void *)&cp->cp_ebx;
4999 				for (i = 0; i < 4; i++)
5000 					if (p[i] != 0)
5001 						*dp++ = p[i];
5002 			}
5003 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
5004 				uint8_t *p = (void *)&cp->cp_ecx;
5005 				for (i = 0; i < 4; i++)
5006 					if (p[i] != 0)
5007 						*dp++ = p[i];
5008 			}
5009 			if (BITX(cp->cp_edx, 31, 31) == 0) {
5010 				uint8_t *p = (void *)&cp->cp_edx;
5011 				for (i = 0; i < 4; i++)
5012 					if (p[i] != 0)
5013 						*dp++ = p[i];
5014 			}
5015 			break;
5016 
5017 		case 3:	/* Processor serial number, if PSN supported */
5018 			break;
5019 
5020 		case 4:	/* Deterministic cache parameters */
5021 			break;
5022 
5023 		case 5:	/* Monitor/Mwait parameters */
5024 		{
5025 			size_t mwait_size;
5026 
5027 			/*
5028 			 * check cpi_mwait.support which was set in
5029 			 * cpuid_pass_basic()
5030 			 */
5031 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
5032 				break;
5033 
5034 			/*
5035 			 * Protect ourselves from an insane mwait line size.
5036 			 * Workaround for incomplete hardware emulator(s).
5037 			 */
5038 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
5039 			if (mwait_size < sizeof (uint32_t) ||
5040 			    !ISP2(mwait_size)) {
5041 #if DEBUG
5042 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
5043 				    "size %ld", cpu->cpu_id, (long)mwait_size);
5044 #endif
5045 				break;
5046 			}
5047 
5048 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
5049 			cpi->cpi_mwait.mon_max = mwait_size;
5050 			if (MWAIT_EXTENSION(cpi)) {
5051 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
5052 				if (MWAIT_INT_ENABLE(cpi))
5053 					cpi->cpi_mwait.support |=
5054 					    MWAIT_ECX_INT_ENABLE;
5055 			}
5056 			break;
5057 		}
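		/*
		 * For reference, leaf 5 reports the smallest monitor line
		 * size in %eax[15:0] and the largest in %ebx[15:0]; those
		 * are the fields the MWAIT_SIZE_MIN/MWAIT_SIZE_MAX macros
		 * above are extracting, while %ecx advertises the extensions
		 * tested via MWAIT_EXTENSION and MWAIT_INT_ENABLE.
		 */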
5058 		default:
5059 			break;
5060 		}
5061 	}
5062 
5063 	/*
5064 	 * XSAVE enumeration
5065 	 */
5066 	if (cpi->cpi_maxeax >= 0xD) {
5067 		struct cpuid_regs regs;
5068 		boolean_t cpuid_d_valid = B_TRUE;
5069 
5070 		cp = &regs;
5071 		cp->cp_eax = 0xD;
5072 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
5073 
5074 		(void) __cpuid_insn(cp);
5075 
5076 		/*
5077 		 * Sanity checks for debug
5078 		 */
5079 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
5080 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
5081 			cpuid_d_valid = B_FALSE;
5082 		}
5083 
5084 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
5085 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
5086 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
5087 
5088 		/*
5089 		 * If the hw supports AVX, get the size and offset in the save
5090 		 * area for the ymm state.
5091 		 */
5092 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
5093 			cp->cp_eax = 0xD;
5094 			cp->cp_ecx = 2;
5095 			cp->cp_edx = cp->cp_ebx = 0;
5096 
5097 			(void) __cpuid_insn(cp);
5098 
5099 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
5100 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
5101 				cpuid_d_valid = B_FALSE;
5102 			}
5103 
5104 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
5105 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
5106 		}
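		/*
		 * In general, leaf 0xD sub-leaf n (for n >= 2) reports the
		 * size of XSAVE state component n in %eax and its offset
		 * within the non-compacted save area in %ebx; the MPX and
		 * AVX-512 queries below follow that same pattern for
		 * components 3 through 7.
		 */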
5107 
5108 		/*
5109 		 * If the hw supports MPX, get the size and offset in the
5110 		 * save area for BNDREGS and BNDCSR.
5111 		 */
5112 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
5113 			cp->cp_eax = 0xD;
5114 			cp->cp_ecx = 3;
5115 			cp->cp_edx = cp->cp_ebx = 0;
5116 
5117 			(void) __cpuid_insn(cp);
5118 
5119 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
5120 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
5121 
5122 			cp->cp_eax = 0xD;
5123 			cp->cp_ecx = 4;
5124 			cp->cp_edx = cp->cp_ebx = 0;
5125 
5126 			(void) __cpuid_insn(cp);
5127 
5128 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
5129 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
5130 		}
5131 
5132 		/*
5133 		 * If the hw supports AVX512, get the size and offset in the
5134 		 * save area for the opmask registers and zmm state.
5135 		 */
5136 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
5137 			cp->cp_eax = 0xD;
5138 			cp->cp_ecx = 5;
5139 			cp->cp_edx = cp->cp_ebx = 0;
5140 
5141 			(void) __cpuid_insn(cp);
5142 
5143 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
5144 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
5145 
5146 			cp->cp_eax = 0xD;
5147 			cp->cp_ecx = 6;
5148 			cp->cp_edx = cp->cp_ebx = 0;
5149 
5150 			(void) __cpuid_insn(cp);
5151 
5152 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
5153 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
5154 
5155 			cp->cp_eax = 0xD;
5156 			cp->cp_ecx = 7;
5157 			cp->cp_edx = cp->cp_ebx = 0;
5158 
5159 			(void) __cpuid_insn(cp);
5160 
5161 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
5162 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
5163 		}
5164 
5165 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE) == 0) {
5166 			xsave_state_size = 0;
5167 		} else if (cpuid_d_valid) {
5168 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
5169 		} else {
5170 			/* Broken CPUID 0xD, probably in HVM */
5171 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
5172 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
5173 			    ", ymm_size = %d, ymm_offset = %d\n",
5174 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
5175 			    cpi->cpi_xsave.xsav_hw_features_high,
5176 			    (int)cpi->cpi_xsave.xsav_max_size,
5177 			    (int)cpi->cpi_xsave.ymm_size,
5178 			    (int)cpi->cpi_xsave.ymm_offset);
5179 
5180 			if (xsave_state_size != 0) {
5181 				/*
5182 				 * This must be a non-boot CPU. We cannot
5183 				 * continue, because the boot cpu has already
5184 				 * enabled XSAVE.
5185 				 */
5186 				ASSERT(cpu->cpu_id != 0);
5187 				cmn_err(CE_PANIC, "cpu%d: we have already "
5188 				    "enabled XSAVE on boot cpu, cannot "
5189 				    "continue.", cpu->cpu_id);
5190 			} else {
5191 				/*
5192 				 * If we reached here on the boot CPU, it's also
5193 				 * almost certain that we'll reach here on the
5194 				 * non-boot CPUs. When we're here on a boot CPU
5195 				 * we should disable the feature; on a non-boot
5196 				 * CPU we need to confirm that we already have.
5197 				 */
5198 				if (cpu->cpu_id == 0) {
5199 					remove_x86_feature(x86_featureset,
5200 					    X86FSET_XSAVE);
5201 					remove_x86_feature(x86_featureset,
5202 					    X86FSET_AVX);
5203 					remove_x86_feature(x86_featureset,
5204 					    X86FSET_F16C);
5205 					remove_x86_feature(x86_featureset,
5206 					    X86FSET_BMI1);
5207 					remove_x86_feature(x86_featureset,
5208 					    X86FSET_BMI2);
5209 					remove_x86_feature(x86_featureset,
5210 					    X86FSET_FMA);
5211 					remove_x86_feature(x86_featureset,
5212 					    X86FSET_AVX2);
5213 					remove_x86_feature(x86_featureset,
5214 					    X86FSET_MPX);
5215 					remove_x86_feature(x86_featureset,
5216 					    X86FSET_AVX512F);
5217 					remove_x86_feature(x86_featureset,
5218 					    X86FSET_AVX512DQ);
5219 					remove_x86_feature(x86_featureset,
5220 					    X86FSET_AVX512PF);
5221 					remove_x86_feature(x86_featureset,
5222 					    X86FSET_AVX512ER);
5223 					remove_x86_feature(x86_featureset,
5224 					    X86FSET_AVX512CD);
5225 					remove_x86_feature(x86_featureset,
5226 					    X86FSET_AVX512BW);
5227 					remove_x86_feature(x86_featureset,
5228 					    X86FSET_AVX512VL);
5229 					remove_x86_feature(x86_featureset,
5230 					    X86FSET_AVX512FMA);
5231 					remove_x86_feature(x86_featureset,
5232 					    X86FSET_AVX512VBMI);
5233 					remove_x86_feature(x86_featureset,
5234 					    X86FSET_AVX512VNNI);
5235 					remove_x86_feature(x86_featureset,
5236 					    X86FSET_AVX512VPOPCDQ);
5237 					remove_x86_feature(x86_featureset,
5238 					    X86FSET_AVX512NNIW);
5239 					remove_x86_feature(x86_featureset,
5240 					    X86FSET_AVX512FMAPS);
5241 					remove_x86_feature(x86_featureset,
5242 					    X86FSET_VAES);
5243 					remove_x86_feature(x86_featureset,
5244 					    X86FSET_VPCLMULQDQ);
5245 					remove_x86_feature(x86_featureset,
5246 					    X86FSET_GFNI);
5247 					remove_x86_feature(x86_featureset,
5248 					    X86FSET_AVX512_VP2INT);
5249 					remove_x86_feature(x86_featureset,
5250 					    X86FSET_AVX512_BITALG);
5251 					remove_x86_feature(x86_featureset,
5252 					    X86FSET_AVX512_VBMI2);
5253 					remove_x86_feature(x86_featureset,
5254 					    X86FSET_AVX512_BF16);
5255 
5256 					xsave_force_disable = B_TRUE;
5257 				} else {
5258 					VERIFY(is_x86_feature(x86_featureset,
5259 					    X86FSET_XSAVE) == B_FALSE);
5260 				}
5261 			}
5262 		}
5263 	}
5264 
5265 
5266 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5267 		return;
5268 
5269 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5270 		nmax = NMAX_CPI_EXTD;
5271 	/*
5272 	 * Copy the extended properties, fixing them as we go. We start at 2
5273 	 * because a few cases were already handled in the basic pass; the
5274 	 * rest we simply grab again (e.g. 0x8, 0x21).
5275 	 */
5276 	iptr = (void *)cpi->cpi_brandstr;
5277 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5278 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5279 		(void) __cpuid_insn(cp);
5280 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5281 		    cp);
5282 		switch (n) {
5283 		case 2:
5284 		case 3:
5285 		case 4:
5286 			/*
5287 			 * Extract the brand string
5288 			 */
5289 			*iptr++ = cp->cp_eax;
5290 			*iptr++ = cp->cp_ebx;
5291 			*iptr++ = cp->cp_ecx;
5292 			*iptr++ = cp->cp_edx;
5293 			break;
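			/*
			 * Leaves 0x80000002-0x80000004 each return 16 bytes
			 * of the processor brand string in %eax through
			 * %edx, 48 bytes in total including the terminating
			 * NUL.
			 */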
5294 		case 5:
5295 			switch (cpi->cpi_vendor) {
5296 			case X86_VENDOR_AMD:
5297 				/*
5298 				 * The Athlon and Duron were the first
5299 				 * parts to report the sizes of the
5300 				 * TLB for large pages. Before then,
5301 				 * we don't trust the data.
5302 				 */
5303 				if (cpi->cpi_family < 6 ||
5304 				    (cpi->cpi_family == 6 &&
5305 				    cpi->cpi_model < 1))
5306 					cp->cp_eax = 0;
5307 				break;
5308 			default:
5309 				break;
5310 			}
5311 			break;
5312 		case 6:
5313 			switch (cpi->cpi_vendor) {
5314 			case X86_VENDOR_AMD:
5315 				/*
5316 				 * The Athlon and Duron were the first
5317 				 * AMD parts with L2 TLB's.
5318 				 * Before then, don't trust the data.
5319 				 */
5320 				if (cpi->cpi_family < 6 ||
5321 				    (cpi->cpi_family == 6 &&
5322 				    cpi->cpi_model < 1))
5323 					cp->cp_eax = cp->cp_ebx = 0;
5324 				/*
5325 				 * AMD Duron rev A0 reports L2
5326 				 * cache size incorrectly as 1K
5327 				 * when it is really 64K
5328 				 */
5329 				if (cpi->cpi_family == 6 &&
5330 				    cpi->cpi_model == 3 &&
5331 				    cpi->cpi_step == 0) {
5332 					cp->cp_ecx &= 0xffff;
5333 					cp->cp_ecx |= 0x400000;
5334 				}
5335 				break;
5336 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5337 				/*
5338 				 * VIA C3 processors are a bit messed
5339 				 * up w.r.t. encoding cache sizes in %ecx
5340 				 */
5341 				if (cpi->cpi_family != 6)
5342 					break;
5343 				/*
5344 				 * model 7 and 8 were incorrectly encoded
5345 				 *
5346 				 * xxx is model 8 really broken?
5347 				 */
5348 				if (cpi->cpi_model == 7 ||
5349 				    cpi->cpi_model == 8)
5350 					cp->cp_ecx =
5351 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5352 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5353 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5354 					    BITX(cp->cp_ecx, 7, 0);
5355 				/*
5356 				 * model 9 stepping 1 has wrong associativity
5357 				 */
5358 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5359 					cp->cp_ecx |= 8 << 12;
5360 				break;
5361 			case X86_VENDOR_Intel:
5362 				/*
5363 				 * Extended L2 Cache features function.
5364 				 * First appeared on Prescott.
5365 				 */
5366 			default:
5367 				break;
5368 			}
5369 			break;
5370 		default:
5371 			break;
5372 		}
5373 	}
5374 }
5375 
5376 static const char *
5377 intel_cpubrand(const struct cpuid_info *cpi)
5378 {
5379 	int i;
5380 
5381 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5382 
5383 	switch (cpi->cpi_family) {
5384 	case 5:
5385 		return ("Intel Pentium(r)");
5386 	case 6:
5387 		switch (cpi->cpi_model) {
5388 			uint_t celeron, xeon;
5389 			const struct cpuid_regs *cp;
5390 		case 0:
5391 		case 1:
5392 		case 2:
5393 			return ("Intel Pentium(r) Pro");
5394 		case 3:
5395 		case 4:
5396 			return ("Intel Pentium(r) II");
5397 		case 6:
5398 			return ("Intel Celeron(r)");
5399 		case 5:
5400 		case 7:
5401 			celeron = xeon = 0;
5402 			cp = &cpi->cpi_std[2];	/* cache info */
5403 
5404 			for (i = 1; i < 4; i++) {
5405 				uint_t tmp;
5406 
5407 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5408 				if (tmp == 0x40)
5409 					celeron++;
5410 				if (tmp >= 0x44 && tmp <= 0x45)
5411 					xeon++;
5412 			}
5413 
5414 			for (i = 0; i < 2; i++) {
5415 				uint_t tmp;
5416 
5417 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5418 				if (tmp == 0x40)
5419 					celeron++;
5420 				else if (tmp >= 0x44 && tmp <= 0x45)
5421 					xeon++;
5422 			}
5423 
5424 			for (i = 0; i < 4; i++) {
5425 				uint_t tmp;
5426 
5427 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5428 				if (tmp == 0x40)
5429 					celeron++;
5430 				else if (tmp >= 0x44 && tmp <= 0x45)
5431 					xeon++;
5432 			}
5433 
5434 			for (i = 0; i < 4; i++) {
5435 				uint_t tmp;
5436 
5437 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5438 				if (tmp == 0x40)
5439 					celeron++;
5440 				else if (tmp >= 0x44 && tmp <= 0x45)
5441 					xeon++;
5442 			}
5443 
5444 			if (celeron)
5445 				return ("Intel Celeron(r)");
5446 			if (xeon)
5447 				return (cpi->cpi_model == 5 ?
5448 				    "Intel Pentium(r) II Xeon(tm)" :
5449 				    "Intel Pentium(r) III Xeon(tm)");
5450 			return (cpi->cpi_model == 5 ?
5451 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5452 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5453 		default:
5454 			break;
5455 		}
5456 	default:
5457 		break;
5458 	}
5459 
5460 	/* BrandID is present if the field is nonzero */
5461 	if (cpi->cpi_brandid != 0) {
5462 		static const struct {
5463 			uint_t bt_bid;
5464 			const char *bt_str;
5465 		} brand_tbl[] = {
5466 			{ 0x1,	"Intel(r) Celeron(r)" },
5467 			{ 0x2,	"Intel(r) Pentium(r) III" },
5468 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5469 			{ 0x4,	"Intel(r) Pentium(r) III" },
5470 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5471 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5472 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5473 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5474 			{ 0xa,	"Intel(r) Celeron(r)" },
5475 			{ 0xb,	"Intel(r) Xeon(tm)" },
5476 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5477 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5478 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5479 			{ 0x11, "Mobile Genuine Intel(r)" },
5480 			{ 0x12, "Intel(r) Celeron(r) M" },
5481 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5482 			{ 0x14, "Intel(r) Celeron(r)" },
5483 			{ 0x15, "Mobile Genuine Intel(r)" },
5484 			{ 0x16,	"Intel(r) Pentium(r) M" },
5485 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5486 		};
5487 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5488 		uint_t sgn;
5489 
5490 		sgn = (cpi->cpi_family << 8) |
5491 		    (cpi->cpi_model << 4) | cpi->cpi_step;
5492 
5493 		for (i = 0; i < btblmax; i++)
5494 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5495 				break;
5496 		if (i < btblmax) {
5497 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5498 				return ("Intel(r) Celeron(r)");
5499 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5500 				return ("Intel(r) Xeon(tm) MP");
5501 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5502 				return ("Intel(r) Xeon(tm)");
5503 			return (brand_tbl[i].bt_str);
5504 		}
5505 	}
5506 
5507 	return (NULL);
5508 }
5509 
5510 static const char *
5511 amd_cpubrand(const struct cpuid_info *cpi)
5512 {
5513 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5514 
5515 	switch (cpi->cpi_family) {
5516 	case 5:
5517 		switch (cpi->cpi_model) {
5518 		case 0:
5519 		case 1:
5520 		case 2:
5521 		case 3:
5522 		case 4:
5523 		case 5:
5524 			return ("AMD-K5(r)");
5525 		case 6:
5526 		case 7:
5527 			return ("AMD-K6(r)");
5528 		case 8:
5529 			return ("AMD-K6(r)-2");
5530 		case 9:
5531 			return ("AMD-K6(r)-III");
5532 		default:
5533 			return ("AMD (family 5)");
5534 		}
5535 	case 6:
5536 		switch (cpi->cpi_model) {
5537 		case 1:
5538 			return ("AMD-K7(tm)");
5539 		case 0:
5540 		case 2:
5541 		case 4:
5542 			return ("AMD Athlon(tm)");
5543 		case 3:
5544 		case 7:
5545 			return ("AMD Duron(tm)");
5546 		case 6:
5547 		case 8:
5548 		case 10:
5549 			/*
5550 			 * Use the L2 cache size to distinguish
5551 			 */
5552 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5553 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5554 		default:
5555 			return ("AMD (family 6)");
5556 		}
5557 	default:
5558 		break;
5559 	}
5560 
5561 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5562 	    cpi->cpi_brandid != 0) {
5563 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5564 		case 3:
5565 			return ("AMD Opteron(tm) UP 1xx");
5566 		case 4:
5567 			return ("AMD Opteron(tm) DP 2xx");
5568 		case 5:
5569 			return ("AMD Opteron(tm) MP 8xx");
5570 		default:
5571 			return ("AMD Opteron(tm)");
5572 		}
5573 	}
5574 
5575 	return (NULL);
5576 }
5577 
5578 static const char *
5579 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5580 {
5581 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5582 
5583 	switch (type) {
5584 	case X86_TYPE_CYRIX_6x86:
5585 		return ("Cyrix 6x86");
5586 	case X86_TYPE_CYRIX_6x86L:
5587 		return ("Cyrix 6x86L");
5588 	case X86_TYPE_CYRIX_6x86MX:
5589 		return ("Cyrix 6x86MX");
5590 	case X86_TYPE_CYRIX_GXm:
5591 		return ("Cyrix GXm");
5592 	case X86_TYPE_CYRIX_MediaGX:
5593 		return ("Cyrix MediaGX");
5594 	case X86_TYPE_CYRIX_MII:
5595 		return ("Cyrix M2");
5596 	case X86_TYPE_VIA_CYRIX_III:
5597 		return ("VIA Cyrix M3");
5598 	default:
5599 		/*
5600 		 * Have another wild guess ..
5601 		 */
5602 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5603 			return ("Cyrix 5x86");
5604 		else if (cpi->cpi_family == 5) {
5605 			switch (cpi->cpi_model) {
5606 			case 2:
5607 				return ("Cyrix 6x86");	/* Cyrix M1 */
5608 			case 4:
5609 				return ("Cyrix MediaGX");
5610 			default:
5611 				break;
5612 			}
5613 		} else if (cpi->cpi_family == 6) {
5614 			switch (cpi->cpi_model) {
5615 			case 0:
5616 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5617 			case 5:
5618 			case 6:
5619 			case 7:
5620 			case 8:
5621 			case 9:
5622 				return ("VIA C3");
5623 			default:
5624 				break;
5625 			}
5626 		}
5627 		break;
5628 	}
5629 	return (NULL);
5630 }
5631 
5632 /*
5633  * This only gets called when the CPU's extended brand string leaves
5634  * (0x80000002, 0x80000003, 0x80000004) aren't available, or contain
5635  * null bytes for some reason.
5636  */
5637 static void
5638 fabricate_brandstr(struct cpuid_info *cpi)
5639 {
5640 	const char *brand = NULL;
5641 
5642 	switch (cpi->cpi_vendor) {
5643 	case X86_VENDOR_Intel:
5644 		brand = intel_cpubrand(cpi);
5645 		break;
5646 	case X86_VENDOR_AMD:
5647 		brand = amd_cpubrand(cpi);
5648 		break;
5649 	case X86_VENDOR_Cyrix:
5650 		brand = cyrix_cpubrand(cpi, x86_type);
5651 		break;
5652 	case X86_VENDOR_NexGen:
5653 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5654 			brand = "NexGen Nx586";
5655 		break;
5656 	case X86_VENDOR_Centaur:
5657 		if (cpi->cpi_family == 5)
5658 			switch (cpi->cpi_model) {
5659 			case 4:
5660 				brand = "Centaur C6";
5661 				break;
5662 			case 8:
5663 				brand = "Centaur C2";
5664 				break;
5665 			case 9:
5666 				brand = "Centaur C3";
5667 				break;
5668 			default:
5669 				break;
5670 			}
5671 		break;
5672 	case X86_VENDOR_Rise:
5673 		if (cpi->cpi_family == 5 &&
5674 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5675 			brand = "Rise mP6";
5676 		break;
5677 	case X86_VENDOR_SiS:
5678 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5679 			brand = "SiS 55x";
5680 		break;
5681 	case X86_VENDOR_TM:
5682 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5683 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5684 		break;
5685 	case X86_VENDOR_NSC:
5686 	case X86_VENDOR_UMC:
5687 	default:
5688 		break;
5689 	}
5690 	if (brand) {
5691 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5692 		return;
5693 	}
5694 
5695 	/*
5696 	 * If all else fails ...
5697 	 */
5698 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5699 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5700 	    cpi->cpi_model, cpi->cpi_step);
5701 }
5702 
5703 /*
5704  * This routine is called just after kernel memory allocation
5705  * becomes available on cpu0, and as part of mp_startup() on
5706  * the other cpus.
5707  *
5708  * Fix up the brand string, and collect any information from cpuid
5709  * that requires dynamically allocated storage to represent.
5710  */
5711 
5712 static void
5713 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5714 {
5715 	int	i, max, shft, level, size;
5716 	struct cpuid_regs regs;
5717 	struct cpuid_regs *cp;
5718 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5719 
5720 	/*
5721 	 * Deterministic cache parameters
5722 	 *
5723 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5724 	 * values that are present are currently defined to be the same. This
5725 	 * means we can use the same logic to parse it as long as we use the
5726 	 * appropriate leaf to get the data. If you're updating this, make sure
5727 	 * you're careful about which vendor supports which aspect.
5728 	 *
5729 	 * Take this opportunity to detect the number of threads sharing the
5730 	 * last level cache, and construct a corresponding cache id. The
5731 	 * respective cpuid_info members are initialized to the default case of
5732 	 * "no last level cache sharing".
5733 	 */
5734 	cpi->cpi_ncpu_shr_last_cache = 1;
5735 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5736 
5737 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5738 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5739 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5740 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5741 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5742 		uint32_t leaf;
5743 
5744 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5745 			leaf = 4;
5746 		} else {
5747 			leaf = CPUID_LEAF_EXT_1d;
5748 		}
5749 
5750 		/*
5751 		 * Find the # of elements (size) returned by the leaf and along
5752 		 * the way detect last level cache sharing details.
5753 		 */
5754 		bzero(&regs, sizeof (regs));
5755 		cp = &regs;
5756 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5757 			cp->cp_eax = leaf;
5758 			cp->cp_ecx = i;
5759 
5760 			(void) __cpuid_insn(cp);
5761 
5762 			if (CPI_CACHE_TYPE(cp) == 0)
5763 				break;
5764 			level = CPI_CACHE_LVL(cp);
5765 			if (level > max) {
5766 				max = level;
5767 				cpi->cpi_ncpu_shr_last_cache =
5768 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5769 			}
5770 		}
5771 		cpi->cpi_cache_leaf_size = size = i;
5772 
5773 		/*
5774 		 * Allocate the cpi_cache_leaves array. The first element
5775 		 * references the regs for the corresponding leaf with %ecx set
5776 		 * to 0. This was gathered in cpuid_pass_extended().
5777 		 */
5778 		if (size > 0) {
5779 			cpi->cpi_cache_leaves =
5780 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5781 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5782 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5783 			} else {
5784 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5785 			}
5786 
5787 			/*
5788 			 * Allocate storage to hold the additional regs
5789 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5790 			 *
5791 			 * The regs for the leaf, %ecx == 0 has already
5792 			 * been allocated as indicated above.
5793 			 */
5794 			for (i = 1; i < size; i++) {
5795 				cp = cpi->cpi_cache_leaves[i] =
5796 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5797 				cp->cp_eax = leaf;
5798 				cp->cp_ecx = i;
5799 
5800 				(void) __cpuid_insn(cp);
5801 			}
5802 		}
5803 		/*
5804 		 * Determine the number of bits needed to represent
5805 		 * the number of CPUs sharing the last level cache.
5806 		 *
5807 		 * Shift off that number of bits from the APIC id to
5808 		 * derive the cache id.
5809 		 */
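		/*
		 * Illustrative example (values assumed): with 8 threads
		 * sharing the last level cache, the loop below computes
		 * shft = 3, so an APIC id of 0x15 yields cache id 0x2,
		 * grouping all threads behind the same cache.
		 */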
5810 		shft = 0;
5811 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5812 			shft++;
5813 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5814 	}
5815 
5816 	/*
5817 	 * Now fixup the brand string
5818 	 */
5819 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5820 		fabricate_brandstr(cpi);
5821 	} else {
5822 
5823 		/*
5824 		 * If we successfully extracted a brand string from the cpuid
5825 		 * instruction, clean it up by removing leading spaces and
5826 		 * similar junk.
5827 		 */
5828 		if (cpi->cpi_brandstr[0]) {
5829 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5830 			char *src, *dst;
5831 
5832 			dst = src = (char *)cpi->cpi_brandstr;
5833 			src[maxlen - 1] = '\0';
5834 			/*
5835 			 * strip leading spaces
5836 			 */
5837 			while (*src == ' ')
5838 				src++;
5839 			/*
5840 			 * Remove any "Genuine" or "Authentic" prefixes
5841 			 */
5842 			if (strncmp(src, "Genuine ", 8) == 0)
5843 				src += 8;
5844 			if (strncmp(src, "Authentic ", 10) == 0)
5845 				src += 10;
5846 
5847 			/*
5848 			 * Now do an in-place copy.
5849 			 * Map (R) to (r) and (TM) to (tm).
5850 			 * The era of teletypes is long gone, and there's
5851 			 * -really- no need to shout.
5852 			 */
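			/*
			 * For example (an assumed input, not a real brand
			 * string), "  Genuine Intel(R) CPU  " comes out of
			 * the leading-space, prefix, (R)/(TM) and
			 * trailing-space passes as "Intel(r) CPU".
			 */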
5853 			while (*src != '\0') {
5854 				if (src[0] == '(') {
5855 					if (strncmp(src + 1, "R)", 2) == 0) {
5856 						(void) strncpy(dst, "(r)", 3);
5857 						src += 3;
5858 						dst += 3;
5859 						continue;
5860 					}
5861 					if (strncmp(src + 1, "TM)", 3) == 0) {
5862 						(void) strncpy(dst, "(tm)", 4);
5863 						src += 4;
5864 						dst += 4;
5865 						continue;
5866 					}
5867 				}
5868 				*dst++ = *src++;
5869 			}
5870 			*dst = '\0';
5871 
5872 			/*
5873 			 * Finally, remove any trailing spaces
5874 			 */
5875 			while (--dst > cpi->cpi_brandstr)
5876 				if (*dst == ' ')
5877 					*dst = '\0';
5878 				else
5879 					break;
5880 		} else
5881 			fabricate_brandstr(cpi);
5882 	}
5883 }
5884 
5885 typedef struct {
5886 	uint32_t avm_av;
5887 	uint32_t avm_feat;
5888 } av_feat_map_t;
5889 
5890 /*
5891  * These arrays are used to map features that we should add based on x86
5892  * features that are present. As a large number depend on kernel features,
5893  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5894  * There is an array of these for each hwcap word. Some features aren't tracked
5895  * in the kernel x86 featureset and that's ok. They will not show up in here.
5896  */
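/*
 * As a sketch of how these tables are consumed (see cpuid_pass_resolve()
 * below), each entry simply translates a kernel feature bit into the
 * corresponding aux vector bit, roughly:
 *
 *	if (is_x86_feature(x86_featureset, map[i].avm_feat))
 *		hwcap |= map[i].avm_av;
 */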
5897 static const av_feat_map_t x86fset_to_av1[] = {
5898 	{ AV_386_CX8, X86FSET_CX8 },
5899 	{ AV_386_SEP, X86FSET_SEP },
5900 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5901 	{ AV_386_CMOV, X86FSET_CMOV },
5902 	{ AV_386_FXSR, X86FSET_SSE },
5903 	{ AV_386_SSE, X86FSET_SSE },
5904 	{ AV_386_SSE2, X86FSET_SSE2 },
5905 	{ AV_386_SSE3, X86FSET_SSE3 },
5906 	{ AV_386_CX16, X86FSET_CX16 },
5907 	{ AV_386_TSCP, X86FSET_TSCP },
5908 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5909 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5910 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5911 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5912 	{ AV_386_AES, X86FSET_AES },
5913 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5914 	{ AV_386_XSAVE, X86FSET_XSAVE },
5915 	{ AV_386_AVX, X86FSET_AVX },
5916 	{ AV_386_VMX, X86FSET_VMX },
5917 	{ AV_386_AMD_SVM, X86FSET_SVM }
5918 };
5919 
5920 static const av_feat_map_t x86fset_to_av2[] = {
5921 	{ AV_386_2_F16C, X86FSET_F16C },
5922 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5923 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5924 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5925 	{ AV_386_2_FMA, X86FSET_FMA },
5926 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5927 	{ AV_386_2_ADX, X86FSET_ADX },
5928 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5929 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5930 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5931 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5932 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5933 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5934 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5935 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5936 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5937 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5938 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5939 	{ AV_386_2_SHA, X86FSET_SHA },
5940 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5941 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5942 	{ AV_386_2_CLWB, X86FSET_CLWB },
5943 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5944 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5945 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5946 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5947 	{ AV_386_2_VAES, X86FSET_VAES },
5948 	{ AV_386_2_GFNI, X86FSET_GFNI },
5949 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5950 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5951 };
5952 
5953 static const av_feat_map_t x86fset_to_av3[] = {
5954 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5955 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5956 };
5957 
5958 /*
5959  * This routine is called out of bind_hwcap() much later in the life
5960  * of the kernel (post_startup()).  The job of this routine is to resolve
5961  * the hardware feature support and kernel support for those features into
5962  * what we're actually going to tell applications via the aux vector.
5963  *
5964  * Most of the aux vector is derived from the x86_featureset array, where
5965  * a given feature indicates that an aux vector bit should be plumbed through.
5966  * This
5966  * allows the kernel to use one tracking mechanism for these based on whether or
5967  * not it has the required hardware support (most often xsave). Most newer
5968  * features are added there in case we need them in the kernel. Otherwise,
5969  * features are evaluated based on looking at the cpuid features that remain. If
5970  * you find yourself wanting to clear out cpuid features for some reason, they
5971  * should instead be driven by the feature set so we have a consistent view.
5972  */
5973 
5974 static void
5975 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5976 {
5977 	uint_t *hwcap_out = (uint_t *)arg;
5978 	struct cpuid_info *cpi;
5979 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5980 
5981 	cpi = cpu->cpu_m.mcpu_cpi;
5982 
5983 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5984 		if (is_x86_feature(x86_featureset,
5985 		    x86fset_to_av1[i].avm_feat)) {
5986 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5987 		}
5988 	}
5989 
5990 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5991 		if (is_x86_feature(x86_featureset,
5992 		    x86fset_to_av2[i].avm_feat)) {
5993 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5994 		}
5995 	}
5996 
5997 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5998 		if (is_x86_feature(x86_featureset,
5999 		    x86fset_to_av3[i].avm_feat)) {
6000 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
6001 		}
6002 	}
6003 
6004 	/*
6005 	 * From here on out we're working through features that don't have
6006 	 * corresponding kernel feature flags for various reasons that are
6007 	 * mostly just due to the historical implementation.
6008 	 */
6009 	if (cpi->cpi_maxeax >= 1) {
6010 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
6011 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
6012 
6013 		*edx = CPI_FEATURES_EDX(cpi);
6014 		*ecx = CPI_FEATURES_ECX(cpi);
6015 
6016 		/*
6017 		 * [no explicit support required beyond x87 fp context]
6018 		 */
6019 		if (!fpu_exists)
6020 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
6021 
6022 		/*
6023 		 * Now map the supported feature vector to things that we
6024 		 * think userland will care about.
6025 		 */
6026 		if (*ecx & CPUID_INTC_ECX_MOVBE)
6027 			hwcap_flags |= AV_386_MOVBE;
6028 
6029 		if (*ecx & CPUID_INTC_ECX_POPCNT)
6030 			hwcap_flags |= AV_386_POPCNT;
6031 		if (*edx & CPUID_INTC_EDX_FPU)
6032 			hwcap_flags |= AV_386_FPU;
6033 		if (*edx & CPUID_INTC_EDX_MMX)
6034 			hwcap_flags |= AV_386_MMX;
6035 		if (*edx & CPUID_INTC_EDX_TSC)
6036 			hwcap_flags |= AV_386_TSC;
6037 	}
6038 
6039 	/*
6040 	 * Check a few miscellaneous features.
6041 	 */
6042 	if (cpi->cpi_xmaxeax < 0x80000001)
6043 		goto resolve_done;
6044 
6045 	switch (cpi->cpi_vendor) {
6046 		uint32_t *edx, *ecx;
6047 
6048 	case X86_VENDOR_Intel:
6049 		/*
6050 		 * Seems like Intel duplicated what was necessary
6051 		 * here to make the initial crop of 64-bit OSes work.
6052 		 * Hopefully, those are the only "extended" bits
6053 		 * they'll add.
6054 		 */
6055 		/*FALLTHROUGH*/
6056 
6057 	case X86_VENDOR_AMD:
6058 	case X86_VENDOR_HYGON:
6059 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
6060 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
6061 
6062 		*edx = CPI_FEATURES_XTD_EDX(cpi);
6063 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
6064 
6065 		/*
6066 		 * [no explicit support required beyond
6067 		 * x87 fp context and exception handlers]
6068 		 */
6069 		if (!fpu_exists)
6070 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
6071 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
6072 
6073 		/*
6074 		 * Now map the supported feature vector to
6075 		 * things that we think userland will care about.
6076 		 */
6077 		if (*edx & CPUID_AMD_EDX_MMXamd)
6078 			hwcap_flags |= AV_386_AMD_MMX;
6079 		if (*edx & CPUID_AMD_EDX_3DNow)
6080 			hwcap_flags |= AV_386_AMD_3DNow;
6081 		if (*edx & CPUID_AMD_EDX_3DNowx)
6082 			hwcap_flags |= AV_386_AMD_3DNowx;
6083 
6084 		switch (cpi->cpi_vendor) {
6085 		case X86_VENDOR_AMD:
6086 		case X86_VENDOR_HYGON:
6087 			if (*ecx & CPUID_AMD_ECX_AHF64)
6088 				hwcap_flags |= AV_386_AHF;
6089 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6090 				hwcap_flags |= AV_386_AMD_LZCNT;
6091 			break;
6092 
6093 		case X86_VENDOR_Intel:
6094 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6095 				hwcap_flags |= AV_386_AMD_LZCNT;
6096 			/*
6097 			 * Aarrgh.
6098 			 * Intel uses a different bit in the same word.
6099 			 */
6100 			if (*ecx & CPUID_INTC_ECX_AHF64)
6101 				hwcap_flags |= AV_386_AHF;
6102 			break;
6103 		default:
6104 			break;
6105 		}
6106 		break;
6107 
6108 	default:
6109 		break;
6110 	}
6111 
6112 resolve_done:
6113 	if (hwcap_out != NULL) {
6114 		hwcap_out[0] = hwcap_flags;
6115 		hwcap_out[1] = hwcap_flags_2;
6116 		hwcap_out[2] = hwcap_flags_3;
6117 	}
6118 }
6119 
6120 
6121 /*
6122  * Simulate the cpuid instruction using the data we previously
6123  * captured about this CPU.  We try our best to return the truth
6124  * about the hardware, independently of kernel support.
6125  */
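/*
 * A minimal usage sketch (illustrative only, not taken from a caller;
 * assumes the extended brand-string leaves exist on this CPU):
 *
 *	struct cpuid_regs cp = { 0 };
 *	cp.cp_eax = 0x80000002;
 *	(void) cpuid_insn(NULL, &cp);
 *
 * returns the cached %eax for the first brand-string leaf of the current
 * CPU and fills in the remaining registers from cpi_extd[] rather than
 * re-executing the cpuid instruction.
 */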
6126 uint32_t
6127 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
6128 {
6129 	struct cpuid_info *cpi;
6130 	struct cpuid_regs *xcp;
6131 
6132 	if (cpu == NULL)
6133 		cpu = CPU;
6134 	cpi = cpu->cpu_m.mcpu_cpi;
6135 
6136 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6137 
6138 	/*
6139 	 * CPUID data is cached in two separate places: cpi_std for standard
6140 	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
6141 	 */
6142 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
6143 		xcp = &cpi->cpi_std[cp->cp_eax];
6144 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
6145 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
6146 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
6147 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
6148 	} else {
6149 		/*
6150 		 * The caller is asking for data from an input parameter which
6151 		 * the kernel has not cached.  In this case we go fetch from
6152 		 * the hardware and return the data directly to the user.
6153 		 */
6154 		return (__cpuid_insn(cp));
6155 	}
6156 
6157 	cp->cp_eax = xcp->cp_eax;
6158 	cp->cp_ebx = xcp->cp_ebx;
6159 	cp->cp_ecx = xcp->cp_ecx;
6160 	cp->cp_edx = xcp->cp_edx;
6161 	return (cp->cp_eax);
6162 }
6163 
6164 boolean_t
6165 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
6166 {
6167 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
6168 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
6169 }
6170 
6171 int
6172 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
6173 {
6174 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6175 
6176 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
6177 }
6178 
6179 int
6180 cpuid_is_cmt(cpu_t *cpu)
6181 {
6182 	if (cpu == NULL)
6183 		cpu = CPU;
6184 
6185 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6186 
6187 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
6188 }
6189 
6190 /*
6191  * AMD and Intel both implement the 64-bit variant of the syscall
6192  * instruction (syscallq), so if there's -any- support for syscall,
6193  * cpuid currently says "yes, we support this".
6194  *
6195  * However, Intel decided to -not- implement the 32-bit variant of the
6196  * syscall instruction, so we provide a predicate to allow our caller
6197  * to test that subtlety here.
6198  *
6199  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
6200  *	even in the case where the hardware would in fact support it.
6201  */
6202 /*ARGSUSED*/
6203 int
6204 cpuid_syscall32_insn(cpu_t *cpu)
6205 {
6206 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
6207 
6208 #if !defined(__xpv)
6209 	if (cpu == NULL)
6210 		cpu = CPU;
6211 
6212 	/*CSTYLED*/
6213 	{
6214 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6215 
6216 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
6217 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
6218 		    cpi->cpi_xmaxeax >= 0x80000001 &&
6219 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
6220 			return (1);
6221 	}
6222 #endif
6223 	return (0);
6224 }
6225 
6226 int
6227 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
6228 {
6229 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6230 
6231 	static const char fmt[] =
6232 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
6233 	static const char fmt_ht[] =
6234 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6235 
6236 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6237 
6238 	if (cpuid_is_cmt(cpu))
6239 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6240 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6241 		    cpi->cpi_family, cpi->cpi_model,
6242 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6243 	return (snprintf(s, n, fmt,
6244 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6245 	    cpi->cpi_family, cpi->cpi_model,
6246 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6247 }
6248 
6249 const char *
6250 cpuid_getvendorstr(cpu_t *cpu)
6251 {
6252 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6253 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6254 }
6255 
6256 uint_t
6257 cpuid_getvendor(cpu_t *cpu)
6258 {
6259 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6260 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6261 }
6262 
6263 uint_t
6264 cpuid_getfamily(cpu_t *cpu)
6265 {
6266 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6267 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
6268 }
6269 
6270 uint_t
6271 cpuid_getmodel(cpu_t *cpu)
6272 {
6273 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6274 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
6275 }
6276 
6277 uint_t
6278 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6279 {
6280 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6281 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6282 }
6283 
6284 uint_t
6285 cpuid_get_ncore_per_chip(cpu_t *cpu)
6286 {
6287 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6288 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6289 }
6290 
6291 uint_t
6292 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6293 {
6294 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6295 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6296 }
6297 
6298 id_t
6299 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6300 {
6301 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6302 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6303 }
6304 
6305 uint_t
6306 cpuid_getstep(cpu_t *cpu)
6307 {
6308 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6309 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
6310 }
6311 
6312 uint_t
6313 cpuid_getsig(struct cpu *cpu)
6314 {
6315 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6316 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6317 }
6318 
6319 x86_chiprev_t
6320 cpuid_getchiprev(struct cpu *cpu)
6321 {
6322 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6323 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6324 }
6325 
6326 const char *
6327 cpuid_getchiprevstr(struct cpu *cpu)
6328 {
6329 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6330 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6331 }
6332 
6333 uint32_t
6334 cpuid_getsockettype(struct cpu *cpu)
6335 {
6336 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6337 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6338 }
6339 
6340 const char *
6341 cpuid_getsocketstr(cpu_t *cpu)
6342 {
6343 	static const char *socketstr = NULL;
6344 	struct cpuid_info *cpi;
6345 
6346 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6347 	cpi = cpu->cpu_m.mcpu_cpi;
6348 
6349 	/* Assume that socket types are the same across the system */
6350 	if (socketstr == NULL)
6351 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6352 		    cpi->cpi_model, cpi->cpi_step);
6353 
6354 
6355 	return (socketstr);
6356 }
6357 
6358 x86_uarchrev_t
6359 cpuid_getuarchrev(cpu_t *cpu)
6360 {
6361 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6362 }
6363 
6364 int
6365 cpuid_get_chipid(cpu_t *cpu)
6366 {
6367 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6368 
6369 	if (cpuid_is_cmt(cpu))
6370 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6371 	return (cpu->cpu_id);
6372 }
6373 
6374 id_t
6375 cpuid_get_coreid(cpu_t *cpu)
6376 {
6377 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6378 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6379 }
6380 
6381 int
6382 cpuid_get_pkgcoreid(cpu_t *cpu)
6383 {
6384 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6385 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6386 }
6387 
6388 int
6389 cpuid_get_clogid(cpu_t *cpu)
6390 {
6391 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6392 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6393 }
6394 
6395 int
6396 cpuid_get_cacheid(cpu_t *cpu)
6397 {
6398 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6399 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6400 }
6401 
6402 uint_t
6403 cpuid_get_procnodeid(cpu_t *cpu)
6404 {
6405 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6406 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6407 }
6408 
6409 uint_t
6410 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6411 {
6412 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6413 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6414 }
6415 
6416 uint_t
6417 cpuid_get_compunitid(cpu_t *cpu)
6418 {
6419 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6420 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6421 }
6422 
6423 uint_t
6424 cpuid_get_cores_per_compunit(cpu_t *cpu)
6425 {
6426 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6427 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6428 }
6429 
6430 uint32_t
6431 cpuid_get_apicid(cpu_t *cpu)
6432 {
6433 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6434 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6435 		return (UINT32_MAX);
6436 	} else {
6437 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6438 	}
6439 }
6440 
6441 void
6442 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6443 {
6444 	struct cpuid_info *cpi;
6445 
6446 	if (cpu == NULL)
6447 		cpu = CPU;
6448 	cpi = cpu->cpu_m.mcpu_cpi;
6449 
6450 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6451 
6452 	if (pabits)
6453 		*pabits = cpi->cpi_pabits;
6454 	if (vabits)
6455 		*vabits = cpi->cpi_vabits;
6456 }
6457 
6458 size_t
6459 cpuid_get_xsave_size(void)
6460 {
6461 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6462 	    sizeof (struct xsave_state)));
6463 }
6464 
6465 /*
6466  * Export information about known offsets to the kernel. We only care about
6467  * things we have actually enabled support for in %xcr0.
6468  */
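/*
 * A minimal usage sketch (illustrative; assumes AVX state is enabled in
 * %xcr0, since the VERIFY below requires the bit to be in xsave_bv_all):
 *
 *	size_t sz, off;
 *	cpuid_get_xsave_info(XFEATURE_AVX, &sz, &off);
 *
 * after which the upper %ymm state occupies 'sz' bytes starting 'off'
 * bytes into the xsave area.
 */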
6469 void
6470 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6471 {
6472 	size_t size, off;
6473 
6474 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6475 
6476 	if (sizep == NULL)
6477 		sizep = &size;
6478 	if (offp == NULL)
6479 		offp = &off;
6480 
6481 	switch (bit) {
6482 	case XFEATURE_LEGACY_FP:
6483 	case XFEATURE_SSE:
6484 		*sizep = sizeof (struct fxsave_state);
6485 		*offp = 0;
6486 		break;
6487 	case XFEATURE_AVX:
6488 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6489 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6490 		break;
6491 	case XFEATURE_AVX512_OPMASK:
6492 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6493 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6494 		break;
6495 	case XFEATURE_AVX512_ZMM:
6496 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6497 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6498 		break;
6499 	case XFEATURE_AVX512_HI_ZMM:
6500 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6501 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6502 		break;
6503 	default:
6504 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6505 	}
6506 }
6507 
6508 /*
6509  * Return true if the CPUs on this system require 'pointer clearing' for the
6510  * floating point error pointer exception handling. In the past, this has been
6511  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6512  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6513  * feature bit and is reflected in the cpi_fp_amd_save member.
6514  */
6515 boolean_t
6516 cpuid_need_fp_excp_handling(void)
6517 {
6518 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6519 	    cpuid_info0.cpi_fp_amd_save != 0);
6520 }
6521 
6522 /*
6523  * Returns the number of data TLB entries for a corresponding
6524  * pagesize.  If it can't be computed, or isn't known, the
6525  * routine returns zero.  If you ask about an architecturally
6526  * impossible pagesize, the routine will panic (so that the
6527  * hat implementor knows that things are inconsistent.)
6528  */
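/*
 * As a worked example (the register value is assumed for illustration):
 * if extended leaf 0x80000006 returned %ebx == 0x42004200, the top 16
 * bits are non-zero, so the 4K d-TLB count is BITX(%ebx, 27, 16) = 0x200,
 * i.e. 512 entries.
 */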
6529 uint_t
6530 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6531 {
6532 	struct cpuid_info *cpi;
6533 	uint_t dtlb_nent = 0;
6534 
6535 	if (cpu == NULL)
6536 		cpu = CPU;
6537 	cpi = cpu->cpu_m.mcpu_cpi;
6538 
6539 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6540 
6541 	/*
6542 	 * Check the L2 TLB info
6543 	 */
6544 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6545 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6546 
6547 		switch (pagesize) {
6548 
6549 		case 4 * 1024:
6550 			/*
6551 			 * All zero in the top 16 bits of the register
6552 			 * indicates a unified TLB. Size is in low 16 bits.
6553 			 */
6554 			if ((cp->cp_ebx & 0xffff0000) == 0)
6555 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6556 			else
6557 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6558 			break;
6559 
6560 		case 2 * 1024 * 1024:
6561 			if ((cp->cp_eax & 0xffff0000) == 0)
6562 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6563 			else
6564 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6565 			break;
6566 
6567 		default:
6568 			panic("unknown L2 pagesize");
6569 			/*NOTREACHED*/
6570 		}
6571 	}
6572 
6573 	if (dtlb_nent != 0)
6574 		return (dtlb_nent);
6575 
6576 	/*
6577 	 * No L2 TLB support for this size, try L1.
6578 	 */
6579 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6580 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6581 
6582 		switch (pagesize) {
6583 		case 4 * 1024:
6584 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6585 			break;
6586 		case 2 * 1024 * 1024:
6587 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6588 			break;
6589 		default:
6590 			panic("unknown L1 d-TLB pagesize");
6591 			/*NOTREACHED*/
6592 		}
6593 	}
6594 
6595 	return (dtlb_nent);
6596 }
6597 
6598 /*
6599  * Return 0 if the erratum is not present or not applicable, positive
6600  * if it is, and negative if the status of the erratum is unknown.
6601  *
6602  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6603  * Processors" #25759, Rev 3.57, August 2005
6604  */
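/*
 * As an illustration of how the revision macros below are used: a family
 * 0xf part whose cpuid signature (cpi_std[1].cp_eax) is 0xf50 matches
 * SH_B0(), so e.g. erratum 52, which tests B(eax), reports present (1)
 * for that revision, while erratum 113, which tests for 0x20fc0, reports 0.
 */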
6605 int
6606 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6607 {
6608 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6609 	uint_t eax;
6610 
6611 	/*
6612 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6613 	 * a legacy (32-bit) AMD CPU.
6614 	 */
6615 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6616 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6617 	    cpi->cpi_family == 6) {
6618 		return (0);
6619 	}
6620 
6621 	eax = cpi->cpi_std[1].cp_eax;
6622 
6623 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6624 #define	SH_B3(eax)	(eax == 0xf51)
6625 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6626 
6627 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6628 
6629 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6630 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6631 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6632 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6633 
6634 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6635 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6636 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6637 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6638 
6639 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6640 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6641 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6642 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6643 #define	BH_E4(eax)	(eax == 0x20fb1)
6644 #define	SH_E5(eax)	(eax == 0x20f42)
6645 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6646 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6647 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6648 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6649 			    DH_E6(eax) || JH_E6(eax))
6650 
6651 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6652 #define	DR_B0(eax)	(eax == 0x100f20)
6653 #define	DR_B1(eax)	(eax == 0x100f21)
6654 #define	DR_BA(eax)	(eax == 0x100f2a)
6655 #define	DR_B2(eax)	(eax == 0x100f22)
6656 #define	DR_B3(eax)	(eax == 0x100f23)
6657 #define	RB_C0(eax)	(eax == 0x100f40)
6658 
6659 	switch (erratum) {
6660 	case 1:
6661 		return (cpi->cpi_family < 0x10);
6662 	case 51:	/* what does the asterisk mean? */
6663 		return (B(eax) || SH_C0(eax) || CG(eax));
6664 	case 52:
6665 		return (B(eax));
6666 	case 57:
6667 		return (cpi->cpi_family <= 0x11);
6668 	case 58:
6669 		return (B(eax));
6670 	case 60:
6671 		return (cpi->cpi_family <= 0x11);
6672 	case 61:
6673 	case 62:
6674 	case 63:
6675 	case 64:
6676 	case 65:
6677 	case 66:
6678 	case 68:
6679 	case 69:
6680 	case 70:
6681 	case 71:
6682 		return (B(eax));
6683 	case 72:
6684 		return (SH_B0(eax));
6685 	case 74:
6686 		return (B(eax));
6687 	case 75:
6688 		return (cpi->cpi_family < 0x10);
6689 	case 76:
6690 		return (B(eax));
6691 	case 77:
6692 		return (cpi->cpi_family <= 0x11);
6693 	case 78:
6694 		return (B(eax) || SH_C0(eax));
6695 	case 79:
6696 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6697 	case 80:
6698 	case 81:
6699 	case 82:
6700 		return (B(eax));
6701 	case 83:
6702 		return (B(eax) || SH_C0(eax) || CG(eax));
6703 	case 85:
6704 		return (cpi->cpi_family < 0x10);
6705 	case 86:
6706 		return (SH_C0(eax) || CG(eax));
6707 	case 88:
6708 		return (B(eax) || SH_C0(eax));
6709 	case 89:
6710 		return (cpi->cpi_family < 0x10);
6711 	case 90:
6712 		return (B(eax) || SH_C0(eax) || CG(eax));
6713 	case 91:
6714 	case 92:
6715 		return (B(eax) || SH_C0(eax));
6716 	case 93:
6717 		return (SH_C0(eax));
6718 	case 94:
6719 		return (B(eax) || SH_C0(eax) || CG(eax));
6720 	case 95:
6721 		return (B(eax) || SH_C0(eax));
6722 	case 96:
6723 		return (B(eax) || SH_C0(eax) || CG(eax));
6724 	case 97:
6725 	case 98:
6726 		return (SH_C0(eax) || CG(eax));
6727 	case 99:
6728 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6729 	case 100:
6730 		return (B(eax) || SH_C0(eax));
6731 	case 101:
6732 	case 103:
6733 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6734 	case 104:
6735 		return (SH_C0(eax) || CG(eax) || D0(eax));
6736 	case 105:
6737 	case 106:
6738 	case 107:
6739 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6740 	case 108:
6741 		return (DH_CG(eax));
6742 	case 109:
6743 		return (SH_C0(eax) || CG(eax) || D0(eax));
6744 	case 110:
6745 		return (D0(eax) || EX(eax));
6746 	case 111:
6747 		return (CG(eax));
6748 	case 112:
6749 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6750 	case 113:
6751 		return (eax == 0x20fc0);
6752 	case 114:
6753 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6754 	case 115:
6755 		return (SH_E0(eax) || JH_E1(eax));
6756 	case 116:
6757 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6758 	case 117:
6759 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6760 	case 118:
6761 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6762 		    JH_E6(eax));
6763 	case 121:
6764 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6765 	case 122:
6766 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6767 	case 123:
6768 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6769 	case 131:
6770 		return (cpi->cpi_family < 0x10);
6771 	case 6336786:
6772 
6773 		/*
6774 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6775 		 * if this is a K8 family or newer processor. We're testing for
6776 		 * this 'erratum' to determine whether or not we have a constant
6777 		 * TSC.
6778 		 *
6779 		 * Our current fix for this is to disable the C1-Clock ramping.
6780 		 * However, this doesn't work on newer processor families nor
6781 		 * does it work when virtualized as those devices don't exist.
6782 		 */
6783 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6784 			return (0);
6785 		}
6786 
6787 		if (CPI_FAMILY(cpi) == 0xf) {
6788 			struct cpuid_regs regs;
6789 			regs.cp_eax = 0x80000007;
6790 			(void) __cpuid_insn(&regs);
6791 			return (!(regs.cp_edx & 0x100));
6792 		}
6793 		return (0);
6794 	case 147:
6795 		/*
6796 		 * This erratum (K8 #147) is not present on family 10 and newer.
6797 		 */
6798 		if (cpi->cpi_family >= 0x10) {
6799 			return (0);
6800 		}
6801 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6802 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6803 
6804 	case 6671130:
6805 		/*
6806 		 * check for processors (pre-Shanghai) that do not provide
6807 		 * optimal management of 1gb ptes in their tlbs.
6808 		 */
6809 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6810 
6811 	case 298:
6812 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6813 		    DR_B2(eax) || RB_C0(eax));
6814 
6815 	case 721:
6816 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6817 
6818 	default:
6819 		return (-1);
6820 
6821 	}
6822 }
6823 
6824 /*
6825  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6826  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6827  */
6828 int
6829 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6830 {
6831 	struct cpuid_info	*cpi;
6832 	uint_t			osvwid;
6833 	static int		osvwfeature = -1;
6834 	uint64_t		osvwlength;
6835 
6836 
6837 	cpi = cpu->cpu_m.mcpu_cpi;
6838 
6839 	/* confirm OSVW supported */
6840 	if (osvwfeature == -1) {
6841 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6842 	} else {
6843 		/* assert that osvw feature setting is consistent on all cpus */
6844 		ASSERT(osvwfeature ==
6845 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6846 	}
6847 	if (!osvwfeature)
6848 		return (-1);
6849 
6850 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6851 
6852 	switch (erratum) {
6853 	case 298:	/* osvwid is 0 */
6854 		osvwid = 0;
6855 		if (osvwlength <= (uint64_t)osvwid) {
6856 			/* osvwid 0 is unknown */
6857 			return (-1);
6858 		}
6859 
6860 		/*
6861 		 * Check the OSVW STATUS MSR to determine the state
6862 		 * of the erratum where:
6863 		 *   0 - fixed by HW
6864 		 *   1 - BIOS has applied the workaround when BIOS
6865 		 *   workaround is available. (Or for other errata,
6866 		 *   OS workaround is required.)
6867 		 * For a value of 1, caller will confirm that the
6868 		 * erratum 298 workaround has indeed been applied by BIOS.
6869 		 *
6870 		 * A 1 may be set in cpus that have a HW fix
6871 		 * in a mixed cpu system. Regarding erratum 298:
6872 		 *   In a multiprocessor platform, the workaround above
6873 		 *   should be applied to all processors regardless of
6874 		 *   silicon revision when an affected processor is
6875 		 *   present.
6876 		 */
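		/*
		 * Concretely, since osvwid is 0 here, the expression
		 * below reduces to checking bit 0 of the first OSVW
		 * status MSR, i.e. (rdmsr(MSR_AMD_OSVW_STATUS) & 1).
		 */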
6877 
6878 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6879 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6880 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6881 
6882 	default:
6883 		return (-1);
6884 	}
6885 }
6886 
6887 static const char assoc_str[] = "associativity";
6888 static const char line_str[] = "line-size";
6889 static const char size_str[] = "size";
6890 
6891 static void
6892 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6893     uint32_t val)
6894 {
6895 	char buf[128];
6896 
6897 	/*
6898 	 * ndi_prop_update_int() is used because it is desirable for
6899 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6900 	 */
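	/*
	 * The resulting property name is "<label>-<type>", for example
	 * "l2-cache-size" or "itlb-4K-associativity" (labels and types
	 * come from the callers below).
	 */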
6901 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6902 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6903 }
6904 
6905 /*
6906  * Intel-style cache/tlb description
6907  *
6908  * Standard cpuid level 2 gives a randomly ordered
6909  * selection of tags that index into a table that describes
6910  * cache and tlb properties.
6911  */
6912 
6913 static const char l1_icache_str[] = "l1-icache";
6914 static const char l1_dcache_str[] = "l1-dcache";
6915 static const char l2_cache_str[] = "l2-cache";
6916 static const char l3_cache_str[] = "l3-cache";
6917 static const char itlb4k_str[] = "itlb-4K";
6918 static const char dtlb4k_str[] = "dtlb-4K";
6919 static const char itlb2M_str[] = "itlb-2M";
6920 static const char itlb4M_str[] = "itlb-4M";
6921 static const char dtlb4M_str[] = "dtlb-4M";
6922 static const char dtlb24_str[] = "dtlb0-2M-4M";
6923 static const char itlb424_str[] = "itlb-4K-2M-4M";
6924 static const char itlb24_str[] = "itlb-2M-4M";
6925 static const char dtlb44_str[] = "dtlb-4K-4M";
6926 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6927 static const char sl2_cache_str[] = "sectored-l2-cache";
6928 static const char itrace_str[] = "itrace-cache";
6929 static const char sl3_cache_str[] = "sectored-l3-cache";
6930 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6931 
6932 static const struct cachetab {
6933 	uint8_t		ct_code;
6934 	uint8_t		ct_assoc;
6935 	uint16_t	ct_line_size;
6936 	size_t		ct_size;
6937 	const char	*ct_label;
6938 } intel_ctab[] = {
6939 	/*
6940 	 * maintain descending order!
6941 	 *
6942 	 * Codes ignored - Reason
6943 	 * ----------------------
6944 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6945 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6946 	 */
6947 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6948 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6949 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6950 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6951 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6952 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6953 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6954 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6955 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6956 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6957 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6958 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6959 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6960 	{ 0xc0, 4, 0, 8, dtlb44_str },
6961 	{ 0xba, 4, 0, 64, dtlb4k_str },
6962 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6963 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6964 	{ 0xb2, 4, 0, 64, itlb4k_str },
6965 	{ 0xb0, 4, 0, 128, itlb4k_str },
6966 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6967 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6968 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6969 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6970 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6971 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6972 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6973 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6974 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6975 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6976 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6977 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6978 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6979 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6980 	{ 0x73, 8, 0, 64*1024, itrace_str},
6981 	{ 0x72, 8, 0, 32*1024, itrace_str},
6982 	{ 0x71, 8, 0, 16*1024, itrace_str},
6983 	{ 0x70, 8, 0, 12*1024, itrace_str},
6984 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6985 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6986 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6987 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6988 	{ 0x5d, 0, 0, 256, dtlb44_str},
6989 	{ 0x5c, 0, 0, 128, dtlb44_str},
6990 	{ 0x5b, 0, 0, 64, dtlb44_str},
6991 	{ 0x5a, 4, 0, 32, dtlb24_str},
6992 	{ 0x59, 0, 0, 16, dtlb4k_str},
6993 	{ 0x57, 4, 0, 16, dtlb4k_str},
6994 	{ 0x56, 4, 0, 16, dtlb4M_str},
6995 	{ 0x55, 0, 0, 7, itlb24_str},
6996 	{ 0x52, 0, 0, 256, itlb424_str},
6997 	{ 0x51, 0, 0, 128, itlb424_str},
6998 	{ 0x50, 0, 0, 64, itlb424_str},
6999 	{ 0x4f, 0, 0, 32, itlb4k_str},
7000 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
7001 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
7002 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
7003 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
7004 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
7005 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
7006 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
7007 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
7008 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
7009 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
7010 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
7011 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
7012 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
7013 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
7014 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
7015 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
7016 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
7017 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
7018 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
7019 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
7020 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
7021 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
7022 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
7023 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
7024 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
7025 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
7026 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
7027 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
7028 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
7029 	{ 0x0b, 4, 0, 4, itlb4M_str},
7030 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
7031 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
7032 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
7033 	{ 0x05, 4, 0, 32, dtlb4M_str},
7034 	{ 0x04, 4, 0, 8, dtlb4M_str},
7035 	{ 0x03, 4, 0, 64, dtlb4k_str},
7036 	{ 0x02, 4, 0, 2, itlb4M_str},
7037 	{ 0x01, 4, 0, 32, itlb4k_str},
7038 	{ 0 }
7039 };
7040 
7041 static const struct cachetab cyrix_ctab[] = {
7042 	{ 0x70, 4, 0, 32, "tlb-4K" },
7043 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
7044 	{ 0 }
7045 };
7046 
7047 /*
7048  * Search a cache table for a matching entry
7049  */
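/*
 * Because the tables are kept in descending ct_code order, the scan can
 * stop at the first entry whose code is <= the one being looked up: e.g.
 * (illustrative) a lookup of 0x43 stops at the 0x43 entry and matches,
 * while a lookup of 0x6a stops at 0x68 and returns NULL.
 */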
7050 static const struct cachetab *
7051 find_cacheent(const struct cachetab *ct, uint_t code)
7052 {
7053 	if (code != 0) {
7054 		for (; ct->ct_code != 0; ct++)
7055 			if (ct->ct_code <= code)
7056 				break;
7057 		if (ct->ct_code == code)
7058 			return (ct);
7059 	}
7060 	return (NULL);
7061 }
7062 
7063 /*
7064  * Populate cachetab entry with L2 or L3 cache-information using
7065  * cpuid function 4. This function is called from intel_walk_cacheinfo()
7066  * when descriptor 0x49 is encountered. It returns 0 if no such cache
7067  * information is found.
7068  */
7069 static int
7070 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
7071 {
7072 	uint32_t level, i;
7073 	int ret = 0;
7074 
7075 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
7076 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
7077 
7078 		if (level == 2 || level == 3) {
7079 			ct->ct_assoc =
7080 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
7081 			ct->ct_line_size =
7082 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
7083 			ct->ct_size = ct->ct_assoc *
7084 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
7085 			    ct->ct_line_size *
7086 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
7087 
7088 			if (level == 2) {
7089 				ct->ct_label = l2_cache_str;
7090 			} else if (level == 3) {
7091 				ct->ct_label = l3_cache_str;
7092 			}
7093 			ret = 1;
7094 		}
7095 	}
7096 
7097 	return (ret);
7098 }
7099 
7100 /*
7101  * Walk the cacheinfo descriptor, applying 'func' to every valid element
7102  * The walk is terminated if the walker returns non-zero.
7103  */
7104 static void
7105 intel_walk_cacheinfo(struct cpuid_info *cpi,
7106     void *arg, int (*func)(void *, const struct cachetab *))
7107 {
7108 	const struct cachetab *ct;
7109 	struct cachetab des_49_ct, des_b1_ct;
7110 	uint8_t *dp;
7111 	int i;
7112 
7113 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7114 		return;
7115 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7116 		/*
7117 		 * For overloaded descriptor 0x49 we use cpuid function 4
7118 		 * if supported by the current processor, to create
7119 		 * cache information.
7120 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
7121 		 * to disambiguate the cache information.
7122 		 */
7123 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
7124 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
7125 				ct = &des_49_ct;
7126 		} else if (*dp == 0xb1) {
7127 			des_b1_ct.ct_code = 0xb1;
7128 			des_b1_ct.ct_assoc = 4;
7129 			des_b1_ct.ct_line_size = 0;
7130 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
7131 				des_b1_ct.ct_size = 8;
7132 				des_b1_ct.ct_label = itlb2M_str;
7133 			} else {
7134 				des_b1_ct.ct_size = 4;
7135 				des_b1_ct.ct_label = itlb4M_str;
7136 			}
7137 			ct = &des_b1_ct;
7138 		} else {
7139 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
7140 				continue;
7141 			}
7142 		}
7143 
7144 		if (func(arg, ct) != 0) {
7145 			break;
7146 		}
7147 	}
7148 }
7149 
7150 /*
7151  * (Like the Intel one, except for Cyrix CPUs)
7152  */
7153 static void
7154 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
7155     void *arg, int (*func)(void *, const struct cachetab *))
7156 {
7157 	const struct cachetab *ct;
7158 	uint8_t *dp;
7159 	int i;
7160 
7161 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7162 		return;
7163 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7164 		/*
7165 		 * Search Cyrix-specific descriptor table first ..
7166 		 */
7167 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
7168 			if (func(arg, ct) != 0)
7169 				break;
7170 			continue;
7171 		}
7172 		/*
7173 		 * .. else fall back to the Intel one
7174 		 */
7175 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
7176 			if (func(arg, ct) != 0)
7177 				break;
7178 			continue;
7179 		}
7180 	}
7181 }
7182 
7183 /*
7184  * A cacheinfo walker that adds associativity, line-size, and size properties
7185  * to the devinfo node it is passed as an argument.
7186  */
7187 static int
7188 add_cacheent_props(void *arg, const struct cachetab *ct)
7189 {
7190 	dev_info_t *devi = arg;
7191 
7192 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
7193 	if (ct->ct_line_size != 0)
7194 		add_cache_prop(devi, ct->ct_label, line_str,
7195 		    ct->ct_line_size);
7196 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
7197 	return (0);
7198 }
7199 
7200 
7201 static const char fully_assoc[] = "fully-associative?";
7202 
7203 /*
7204  * AMD style cache/tlb description
7205  *
7206  * Extended functions 5 and 6 directly describe properties of
7207  * tlbs and various cache levels.
7208  */
7209 static void
7210 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7211 {
7212 	switch (assoc) {
7213 	case 0:	/* reserved; ignore */
7214 		break;
7215 	default:
7216 		add_cache_prop(devi, label, assoc_str, assoc);
7217 		break;
7218 	case 0xff:
7219 		add_cache_prop(devi, label, fully_assoc, 1);
7220 		break;
7221 	}
7222 }
7223 
7224 static void
7225 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7226 {
7227 	if (size == 0)
7228 		return;
7229 	add_cache_prop(devi, label, size_str, size);
7230 	add_amd_assoc(devi, label, assoc);
7231 }
7232 
7233 static void
7234 add_amd_cache(dev_info_t *devi, const char *label,
7235     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7236 {
7237 	if (size == 0 || line_size == 0)
7238 		return;
7239 	add_amd_assoc(devi, label, assoc);
7240 	/*
7241 	 * Most AMD parts have a sectored cache. Multiple cache lines are
7242 	 * associated with each tag. A sector consists of all cache lines
7243 	 * associated with a tag. For example, the AMD K6-III has a sector
7244 	 * size of 2 cache lines per tag.
7245 	 */
7246 	if (lines_per_tag != 0)
7247 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7248 	add_cache_prop(devi, label, line_str, line_size);
7249 	add_cache_prop(devi, label, size_str, size * 1024);
7250 }
7251 
7252 static void
7253 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7254 {
7255 	switch (assoc) {
7256 	case 0:	/* off */
7257 		break;
7258 	case 1:
7259 	case 2:
7260 	case 4:
7261 		add_cache_prop(devi, label, assoc_str, assoc);
7262 		break;
7263 	case 6:
7264 		add_cache_prop(devi, label, assoc_str, 8);
7265 		break;
7266 	case 8:
7267 		add_cache_prop(devi, label, assoc_str, 16);
7268 		break;
7269 	case 0xf:
7270 		add_cache_prop(devi, label, fully_assoc, 1);
7271 		break;
7272 	default: /* reserved; ignore */
7273 		break;
7274 	}
7275 }
7276 
7277 static void
7278 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7279 {
7280 	if (size == 0 || assoc == 0)
7281 		return;
7282 	add_amd_l2_assoc(devi, label, assoc);
7283 	add_cache_prop(devi, label, size_str, size);
7284 }
7285 
7286 static void
7287 add_amd_l2_cache(dev_info_t *devi, const char *label,
7288     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7289 {
7290 	if (size == 0 || assoc == 0 || line_size == 0)
7291 		return;
7292 	add_amd_l2_assoc(devi, label, assoc);
7293 	if (lines_per_tag != 0)
7294 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7295 	add_cache_prop(devi, label, line_str, line_size);
7296 	add_cache_prop(devi, label, size_str, size * 1024);
7297 }
7298 
7299 static void
7300 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7301 {
7302 	struct cpuid_regs *cp;
7303 
7304 	if (cpi->cpi_xmaxeax < 0x80000005)
7305 		return;
7306 	cp = &cpi->cpi_extd[5];
7307 
7308 	/*
7309 	 * 4M/2M L1 TLB configuration
7310 	 *
7311 	 * We report the size for 2M pages because AMD uses two
7312 	 * TLB entries for one 4M page.
7313 	 */
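	/*
	 * For illustration (the register value is assumed, not from any
	 * particular part): %eax == 0xff20ff40 would describe a
	 * fully-associative (0xff) 32-entry 2M d-TLB and a
	 * fully-associative 64-entry 2M i-TLB.
	 */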
7314 	add_amd_tlb(devi, "dtlb-2M",
7315 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7316 	add_amd_tlb(devi, "itlb-2M",
7317 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7318 
7319 	/*
7320 	 * 4K L1 TLB configuration
7321 	 */
7322 
7323 	switch (cpi->cpi_vendor) {
7324 		uint_t nentries;
7325 	case X86_VENDOR_TM:
7326 		if (cpi->cpi_family >= 5) {
7327 			/*
7328 			 * Crusoe processors have 256 TLB entries, but
7329 			 * the cpuid data format constrains them to
7330 			 * reporting only 255 of them.
7331 			 */
7332 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7333 				nentries = 256;
7334 			/*
7335 			 * Crusoe processors also have a unified TLB
7336 			 */
7337 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7338 			    nentries);
7339 			break;
7340 		}
7341 		/*FALLTHROUGH*/
7342 	default:
7343 		add_amd_tlb(devi, itlb4k_str,
7344 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7345 		add_amd_tlb(devi, dtlb4k_str,
7346 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7347 		break;
7348 	}
7349 
7350 	/*
7351 	 * data L1 cache configuration
7352 	 */
7353 
7354 	add_amd_cache(devi, l1_dcache_str,
7355 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7356 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7357 
7358 	/*
7359 	 * code L1 cache configuration
7360 	 */
7361 
7362 	add_amd_cache(devi, l1_icache_str,
7363 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7364 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7365 
7366 	if (cpi->cpi_xmaxeax < 0x80000006)
7367 		return;
7368 	cp = &cpi->cpi_extd[6];
7369 
7370 	/* Check for a unified L2 TLB for large pages */
7371 
7372 	if (BITX(cp->cp_eax, 31, 16) == 0)
7373 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7374 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7375 	else {
7376 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7377 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7378 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7379 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7380 	}
7381 
7382 	/* Check for a unified L2 TLB for 4K pages */
7383 
7384 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7385 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7386 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7387 	} else {
7388 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7389 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7390 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7391 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7392 	}
7393 
7394 	add_amd_l2_cache(devi, l2_cache_str,
7395 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7396 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7397 }
7398 
7399 /*
7400  * There are two basic ways that the x86 world describes its cache
7401  * and tlb architecture - Intel's way and AMD's way.
7402  *
7403  * Return which flavor of cache architecture we should use.
7404  */
7405 static int
7406 x86_which_cacheinfo(struct cpuid_info *cpi)
7407 {
7408 	switch (cpi->cpi_vendor) {
7409 	case X86_VENDOR_Intel:
7410 		if (cpi->cpi_maxeax >= 2)
7411 			return (X86_VENDOR_Intel);
7412 		break;
7413 	case X86_VENDOR_AMD:
7414 		/*
7415 		 * The K5 model 1 was the first part from AMD that reported
7416 		 * cache sizes via extended cpuid functions.
7417 		 */
7418 		if (cpi->cpi_family > 5 ||
7419 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7420 			return (X86_VENDOR_AMD);
7421 		break;
7422 	case X86_VENDOR_HYGON:
7423 		return (X86_VENDOR_AMD);
7424 	case X86_VENDOR_TM:
7425 		if (cpi->cpi_family >= 5)
7426 			return (X86_VENDOR_AMD);
7427 		/*FALLTHROUGH*/
7428 	default:
7429 		/*
7430 		 * If they have extended CPU data for 0x80000005
7431 		 * then we assume they have AMD-format cache
7432 		 * information.
7433 		 *
7434 		 * If not, and the vendor happens to be Cyrix,
7435 		 * then try our Cyrix-specific handler.
7436 		 *
7437 		 * If we're not Cyrix, then assume we're using Intel's
7438 		 * table-driven format instead.
7439 		 */
7440 		if (cpi->cpi_xmaxeax >= 0x80000005)
7441 			return (X86_VENDOR_AMD);
7442 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7443 			return (X86_VENDOR_Cyrix);
7444 		else if (cpi->cpi_maxeax >= 2)
7445 			return (X86_VENDOR_Intel);
7446 		break;
7447 	}
7448 	return (-1);
7449 }
7450 
7451 void
7452 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7453     struct cpuid_info *cpi)
7454 {
7455 	dev_info_t *cpu_devi;
7456 	int create;
7457 
7458 	cpu_devi = (dev_info_t *)dip;
7459 
7460 	/* device_type */
7461 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7462 	    "device_type", "cpu");
7463 
7464 	/* reg */
7465 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7466 	    "reg", cpu_id);
7467 
7468 	/* cpu-mhz, and clock-frequency */
7469 	if (cpu_freq > 0) {
7470 		long long mul;
7471 
7472 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7473 		    "cpu-mhz", cpu_freq);
7474 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7475 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7476 			    "clock-frequency", (int)mul);
7477 	}
7478 
7479 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7480 
7481 	/* vendor-id */
7482 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7483 	    "vendor-id", cpi->cpi_vendorstr);
7484 
7485 	if (cpi->cpi_maxeax == 0) {
7486 		return;
7487 	}
7488 
7489 	/*
7490 	 * family, model, and step
7491 	 */
7492 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7493 	    "family", CPI_FAMILY(cpi));
7494 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7495 	    "cpu-model", CPI_MODEL(cpi));
7496 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7497 	    "stepping-id", CPI_STEP(cpi));
7498 
7499 	/* type */
7500 	switch (cpi->cpi_vendor) {
7501 	case X86_VENDOR_Intel:
7502 		create = 1;
7503 		break;
7504 	default:
7505 		create = 0;
7506 		break;
7507 	}
7508 	if (create)
7509 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7510 		    "type", CPI_TYPE(cpi));
7511 
7512 	/* ext-family */
7513 	switch (cpi->cpi_vendor) {
7514 	case X86_VENDOR_Intel:
7515 	case X86_VENDOR_AMD:
7516 		create = cpi->cpi_family >= 0xf;
7517 		break;
7518 	case X86_VENDOR_HYGON:
7519 		create = 1;
7520 		break;
7521 	default:
7522 		create = 0;
7523 		break;
7524 	}
7525 	if (create)
7526 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7527 		    "ext-family", CPI_FAMILY_XTD(cpi));
7528 
7529 	/* ext-model */
7530 	switch (cpi->cpi_vendor) {
7531 	case X86_VENDOR_Intel:
7532 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7533 		break;
7534 	case X86_VENDOR_AMD:
7535 		create = CPI_FAMILY(cpi) == 0xf;
7536 		break;
7537 	case X86_VENDOR_HYGON:
7538 		create = 1;
7539 		break;
7540 	default:
7541 		create = 0;
7542 		break;
7543 	}
7544 	if (create)
7545 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7546 		    "ext-model", CPI_MODEL_XTD(cpi));
7547 
7548 	/* generation */
7549 	switch (cpi->cpi_vendor) {
7550 	case X86_VENDOR_AMD:
7551 	case X86_VENDOR_HYGON:
7552 		/*
7553 		 * AMD K5 model 1 was the first part to support this
7554 		 */
7555 		create = cpi->cpi_xmaxeax >= 0x80000001;
7556 		break;
7557 	default:
7558 		create = 0;
7559 		break;
7560 	}
7561 	if (create)
7562 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7563 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7564 
7565 	/* brand-id */
7566 	switch (cpi->cpi_vendor) {
7567 	case X86_VENDOR_Intel:
7568 		/*
7569 		 * brand id first appeared on Pentium III Xeon model 8,
7570 		 * and Celeron model 8 processors and Opteron
7571 		 */
7572 		create = cpi->cpi_family > 6 ||
7573 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7574 		break;
7575 	case X86_VENDOR_AMD:
7576 		create = cpi->cpi_family >= 0xf;
7577 		break;
7578 	case X86_VENDOR_HYGON:
7579 		create = 1;
7580 		break;
7581 	default:
7582 		create = 0;
7583 		break;
7584 	}
7585 	if (create && cpi->cpi_brandid != 0) {
7586 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7587 		    "brand-id", cpi->cpi_brandid);
7588 	}
7589 
7590 	/* chunks, and apic-id */
7591 	switch (cpi->cpi_vendor) {
7592 		/*
7593 		 * first available on Pentium IV and Opteron (K8)
7594 		 */
7595 	case X86_VENDOR_Intel:
7596 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7597 		break;
7598 	case X86_VENDOR_AMD:
7599 		create = cpi->cpi_family >= 0xf;
7600 		break;
7601 	case X86_VENDOR_HYGON:
7602 		create = 1;
7603 		break;
7604 	default:
7605 		create = 0;
7606 		break;
7607 	}
7608 	if (create) {
7609 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7610 		    "chunks", CPI_CHUNKS(cpi));
7611 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7612 		    "apic-id", cpi->cpi_apicid);
7613 		if (cpi->cpi_chipid >= 0) {
7614 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7615 			    "chip#", cpi->cpi_chipid);
7616 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7617 			    "clog#", cpi->cpi_clogid);
7618 		}
7619 	}
7620 
7621 	/* cpuid-features */
7622 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7623 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7624 
7625 
7626 	/* cpuid-features-ecx */
7627 	switch (cpi->cpi_vendor) {
7628 	case X86_VENDOR_Intel:
7629 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7630 		break;
7631 	case X86_VENDOR_AMD:
7632 		create = cpi->cpi_family >= 0xf;
7633 		break;
7634 	case X86_VENDOR_HYGON:
7635 		create = 1;
7636 		break;
7637 	default:
7638 		create = 0;
7639 		break;
7640 	}
7641 	if (create)
7642 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7643 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7644 
7645 	/* ext-cpuid-features */
7646 	switch (cpi->cpi_vendor) {
7647 	case X86_VENDOR_Intel:
7648 	case X86_VENDOR_AMD:
7649 	case X86_VENDOR_HYGON:
7650 	case X86_VENDOR_Cyrix:
7651 	case X86_VENDOR_TM:
7652 	case X86_VENDOR_Centaur:
7653 		create = cpi->cpi_xmaxeax >= 0x80000001;
7654 		break;
7655 	default:
7656 		create = 0;
7657 		break;
7658 	}
7659 	if (create) {
7660 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7661 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7662 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7663 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7664 	}
7665 
7666 	/*
7667 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7668 	 * model 1, and Cyrix GXm.  On earlier models we try to
7669 	 * simulate something similar, so this string should always
7670 	 * say -something- about the processor, however lame.
7671 	 */
7672 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7673 	    "brand-string", cpi->cpi_brandstr);
7674 
7675 	/*
7676 	 * Finally, cache and tlb information
7677 	 */
7678 	switch (x86_which_cacheinfo(cpi)) {
7679 	case X86_VENDOR_Intel:
7680 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7681 		break;
7682 	case X86_VENDOR_Cyrix:
7683 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7684 		break;
7685 	case X86_VENDOR_AMD:
7686 		amd_cache_info(cpi, cpu_devi);
7687 		break;
7688 	default:
7689 		break;
7690 	}
7691 }
7692 
7693 struct l2info {
7694 	int *l2i_csz;
7695 	int *l2i_lsz;
7696 	int *l2i_assoc;
7697 	int l2i_ret;
7698 };
7699 
7700 /*
7701  * A cacheinfo walker that fetches the size, line-size and associativity
7702  * of the L2 cache
7703  */
7704 static int
7705 intel_l2cinfo(void *arg, const struct cachetab *ct)
7706 {
7707 	struct l2info *l2i = arg;
7708 	int *ip;
7709 
7710 	if (ct->ct_label != l2_cache_str &&
7711 	    ct->ct_label != sl2_cache_str)
7712 		return (0);	/* not an L2 -- keep walking */
7713 
7714 	if ((ip = l2i->l2i_csz) != NULL)
7715 		*ip = ct->ct_size;
7716 	if ((ip = l2i->l2i_lsz) != NULL)
7717 		*ip = ct->ct_line_size;
7718 	if ((ip = l2i->l2i_assoc) != NULL)
7719 		*ip = ct->ct_assoc;
7720 	l2i->l2i_ret = ct->ct_size;
7721 	return (1);		/* was an L2 -- terminate walk */
7722 }
7723 
7724 /*
7725  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7726  *
7727  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7728  *	value is the associativity, the associativity for the L2 cache and
7729  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7730  *	an index into the amd_afd[] array to determine the associativity.
7731  *	-1 is undefined. 0 is fully associative.
7732  */
7733 
7734 static int amd_afd[] =
7735 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
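/*
 * For example (values taken from the table above): an associativity code of
 * 0x6 read from leaf 0x80000006 indexes amd_afd[6] and decodes to an 8-way
 * set associative cache, 0xF decodes to fully associative, and a code of 0
 * maps to -1 (undefined) and is skipped by amd_l2cacheinfo() below, e.g.
 *
 *	assoc = amd_afd[BITX(cp->cp_ecx, 15, 12)];
 */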
7736 
7737 static void
7738 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7739 {
7740 	struct cpuid_regs *cp;
7741 	uint_t size, assoc;
7742 	int i;
7743 	int *ip;
7744 
7745 	if (cpi->cpi_xmaxeax < 0x80000006)
7746 		return;
7747 	cp = &cpi->cpi_extd[6];
7748 
7749 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7750 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7751 		uint_t cachesz = size * 1024;
7752 		assoc = amd_afd[i];
7753 
7754 		ASSERT(assoc != -1);
7755 
7756 		if ((ip = l2i->l2i_csz) != NULL)
7757 			*ip = cachesz;
7758 		if ((ip = l2i->l2i_lsz) != NULL)
7759 			*ip = BITX(cp->cp_ecx, 7, 0);
7760 		if ((ip = l2i->l2i_assoc) != NULL)
7761 			*ip = assoc;
7762 		l2i->l2i_ret = cachesz;
7763 	}
7764 }
7765 
7766 int
7767 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7768 {
7769 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7770 	struct l2info __l2info, *l2i = &__l2info;
7771 
7772 	l2i->l2i_csz = csz;
7773 	l2i->l2i_lsz = lsz;
7774 	l2i->l2i_assoc = assoc;
7775 	l2i->l2i_ret = -1;
7776 
7777 	switch (x86_which_cacheinfo(cpi)) {
7778 	case X86_VENDOR_Intel:
7779 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7780 		break;
7781 	case X86_VENDOR_Cyrix:
7782 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7783 		break;
7784 	case X86_VENDOR_AMD:
7785 		amd_l2cacheinfo(cpi, l2i);
7786 		break;
7787 	default:
7788 		break;
7789 	}
7790 	return (l2i->l2i_ret);
7791 }
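/*
 * Hypothetical caller of getl2cacheinfo(), for illustration only; the local
 * variables are invented for the example.  Any of the three pointers may be
 * NULL if that datum is not needed, and a return value of -1 means no L2
 * information was found.
 *
 *	int csz, lsz, assoc;
 *
 *	if (getl2cacheinfo(CPU, &csz, &lsz, &assoc) > 0) {
 *		cmn_err(CE_CONT, "?l2: %d bytes, %d byte lines, assoc %d\n",
 *		    csz, lsz, assoc);
 *	}
 */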
7792 
7793 #if !defined(__xpv)
7794 
7795 uint32_t *
7796 cpuid_mwait_alloc(cpu_t *cpu)
7797 {
7798 	uint32_t	*ret;
7799 	size_t		mwait_size;
7800 
7801 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7802 
7803 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7804 	if (mwait_size == 0)
7805 		return (NULL);
7806 
7807 	/*
7808 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7809 	 * allocations.  mwait_size is currently cache line sized.  Neither
7810 	 * of these implementation details is guaranteed to be true in the
7811 	 * future.
7812 	 *
7813 	 * First try allocating mwait_size as kmem_alloc() currently returns
7814 	 * correctly aligned memory.  If kmem_alloc() does not return
7815 	 * mwait_size aligned memory, then allocate 2 * mwait_size and round up.
7816 	 *
7817 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7818 	 * decide to free this memory.
7819 	 */
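	/*
	 * Worked example with made-up numbers: if mwait_size is 64 and
	 * kmem_zalloc() happens to return 0xff001040, P2ROUNDUP() leaves the
	 * pointer unchanged and the buffer is used directly.  Had it
	 * returned 0xff001048 instead, that buffer would be freed, 128 bytes
	 * allocated in its place, and the first 64-byte aligned address
	 * within the larger buffer handed back to the caller.
	 */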
7820 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7821 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7822 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7823 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7824 		*ret = MWAIT_RUNNING;
7825 		return (ret);
7826 	} else {
7827 		kmem_free(ret, mwait_size);
7828 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7829 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7830 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7831 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7832 		*ret = MWAIT_RUNNING;
7833 		return (ret);
7834 	}
7835 }
7836 
7837 void
7838 cpuid_mwait_free(cpu_t *cpu)
7839 {
7840 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7841 		return;
7842 	}
7843 
7844 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7845 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7846 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7847 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7848 	}
7849 
7850 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7851 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7852 }
7853 
7854 void
7855 patch_tsc_read(int flag)
7856 {
7857 	size_t cnt;
7858 
7859 	switch (flag) {
7860 	case TSC_NONE:
7861 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7862 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7863 		break;
7864 	case TSC_RDTSC_LFENCE:
7865 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7866 		(void) memcpy((void *)tsc_read,
7867 		    (void *)&_tsc_lfence_start, cnt);
7868 		break;
7869 	case TSC_TSCP:
7870 		cnt = &_tscp_end - &_tscp_start;
7871 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7872 		break;
7873 	default:
7874 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7875 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7876 		break;
7877 	}
7878 	tsc_type = flag;
7879 }
7880 
7881 int
7882 cpuid_deep_cstates_supported(void)
7883 {
7884 	struct cpuid_info *cpi;
7885 	struct cpuid_regs regs;
7886 
7887 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7888 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7889 
7890 	cpi = CPU->cpu_m.mcpu_cpi;
7891 
7892 	switch (cpi->cpi_vendor) {
7893 	case X86_VENDOR_Intel:
7894 	case X86_VENDOR_AMD:
7895 	case X86_VENDOR_HYGON:
7896 		if (cpi->cpi_xmaxeax < 0x80000007)
7897 			return (0);
7898 
7899 		/*
7900 		 * Does TSC run at a constant rate in all C-states?
7901 		 */
7902 		regs.cp_eax = 0x80000007;
7903 		(void) __cpuid_insn(&regs);
7904 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7905 
7906 	default:
7907 		return (0);
7908 	}
7909 }
7910 
7911 #endif	/* !__xpv */
7912 
7913 void
7914 post_startup_cpu_fixups(void)
7915 {
7916 #ifndef __xpv
7917 	/*
7918 	 * Some AMD processors support C1E state. Entering this state will
7919 	 * cause the local APIC timer to stop, which we can't deal with at
7920 	 * this time.
7921 	 */
7922 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7923 		on_trap_data_t otd;
7924 		uint64_t reg;
7925 
7926 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7927 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7928 			/* Disable C1E state if it is enabled by BIOS */
7929 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7930 			    AMD_ACTONCMPHALT_MASK) {
7931 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7932 				    AMD_ACTONCMPHALT_SHIFT);
7933 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7934 			}
7935 		}
7936 		no_trap();
7937 	}
7938 #endif	/* !__xpv */
7939 }
7940 
7941 void
7942 enable_pcid(void)
7943 {
7944 	if (x86_use_pcid == -1)
7945 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7946 
7947 	if (x86_use_invpcid == -1) {
7948 		x86_use_invpcid = is_x86_feature(x86_featureset,
7949 		    X86FSET_INVPCID);
7950 	}
7951 
7952 	if (!x86_use_pcid)
7953 		return;
7954 
7955 	/*
7956 	 * Intel say that on setting PCIDE, it immediately starts using the PCID
7957 	 * bits; better make sure there's nothing there.
7958 	 */
7959 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7960 
7961 	setcr4(getcr4() | CR4_PCIDE);
7962 }
7963 
7964 /*
7965  * Setup necessary registers to enable XSAVE feature on this processor.
7966  * This function needs to be called early enough, so that no xsave/xrstor
7967  * ops will execute on the processor before the MSRs are properly set up.
7968  *
7969  * Current implementation has the following assumption:
7970  * - cpuid_pass_basic() is done, so that X86 features are known.
7971  * - fpu_probe() is done, so that fp_save_mech is chosen.
7972  */
7973 void
7974 xsave_setup_msr(cpu_t *cpu)
7975 {
7976 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7977 	ASSERT(fp_save_mech == FP_XSAVE);
7978 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7979 
7980 	/* Enable OSXSAVE in CR4. */
7981 	setcr4(getcr4() | CR4_OSXSAVE);
7982 	/*
7983 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7984 	 * correct value.
7985 	 */
7986 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7987 	setup_xfem();
7988 }
7989 
7990 /*
7991  * Starting with the Westmere processor the local
7992  * APIC timer will continue running in all C-states,
7993  * including the deepest C-states.
7994  */
7995 int
7996 cpuid_arat_supported(void)
7997 {
7998 	struct cpuid_info *cpi;
7999 	struct cpuid_regs regs;
8000 
8001 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8002 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8003 
8004 	cpi = CPU->cpu_m.mcpu_cpi;
8005 
8006 	switch (cpi->cpi_vendor) {
8007 	case X86_VENDOR_Intel:
8008 	case X86_VENDOR_AMD:
8009 	case X86_VENDOR_HYGON:
8010 		/*
8011 		 * Always-running Local APIC Timer is
8012 		 * indicated by CPUID.6.EAX[2].
8013 		 */
8014 		if (cpi->cpi_maxeax >= 6) {
8015 			regs.cp_eax = 6;
8016 			(void) cpuid_insn(NULL, &regs);
8017 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
8018 		} else {
8019 			return (0);
8020 		}
8021 	default:
8022 		return (0);
8023 	}
8024 }
8025 
8026 /*
8027  * Check support for Intel ENERGY_PERF_BIAS feature
8028  */
8029 int
8030 cpuid_iepb_supported(struct cpu *cp)
8031 {
8032 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
8033 	struct cpuid_regs regs;
8034 
8035 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
8036 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8037 
8038 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
8039 		return (0);
8040 	}
8041 
8042 	/*
8043 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
8044 	 * capability bit CPUID.6.ECX.3
8045 	 */
8046 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
8047 		return (0);
8048 
8049 	regs.cp_eax = 0x6;
8050 	(void) cpuid_insn(NULL, &regs);
8051 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
8052 }
8053 
8054 /*
8055  * Check support for TSC deadline timer
8056  *
8057  * TSC deadline timer provides a superior software programming
8058  * model over the local APIC timer that eliminates "time drifts".
8059  * Instead of specifying a relative time, software specifies an
8060  * absolute time as the target at which the processor should
8061  * generate a timer event.
8062  */
8063 int
8064 cpuid_deadline_tsc_supported(void)
8065 {
8066 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
8067 	struct cpuid_regs regs;
8068 
8069 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8070 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8071 
8072 	switch (cpi->cpi_vendor) {
8073 	case X86_VENDOR_Intel:
8074 		if (cpi->cpi_maxeax >= 1) {
8075 			regs.cp_eax = 1;
8076 			(void) cpuid_insn(NULL, &regs);
8077 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
8078 		} else {
8079 			return (0);
8080 		}
8081 	default:
8082 		return (0);
8083 	}
8084 }
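/*
 * A minimal sketch of the deadline programming model, not of any particular
 * consumer in the timer code: with the local APIC LVT timer configured for
 * TSC-deadline mode, a one-shot event some number of TSC ticks in the future
 * is armed by writing an absolute TSC value to the IA32_TSC_DEADLINE MSR
 * (0x6e0); writing zero disarms it.  'delta_ticks' below is an assumed value:
 *
 *	wrmsr(0x6e0, tsc_read() + delta_ticks);
 */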
8085 
8086 #if !defined(__xpv)
8087 /*
8088  * Patch in versions of bcopy for high performance Intel Nhm processors
8089  * and later...
8090  */
8091 void
8092 patch_memops(uint_t vendor)
8093 {
8094 	size_t cnt, i;
8095 	caddr_t to, from;
8096 
8097 	if ((vendor == X86_VENDOR_Intel) &&
8098 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
8099 		cnt = &bcopy_patch_end - &bcopy_patch_start;
8100 		to = &bcopy_ck_size;
8101 		from = &bcopy_patch_start;
8102 		for (i = 0; i < cnt; i++) {
8103 			*to++ = *from++;
8104 		}
8105 	}
8106 }
8107 #endif  /*  !__xpv */
8108 
8109 /*
8110  * We're being asked to tell the system how many bits are required to represent
8111  * the various core and strand IDs. While it's tempting to derive this based
8112  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
8113  * correct. Instead, this needs to be based on the number of bits that the APIC
8114  * allows for these different configurations. We only update these to a larger
8115  * value if we find one.
8116  */
8117 void
8118 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
8119 {
8120 	struct cpuid_info *cpi;
8121 
8122 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8123 	cpi = cpu->cpu_m.mcpu_cpi;
8124 
8125 	if (cpi->cpi_ncore_bits > *core_nbits) {
8126 		*core_nbits = cpi->cpi_ncore_bits;
8127 	}
8128 
8129 	if (cpi->cpi_nthread_bits > *strand_nbits) {
8130 		*strand_nbits = cpi->cpi_nthread_bits;
8131 	}
8132 }
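/*
 * For example (hypothetical values): with *core_nbits == 3 and
 * *strand_nbits == 1, the usual APIC ID layout uses the low bit to select
 * the strand within a core and the next three bits to select the core, so
 * each package spans 1 << (3 + 1) == 16 APIC IDs even if fewer cores or
 * strands are actually present.
 */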
8133 
8134 void
8135 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
8136 {
8137 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
8138 	struct cpuid_regs cp;
8139 
8140 	/*
8141 	 * Reread the CPUID portions that we need for various security
8142 	 * information.
8143 	 */
8144 	switch (cpi->cpi_vendor) {
8145 	case X86_VENDOR_Intel:
8146 		/*
8147 		 * Check if we now have leaf 7 available to us.
8148 		 */
8149 		if (cpi->cpi_maxeax < 7) {
8150 			bzero(&cp, sizeof (cp));
8151 			cp.cp_eax = 0;
8152 			cpi->cpi_maxeax = __cpuid_insn(&cp);
8153 			if (cpi->cpi_maxeax < 7)
8154 				break;
8155 		}
8156 
8157 		bzero(&cp, sizeof (cp));
8158 		cp.cp_eax = 7;
8159 		cp.cp_ecx = 0;
8160 		(void) __cpuid_insn(&cp);
8161 		cpi->cpi_std[7] = cp;
8162 		break;
8163 
8164 	case X86_VENDOR_AMD:
8165 	case X86_VENDOR_HYGON:
8166 		/* No xcpuid support */
8167 		if (cpi->cpi_family < 5 ||
8168 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
8169 			break;
8170 
8171 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
8172 			bzero(&cp, sizeof (cp));
8173 			cp.cp_eax = CPUID_LEAF_EXT_0;
8174 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
8175 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
8176 				break;
8177 		}
8178 
8179 		/*
8180 		 * Most AMD features are in leaf 8. Automatic IBRS was added in
8181 		 * leaf 0x21. So we also check that.
8182 		 */
8183 		bzero(&cp, sizeof (cp));
8184 		cp.cp_eax = CPUID_LEAF_EXT_8;
8185 		(void) __cpuid_insn(&cp);
8186 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
8187 		cpi->cpi_extd[8] = cp;
8188 
8189 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21)
8190 			break;
8191 
8192 		bzero(&cp, sizeof (cp));
8193 		cp.cp_eax = CPUID_LEAF_EXT_21;
8194 		(void) __cpuid_insn(&cp);
8195 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
8196 		cpi->cpi_extd[0x21] = cp;
8197 		break;
8198 
8199 	default:
8200 		/*
8201 		 * Nothing to do here. Return an empty set which has already
8202 		 * been zeroed for us.
8203 		 */
8204 		return;
8205 	}
8206 
8207 	cpuid_scan_security(cpu, fset);
8208 }
8209 
8210 /* ARGSUSED */
8211 static int
8212 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
8213 {
8214 	uchar_t *fset;
8215 	boolean_t first_pass = (boolean_t)arg1;
8216 
8217 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
8218 	if (first_pass && CPU->cpu_id != 0)
8219 		return (0);
8220 	if (!first_pass && CPU->cpu_id == 0)
8221 		return (0);
8222 	cpuid_pass_ucode(CPU, fset);
8223 
8224 	return (0);
8225 }
8226 
8227 /*
8228  * After a microcode update where the version has changed, we need to
8229  * rescan CPUID. To do this we check every CPU to make sure that they have the
8230  * same microcode. Then we perform a cross call to all such CPUs. It's the
8231  * caller's job to make sure that no one else can end up doing an update while
8232  * this is going on.
8233  *
8234  * We assume that the system is microcode capable if we're called.
8235  */
8236 void
8237 cpuid_post_ucodeadm(void)
8238 {
8239 	uint32_t rev;
8240 	int i;
8241 	struct cpu *cpu;
8242 	cpuset_t cpuset;
8243 	void *argdata;
8244 	uchar_t *f0;
8245 
8246 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
8247 
8248 	mutex_enter(&cpu_lock);
8249 	cpu = cpu_get(0);
8250 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
8251 	CPUSET_ONLY(cpuset, 0);
8252 	for (i = 1; i < max_ncpus; i++) {
8253 		if ((cpu = cpu_get(i)) == NULL)
8254 			continue;
8255 
8256 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
8257 			panic("post microcode update CPU %d has differing "
8258 			    "microcode revision (%u) from CPU 0 (%u)",
8259 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
8260 		}
8261 		CPUSET_ADD(cpuset, i);
8262 	}
8263 
8264 	/*
8265 	 * We do the cross calls in two passes. The first pass is only for the
8266 	 * boot CPU. The second pass is for all of the other CPUs. This allows
8267 	 * the boot CPU to go through and change behavior related to patching or
8268 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
8269 	 * other CPUs to follow suit.
8270 	 */
8271 	kpreempt_disable();
8272 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
8273 	    cpuid_post_ucodeadm_xc);
8274 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
8275 	    cpuid_post_ucodeadm_xc);
8276 	kpreempt_enable();
8277 
8278 	/*
8279 	 * OK, now look at each CPU and see if their feature sets are equal.
8280 	 */
8281 	f0 = argdata;
8282 	for (i = 1; i < max_ncpus; i++) {
8283 		uchar_t *fset;
8284 		if (!CPU_IN_SET(cpuset, i))
8285 			continue;
8286 
8287 		fset = (uchar_t *)((uintptr_t)argdata +
8288 		    sizeof (x86_featureset) * i);
8289 
8290 		if (!compare_x86_featureset(f0, fset)) {
8291 			panic("Post microcode update CPU %d has "
8292 			    "differing security feature (%p) set from CPU 0 "
8293 			    "(%p), not appending to feature set", i,
8294 			    (void *)fset, (void *)f0);
8295 		}
8296 	}
8297 
8298 	mutex_exit(&cpu_lock);
8299 
8300 	for (i = 0; i < NUM_X86_FEATURES; i++) {
8301 		if (is_x86_feature(f0, i)) {
8302 			cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8303 			    x86_feature_names[i]);
8304 			add_x86_feature(x86_featureset, i);
8305 		}
8306 	}
8307 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8308 }
8309 
8310 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8311 
8312 typedef struct cpuid_pass_def {
8313 	cpuid_pass_t cpd_pass;
8314 	cpuid_pass_f cpd_func;
8315 } cpuid_pass_def_t;
8316 
8317 /*
8318  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8319  * normal sense and should not appear here.
8320  */
8321 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8322 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8323 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
8324 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8325 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8326 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8327 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8328 };
8329 
8330 void
8331 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8332 {
8333 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8334 
8335 	if (cp == NULL)
8336 		cp = CPU;
8337 
8338 	/*
8339 	 * Space statically allocated for BSP, ensure pointer is set
8340 	 */
8341 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8342 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8343 
8344 	ASSERT(cpuid_checkpass(cp, pass - 1));
8345 
8346 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8347 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8348 			cpuid_pass_defs[i].cpd_func(cp, arg);
8349 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8350 			return;
8351 		}
8352 	}
8353 
8354 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8355 	    pass, cp->cpu_id);
8356 }
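/*
 * Illustrative invocations only; the real call sites are in the boot and CPU
 * startup paths, and the arguments shown (in particular 'featureset') are
 * assumptions about what a caller would pass.  Passes run in order on a
 * given CPU, and the ASSERT above enforces that the preceding pass has
 * already completed:
 *
 *	cpuid_execpass(cp, CPUID_PASS_IDENT, NULL);
 *	cpuid_execpass(cp, CPUID_PASS_BASIC, featureset);
 */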
8357 
8358 /*
8359  * Extract the processor family from a chiprev.  Processor families are not the
8360  * same as cpuid families; see comments above and in x86_archext.h.
8361  */
8362 x86_processor_family_t
8363 chiprev_family(const x86_chiprev_t cr)
8364 {
8365 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8366 }
8367 
8368 /*
8369  * A chiprev matches its template if the vendor and family are identical and the
8370  * revision of the chiprev matches one of the bits set in the template.  Callers
8371  * may bitwise-OR together chiprevs of the same vendor and family to form the
8372  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8373  * multiple vendors or processor families with a single call.  Note that this
8374  * function operates on processor families, not cpuid families.
8375  */
8376 boolean_t
8377 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8378 {
8379 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8380 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8381 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8382 }
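/*
 * Example usage with hypothetical revision constants (see x86_archext.h for
 * the real X86_CHIPREV_* definitions); the OR'd revisions must be of the
 * same vendor and processor family:
 *
 *	if (chiprev_matches(cpuid_getchiprev(CPU),
 *	    X86_CHIPREV_EXAMPLE_REV_A0 | X86_CHIPREV_EXAMPLE_REV_B0)) {
 *		...
 *	}
 */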
8383 
8384 /*
8385  * A chiprev is at least min if the vendor and family are identical and the
8386  * revision of the chiprev is at least as recent as that of min.  Processor
8387  * families are considered unordered and cannot be compared using this function.
8388  * Note that this function operates on processor families, not cpuid families.
8389  * Use of the _ANY chiprev variant with this function is not useful; it will
8390  * always return B_FALSE if the _ANY variant is supplied as the minimum
8391  * revision.  To determine only whether a chiprev is of a given processor
8392  * family, test the return value of chiprev_family() instead.
8393  */
8394 boolean_t
8395 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8396 {
8397 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8398 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8399 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8400 }
8401 
8402 /*
8403  * The uarch functions operate in a manner similar to the chiprev functions
8404  * above.  While it is tempting to allow these to operate on microarchitectures
8405  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8406  * than ZEN2), we elect not to do so because a manufacturer may supply
8407  * processors of multiple different microarchitecture families, each of which may
8408  * be internally ordered but unordered with respect to those of other families.
8409  */
8410 x86_uarch_t
8411 uarchrev_uarch(const x86_uarchrev_t ur)
8412 {
8413 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8414 }
8415 
8416 boolean_t
8417 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8418 {
8419 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8420 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8421 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8422 }
8423 
8424 boolean_t
8425 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8426 {
8427 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8428 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8429 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8430 }
8431 
8432 /*
8433  * Topology cache related information. This is yet another cache interface that
8434  * we're exposing, intended to be used when we have either Intel Leaf 4 or
8435  * AMD Leaf 8x1D (introduced with Zen 1).
8436  */
8437 static boolean_t
8438 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8439 {
8440 	switch (cpi->cpi_vendor) {
8441 	case X86_VENDOR_Intel:
8442 		if (cpi->cpi_maxeax >= 4) {
8443 			return (B_TRUE);
8444 		}
8445 		break;
8446 	case X86_VENDOR_AMD:
8447 	case X86_VENDOR_HYGON:
8448 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8449 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8450 			return (B_TRUE);
8451 		}
8452 		break;
8453 	default:
8454 		break;
8455 	}
8456 
8457 	return (B_FALSE);
8458 }
8459 
8460 int
8461 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8462 {
8463 	const struct cpuid_info *cpi;
8464 
8465 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8466 	cpi = cpu->cpu_m.mcpu_cpi;
8467 
8468 	if (!cpuid_cache_topo_sup(cpi)) {
8469 		return (ENOTSUP);
8470 	}
8471 
8472 	*ncache = cpi->cpi_cache_leaf_size;
8473 	return (0);
8474 }
8475 
8476 int
8477 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8478 {
8479 	const struct cpuid_info *cpi;
8480 	const struct cpuid_regs *cp;
8481 
8482 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8483 	cpi = cpu->cpu_m.mcpu_cpi;
8484 
8485 	if (!cpuid_cache_topo_sup(cpi)) {
8486 		return (ENOTSUP);
8487 	}
8488 
8489 	if (cno >= cpi->cpi_cache_leaf_size) {
8490 		return (EINVAL);
8491 	}
8492 
8493 	bzero(cache, sizeof (x86_cache_t));
8494 	cp = cpi->cpi_cache_leaves[cno];
8495 	switch (CPI_CACHE_TYPE(cp)) {
8496 	case CPI_CACHE_TYPE_DATA:
8497 		cache->xc_type = X86_CACHE_TYPE_DATA;
8498 		break;
8499 	case CPI_CACHE_TYPE_INSTR:
8500 		cache->xc_type = X86_CACHE_TYPE_INST;
8501 		break;
8502 	case CPI_CACHE_TYPE_UNIFIED:
8503 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8504 		break;
8505 	case CPI_CACHE_TYPE_DONE:
8506 	default:
8507 		return (EINVAL);
8508 	}
8509 	cache->xc_level = CPI_CACHE_LVL(cp);
8510 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8511 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8512 	}
8513 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8514 	/*
8515 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8516 	 * associative, whereas it is considered valid on Intel.
8517 	 */
8518 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8519 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8520 		cache->xc_nsets = 1;
8521 	} else {
8522 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8523 	}
8524 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8525 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8526 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8527 	    cache->xc_line_size;
8528 	/*
8529 	 * We're looking for the number of bits to cover the number of CPUs that
8530 	 * are being shared. Normally this would be the value - 1, but the CPUID
8531 	 * value is encoded as the actual value minus one, so we don't modify
8532 	 * this at all.
8533 	 */
8534 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8535 
8536 	/*
8537 	 * To construct a unique ID we construct a uint64_t that looks as
8538 	 * follows:
8539 	 *
8540 	 * [47:40] cache level
8541 	 * [39:32] CPUID cache type
8542 	 * [31:00] shifted APIC ID
8543 	 *
8544 	 * The shifted APIC ID gives us a guarantee that a given cache entry is
8545 	 * unique within its peers. The other two numbers give us something that
8546 	 * ensures that something is unique within the CPU. If we just had the
8547 	 * APIC ID shifted over by the indicated number of bits we'd end up with
8548 	 * an ID of zero for the L1I, L1D, L2, and L3.
8549 	 *
8550 	 * The format of this ID is private to the system and can change across
8551 	 * a reboot for the time being.
8552 	 */
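	/*
	 * For instance, with made-up numbers: a level 2 cache on a strand
	 * whose APIC ID is 0x1a, with xc_apic_shift == 3, gets the ID
	 * (2ULL << 40) | ((uint64_t)xc_type << 32) | (0x1a >> 3), and every
	 * strand sharing that cache computes the same value.
	 */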
8553 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8554 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8555 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8556 
8557 	return (0);
8558 }
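/*
 * Hypothetical consumer of the two interfaces above; 'cp' is an assumed
 * cpu_t pointer and error handling is abbreviated:
 *
 *	uint32_t i, ncache;
 *	x86_cache_t xc;
 *
 *	if (cpuid_getncaches(cp, &ncache) == 0) {
 *		for (i = 0; i < ncache; i++) {
 *			if (cpuid_getcache(cp, i, &xc) != 0)
 *				continue;
 *			cmn_err(CE_CONT, "?L%u cache: %llu bytes\n",
 *			    xc.xc_level, (u_longlong_t)xc.xc_size);
 *		}
 *	}
 */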
8559