xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 75f8aca0475922393e44856adeaa1425ab244aa9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2025 Oxide Computer Company
28  * Copyright 2024 MNX Cloud, Inc.
29  * Copyright 2025 Edgecast Cloud LLC.
30  */
31 /*
32  * Copyright (c) 2010, Intel Corporation.
33  * All rights reserved.
34  */
35 /*
36  * Portions Copyright 2009 Advanced Micro Devices, Inc.
37  */
38 
39 /*
40  * CPU Identification logic
41  *
42  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
43  * with the identification of CPUs, their features, and their topologies. More
44  * specifically, this file helps drive the following:
45  *
46  * 1. Enumeration of features of the processor which are used by the kernel to
47  *    determine what features to enable or disable. These may be instruction set
48  *    enhancements or features that we use.
49  *
50  * 2. Enumeration of instruction set architecture (ISA) additions that userland
51  *    will be told about through the auxiliary vector.
52  *
53  * 3. Understanding the physical topology of the CPU such as the number of
54  *    caches, how many cores it has, whether or not it supports simultaneous
55  *    multi-threading (SMT), etc.
56  *
57  * ------------------------
58  * CPUID History and Basics
59  * ------------------------
60  *
61  * The cpuid instruction was added by Intel roughly around the time that the
62  * original Pentium was introduced. The purpose of cpuid was to tell in a
63  * programmatic fashion information about the CPU that previously was guessed
64  * at. For example, an important part of cpuid is that we can know what
65  * extensions to the ISA exist. If you use an invalid opcode you would get a
66  * #UD, so this method allows a program (whether a user program or the kernel)
67  * to determine what exists without crashing or getting a SIGILL. Of course,
68  * this was also during the era of the clones and the AMD Am5x86. The vendor
69  * name shows up first in cpuid for a reason.
70  *
71  * cpuid information is broken down into units called 'leaves'. Each leaf puts
72  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
73  * its own meaning. The different leaves are broken down into different regions:
74  *
75  *	[ 0, 7fffffff ]			This region is called the 'basic'
76  *					region. This region is generally defined
77  *					by Intel, though some of the original
78  *					portions have different meanings based
79  *					on the manufacturer. These days, Intel
80  *					adds most new features to this region.
81  *					AMD adds non-Intel compatible
82  *					information in the third, extended
83  *					region. Intel uses this for everything
84  *					including ISA extensions, CPU
85  *					features, cache information, topology,
86  *					and more.
87  *
88  *					There is a hole carved out of this
89  *					region which is reserved for
90  *					hypervisors.
91  *
92  *	[ 40000000, 4fffffff ]		This region, which is found in the
93  *					middle of the previous region, is
94  *					explicitly promised to never be used by
95  *					CPUs. Instead, it is used by hypervisors
96  *					to communicate information about
97  *					themselves to the operating system. The
98  *					values and details are unique for each
99  *					hypervisor.
100  *
101  *	[ 80000000, ffffffff ]		This region is called the 'extended'
102  *					region. Some of the low leaves mirror
103  *					parts of the basic leaves. This region
104  *					has generally been used by AMD for
105  *					various extensions. For example, AMD-
106  *					specific information about caches,
107  *					features, and topology are found in this
108  *					region.
109  *
110  * To query a leaf, you place its number into %eax, zero %ebx, %ecx,
111  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
112  * the ranges, one of the primary things returned is the maximum valid leaf in
113  * that range. This allows for discovery of what range of CPUID is valid.
114  *
115  * The CPUs have potentially surprising behavior when using an invalid leaf or
116  * unimplemented leaf. If the requested leaf is within the valid basic or
117  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
118  * set to zero. However, if you specify a leaf that is outside of a valid range,
119  * then instead it will be filled with the last valid _basic_ leaf. For example,
120  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
121  * an invalid extended leaf will return the information for leaf 3.
122  *
123  * Some leaves are broken down into sub-leaves. This means that the value
124  * depends on both the leaf asked for in %eax and a secondary register. For
125  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
126  * additional information. Or when getting topology information in leaf 0xb, the
127  * initial value in %ecx changes which level of the topology that you are
128  * getting information about.
129  *
130  * cpuid values are always kept to 32 bits regardless of whether or not the
131  * program is in 64-bit mode. When executing in 64-bit mode, the upper
132  * 32 bits of the register are always set to zero so that way the values are the
133  * same regardless of execution mode.
134  *
135  * ----------------------
136  * Identifying Processors
137  * ----------------------
138  *
139  * We can identify a processor in two steps. The first step looks at cpuid leaf
140  * 0. Leaf 0 contains the processor's vendor information. This is done by
141  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
142  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
143  *
144  * From there, a processor is identified by a combination of three different
145  * values:
146  *
147  *  1. Family
148  *  2. Model
149  *  3. Stepping
150  *
151  * Each vendor uses the family and model to uniquely identify a processor. The
152  * way that family and model are changed depends on the vendor. For example,
153  * Intel has been using family 0x6 for almost all of their processors since the
154  * Pentium Pro/Pentium II era, often called the P6. The model is used to
155  * identify the exact processor. Different models are often used for the client
156  * (consumer) and server parts. Even though each processor often has major
157  * architectural differences, they still are considered the same family by
158  * Intel.
159  *
160  * On the other hand, each major AMD architecture generally has its own family.
161  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
162  * the model number is used to help identify specific processors.  As AMD's
163  * product lines have expanded, they have started putting a mixed bag of
164  * processors into the same family, with each processor under a single
165  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
166  * refer to each such collection as a processor family, distinct from cpuid
167  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
168  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
169  * defines the processor family's non-architectural features.  In general, we'll
170  * use "family" here to mean the family number reported by the cpuid instruction
171  * and distinguish the processor family from it where appropriate.
172  *
173  * The stepping is used to refer to a revision of a specific microprocessor. The
174  * term comes from equipment used to produce masks that are used to create
175  * integrated circuits.
176  *
177  * The information is present in leaf 1, %eax. In technical documentation you
178  * will see the terms extended model and extended family. The original family,
179  * model, and stepping fields were each 4 bits wide. If the values in either
180  * are 0xf, then one is to consult the extended model and extended family, which
181  * take previously reserved bits and allow for a larger number of models and add
182  * 0xf to them.
183  *
184  * When we process this information, we store the full family, model, and
185  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
186  * cpi_step, respectively. Whenever you are performing comparisons with the
187  * family, model, and stepping, you should use these members and not the raw
188  * values from cpuid. If you must use the raw values from cpuid directly, you
189  * must make sure that you add the extended model and family to the base model
190  * and family.
191  *
192  * In general, we do not use information about the family, model, and stepping
193  * to determine whether or not a feature is present; that is generally driven by
194  * specific leaves. However, when something we care about on the processor is
195  * not considered 'architectural' meaning that it is specific to a set of
196  * processors and not promised in the architecture model to be consistent from
197  * generation to generation, then we will fall back on this information. The
198  * most common cases where this comes up are when we have to work around errata
199  * in the processor, are dealing with processor-specific features such as CPU
200  * performance counters, or we want to provide additional information for things
201  * such as fault management.
202  *
203  * While processors also do have a brand string, which is the name that people
204  * are familiar with when buying the processor, it is not meant for
205  * programmatic consumption. That is what the family, model, and stepping are
206  * for.
207  *
208  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
209  * and stepping(s) that refer to a single or very closely related set of silicon
210  * implementations; while there are sometimes more specific ways to learn of the
211  * presence or absence of a particular erratum or workaround, one may generally
212  * assume that all processors of the same chiprev have the same errata and we
213  * have chosen to represent them this way precisely because that is how AMD
214  * groups them in their revision guides (errata documentation).  The processor
215  * family (x86_processor_family_t) may be extracted from the chiprev if that
216  * level of detail is not needed.  Processor families are considered unordered
217  * but revisions within a family may be compared for either an exact match or at
218  * least as recent as a reference revision.  See the chiprev_xxx() functions
219  * below.
220  *
221  * Similarly, each processor family implements a particular microarchitecture,
222  * which itself may have multiple revisions.  In general, non-architectural
223  * features are specific to a processor family, but some may exist across
224  * families containing cores that implement the same microarchitectural revision
225  * (and, such cores share common bugs, too).  We provide utility routines
226  * analogous to those for extracting and comparing chiprevs for
227  * microarchitectures as well; see the uarch_xxx() functions.
228  *
229  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
230  * present used and available only for AMD and AMD-like processors.
231  *
232  * ------------
233  * CPUID Passes
234  * ------------
235  *
236  * As part of performing feature detection, we break this into several different
237  * passes. There used to be a pass 0 that was done from assembly in locore.s to
238  * support processors that have a missing or broken cpuid instruction (notably
239  * certain Cyrix processors) but those were all 32-bit processors which are no
240  * longer supported. Passes are no longer numbered explicitly to make it easier
241  * to break them up or move them around as needed; however, they still have a
242  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
243  * x86_archext.h. The external interface to execute a cpuid pass or determine
244  * whether a pass has been completed consists of cpuid_execpass() and
245  * cpuid_checkpass() respectively.  The passes now, in that execution order,
246  * are as follows:
247  *
248  *	PRELUDE		This pass does not have any dependencies on system
249  *			setup; in particular, unlike all subsequent passes it is
250  *			guaranteed not to require PCI config space access.  It
251  *			sets the flag indicating that the processor we are
252  *			running on supports the cpuid instruction, which all
253  *			64-bit processors do.  This would also be the place to
254  *			add any other basic state that is required later on and
255  *			can be learned without dependencies.
256  *
257  *	IDENT		Determine which vendor manufactured the CPU, the family,
258  *			model, and stepping information, and compute basic
259  *			identifying tags from those values.  This is done first
260  *			so that machine-dependent code can control the features
261  *			the cpuid instruction will report during subsequent
262  *			passes if needed, and so that any intervening
263  *			machine-dependent code that needs basic identity will
264  *			have it available.  This includes synthesised
265  *			identifiers such as chiprev and uarchrev as well as the
266  *			values obtained directly from cpuid.  Prior to executing
267  *			this pass, machine-dependent boot code is responsible
268  *			for ensuring that the PCI configuration space access
269  *			functions have been set up and, if necessary, that
270  *			determine_platform() has been called.
271  *
272  *	BASIC		This is the primary pass and is responsible for doing a
273  *			large number of different things:
274  *
275  *			1. Gathering a large number of feature flags to
276  *			determine which features the CPU supports and which
277  *			indicate things that we need to do other work in the OS
278  *			to enable. Features detected this way are added to the
279  *			x86_featureset which can be queried to
280  *			determine what we should do. This includes processing
281  *			all of the basic and extended CPU features that we care
282  *			about.
283  *
284  *			2. Determining the CPU's topology. This includes
285  *			information about how many cores and threads are present
286  *			in the package. It also is responsible for figuring out
287  *			which logical CPUs are potentially part of the same core
288  *			and what other resources they might share. For more
289  *			information see the 'Topology' section.
290  *
291  *			3. Determining the set of CPU security-specific features
292  *			that we need to worry about and determine the
293  *			appropriate set of workarounds.
294  *
295  *			This pass on the boot CPU occurs before KMDB is started.
296  *
297  *	EXTENDED	This pass is done after startup(). Here, we check
298  *			other miscellaneous features. Most of this is gathering
299  *			additional basic and extended features that we'll use in
300  *			later passes or for debugging support.
301  *
302  *	DYNAMIC		This pass occurs after the kernel memory allocator
303  *			has been fully initialized. This gathers information
304  *			where we might need dynamic memory available for our
305  *			uses. This includes several varying width leaves that
306  *			have cache information and the processor's brand string.
307  *
308  *	RESOLVE		The final normal pass is performed after the
309  *			kernel has brought most everything online. This is
310  *			invoked from post_startup(). In this pass, we go through
311  *			the set of features that we have enabled and turn that
312  *			into the hardware auxiliary vector features that
313  *			userland receives. This is used by userland, primarily
314  *			by the run-time link-editor (RTLD), though userland
315  *			software could also refer to it directly.
316  *
317  * The function that performs a pass is currently assumed to be infallible, and
318  * all existing implementations are.  This simplifies callers by allowing
319  * cpuid_execpass() to return void. Similarly, implementers do not need to check
320  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
321  * Both of these assumptions can be relaxed if needed by future developments.
322  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
323  * error to attempt to execute a pass before all previous passes have been
324  * completed on the specified CPU, or to request cpuid information before the
325  * pass that captures it has been executed.  These conditions can be tested
326  * using cpuid_checkpass().
327  *
328  * ---------
329  * Microcode
330  * ---------
331  *
332  * Microcode updates may be applied by the firmware (BIOS/UEFI) and/or by the
333  * operating system and may result in architecturally visible changes (e.g.,
334  * changed MSR or CPUID bits). As such, we want to apply any updates as early
335  * as possible during the boot process -- right after the IDENT pass.
336  *
337  * Microcode may also be updated at runtime via ucodeadm(8), after which we do
338  * a selective rescan of the cpuid leaves to determine what features have
339  * changed. Microcode updates can provide more details about security related
340  * features to deal with issues like Spectre and L1TF. On occasion, vendors have
341  * violated their contract and removed bits. However, we don't try to detect
342  * that because that puts us in a situation that we really can't deal with. As
343  * such, the only thing we rescan are security related features today. See
344  * cpuid_pass_ucode(). This is not a pass in the same sense as the others and
345  * is run on demand, via cpuid_post_ucodeadm().
346  *
347  *
348  * All of the passes are run on all CPUs. However, for the most part we only
349  * care about what the boot CPU says about this information and use the other
350  * CPUs as a rough guide to sanity check that we have the same feature set.
351  *
352  * We do not support running multiple logical CPUs with different, let alone
353  * disjoint, feature sets.
354  *
355  * ------------------
356  * Processor Topology
357  * ------------------
358  *
359  * One of the important things that we need to do is to understand the topology
360  * of the underlying processor. When we say topology in this case, we're trying
361  * to understand the relationship between the logical CPUs that the operating
362  * system sees and the underlying physical layout. Different logical CPUs may
363  * share different resources which can have important consequences for the
364  * performance of the system. For example, they may share caches, execution
365  * units, and more.
366  *
367  * The topology of the processor changes from generation to generation and
368  * vendor to vendor.  Along with that, different vendors use different
369  * terminology, and the operating system itself uses occasionally overlapping
370  * terminology. It's important to understand what this topology looks like so
371  * one can understand the different things that we try to calculate and
372  * determine.
373  *
374  * To get started, let's talk about a little bit of terminology that we've used
375  * so far, is used throughout this file, and is fairly generic across multiple
376  * vendors:
377  *
378  * CPU
379  *	A central processing unit (CPU) refers to a logical and/or virtual
380  *	entity that the operating system can execute instructions on. The
381  *	underlying resources for this CPU may be shared between multiple
382  *	entities; however, to the operating system it is a discrete unit.
383  *
384  * PROCESSOR and PACKAGE
385  *
386  *	Generally, when we use the term 'processor' on its own, we are referring
387  *	to the physical entity that one buys and plugs into a board. However,
388  *	because processor has been overloaded and one might see it used to mean
389  *	multiple different levels, we will instead use the term 'package' for
390  *	the rest of this file. The term package comes from the electrical
391  *	engineering side and refers to the physical entity that encloses the
392  *	electronics inside. Strictly speaking the package can contain more than
393  *	just the CPU, for example, on many processors it may also have what's
394  *	called an 'integrated graphics processing unit (GPU)'. Because the
395  *	package can encapsulate multiple units, it is the largest physical unit
396  *	that we refer to.
397  *
398  * SOCKET
399  *
400  *	A socket refers to a unit on a system board (generally the motherboard)
401  *	that can receive a package. A single package, or processor, is plugged
402  *	into a single socket. A system may have multiple sockets. Often times,
403  *	the term socket is used interchangeably with package and refers to the
404  *	electrical component that is plugged in, and not the receptacle itself.
405  *
406  * CORE
407  *
408  *	A core refers to the physical instantiation of a CPU, generally, with a
409  *	full set of hardware resources available to it. A package may contain
410  *	multiple cores inside of it or it may just have a single one. A
411  *	processor with more than one core is often referred to as 'multi-core'.
412  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
413  *	that has 'multi-core' processors.
414  *
415  *	A core may expose a single logical CPU to the operating system, or it
416  *	may expose multiple CPUs, which we call threads, defined below.
417  *
418  *	Some resources may still be shared by cores in the same package. For
419  *	example, many processors will share the level 3 cache between cores.
420  *	Some AMD generations share hardware resources between cores. For more
421  *	information on that see the section 'AMD Topology'.
422  *
423  * THREAD and STRAND
424  *
425  *	In this file, generally a thread refers to a hardware resource and not
426  *	the operating system's logical abstraction. A thread is always exposed
427  *	as an independent logical CPU to the operating system. A thread belongs
428  *	to a specific core. A core may have more than one thread. When that is
429  *	the case, the threads that are part of the same core are often referred
430  *	to as 'siblings'.
431  *
432  *	When multiple threads exist, this is generally referred to as
433  *	simultaneous multi-threading (SMT). When Intel introduced this in their
434  *	processors they called it hyper-threading (HT). When multiple threads
435  *	are active in a core, they split the resources of the core. For example,
436  *	two threads may share the same set of hardware execution units.
437  *
438  *	The operating system often uses the term 'strand' to refer to a thread.
439  *	This helps disambiguate it from the software concept.
440  *
441  * CHIP
442  *
443  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
444  *	base meaning, it is used to refer to a single integrated circuit, which
445  *	may or may not be the only thing in the package. In illumos, when you
446  *	see the term 'chip' it is almost always referring to the same thing as
447  *	the 'package'. However, many vendors may use chip to refer to one of
448  *	many integrated circuits that have been placed in the package. As an
449  *	example, see the subsequent definition.
450  *
451  *	To try and keep things consistent, we will only use chip when referring
452  *	to the entire integrated circuit package, with the exception of the
453  *	definition of multi-chip module (because it is in the name) and use the
454  *	term 'die' when we want the more general, potential sub-component
455  *	definition.
456  *
457  * DIE
458  *
459  *	A die refers to an integrated circuit. Inside of the package there may
460  *	be a single die or multiple dies. This is sometimes called a 'chip' in
461  *	vendor's parlance, but in this file, we use the term die to refer to a
462  *	subcomponent.
463  *
464  * MULTI-CHIP MODULE
465  *
466  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
467  *	are connected together in the same package. When a multi-chip design is
468  *	used, generally each chip is manufactured independently and then joined
469  *	together in the package. For example, on AMD's Zen microarchitecture
470  *	(family 0x17), the package contains several dies (the second meaning of
471  *	chip from above) that are connected together.
472  *
473  * CACHE
474  *
475  *	A cache is a part of the processor that maintains copies of recently
476  *	accessed memory. Caches are split into levels and then into types.
477  *	Commonly there are one to three levels, called level one, two, and
478  *	three. The lower the level, the smaller it is, the closer it is to the
479  *	execution units of the CPU, and the faster it is to access. The layout
480  *	and design of the cache come in many different flavors, consult other
481  *	resources for a discussion of those.
482  *
483  *	Caches are generally split into two types, the instruction and data
484  *	cache. The caches contain what their names suggest, the instruction
485  *	cache has executable program text, while the data cache has all other
486  *	memory that the processor accesses. As of this writing, data is kept
487  *	coherent between all of the caches on x86, so if one modifies program
488  *	text before it is executed, that will be in the data cache, and the
489  *	instruction cache will be synchronized with that change when the
490  *	processor actually executes those instructions. This coherency also
491  *	covers the fact that data could show up in multiple caches.
492  *
493  *	Generally, the lowest level caches are specific to a core. However, the
494  *	last level cache is shared between some number of cores. The number of
495  *	CPUs sharing this last level cache is important. This has implications
496  *	for the choices that the scheduler makes, as accessing memory that might
497  *	be in a remote cache after thread migration can be quite expensive.
498  *
499  *	Sometimes, the word cache is abbreviated with a '$', because in US
500  *	English the word cache is pronounced the same as cash. So L1D$ refers to
501  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
502  *	in the rest of this theory statement for clarity.
503  *
504  * MEMORY CONTROLLER
505  *
506  *	The memory controller is a component that provides access to DRAM. Each
507  *	memory controller can access a set number of DRAM channels. Each channel
508  *	can have a number of DIMMs (sticks of memory) associated with it. A
509  *	given package may have more than one memory controller. The association
510  *	of the memory controller to a group of cores is important as it is
511  *	cheaper to access memory on the controller that you are associated with.
512  *
513  * NUMA
514  *
515  *	NUMA or non-uniform memory access, describes a way that systems are
516  *	built. On x86, any processor core can address all of the memory in the
517  *	system. However, when using multiple sockets or possibly within a
518  *	multi-chip module, some of that memory is physically closer and some of
519  *	it is further. Memory that is further away is more expensive to access.
520  *	Consider the following image of multiple sockets with memory:
521  *
522  *	+--------+                                                +--------+
523  *	| DIMM A |         +----------+      +----------+         | DIMM D |
524  *	+--------+-+       |          |      |          |       +-+------+-+
525  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
526  *	  +--------+-+     |          |      |          |     +-+------+-+
527  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
528  *	    +--------+                                        +--------+
529  *
530  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
531  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
532  *	access DIMMs A-C and more expensive to access D-F as it has to go
533  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
534  *	D-F are cheaper than A-C. While the socket form is the most common, when
535  *	using multi-chip modules, this can also sometimes occur. For another
536  *	example of this that's more involved, see the AMD topology section.
537  *
538  *
539  * Intel Topology
540  * --------------
541  *
542  * Most Intel processors since Nehalem (as of this writing the current gen
543  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
544  * the package is a single monolithic die. MCMs currently aren't used. Most
545  * parts have three levels of caches, with the L3 cache being shared between
546  * all of the cores on the package. The L1/L2 cache is generally specific to
547  * an individual core. The following image shows at a simplified level what
548  * this looks like. The memory controller is commonly part of something called
549  * the 'Uncore', that used to be separate physical chips that were not a part of
550  * the package, but are now part of the same chip.
551  *
552  *  +-----------------------------------------------------------------------+
553  *  | Package                                                               |
554  *  |  +-------------------+  +-------------------+  +-------------------+  |
555  *  |  | Core              |  | Core              |  | Core              |  |
556  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
557  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
558  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
559  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
560  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
561  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
562  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
563  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
564  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
565  *  |  +-------------------+  +-------------------+  +-------------------+  |
566  *  | +-------------------------------------------------------------------+ |
567  *  | |                         Shared L3 Cache                           | |
568  *  | +-------------------------------------------------------------------+ |
569  *  | +-------------------------------------------------------------------+ |
570  *  | |                        Memory Controller                          | |
571  *  | +-------------------------------------------------------------------+ |
572  *  +-----------------------------------------------------------------------+
573  *
574  * A side effect of this current architecture is that what we care about from a
575  * scheduling and topology perspective is simplified. In general, we care about
576  * understanding which logical CPUs are part of the same core and socket.
577  *
578  * To determine the relationship between threads and cores, Intel initially used
579  * the identifier in the advanced programmable interrupt controller (APIC). They
580  * also added cpuid leaf 4 to give additional information about the number of
581  * threads and CPUs in the processor. With the addition of x2apic (which
582  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
583  * additional cpuid topology leaf 0xB was added.
584  *
585  * AMD Topology
586  * ------------
587  *
588  * When discussing AMD topology, we want to break this into three distinct
589  * generations of topology. There's the basic topology that has been used in
590  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
591  * with family 0x15 (Bulldozer), and there's the topology that was introduced
592  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
593  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
594  * additional terminology that's worth talking about.
595  *
596  * Until the introduction of family 0x17 (Zen), AMD did not implement something
597  * that they considered SMT. Whether or not the AMD processors have SMT
598  * influences many things including scheduling and reliability, availability,
599  * and serviceability (RAS) features.
600  *
601  * NODE
602  *
603  *	AMD uses the term node to refer to a die that contains a number of cores
604  *	and I/O resources. Depending on the processor family and model, more
605  *	than one node can be present in the package. When there is more than one
606  *	node this indicates a multi-chip module. Usually each node has its own
607  *	access to memory and I/O devices. This is important and generally
608  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
609  *	result, we track this relationship in the operating system.
610  *
611  *	In processors with an L3 cache, the L3 cache is generally shared across
612  *	the entire node, though the way this is carved up varies from generation
613  *	to generation.
614  *
615  * BULLDOZER
616  *
617  *	Starting with the Bulldozer family (0x15) and continuing until the
618  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
619  *	compute unit. In a compute unit, two traditional cores share a number of
620  *	hardware resources. Critically, they share the FPU, L1 instruction
621  *	cache, and the L2 cache. Several compute units were then combined inside
622  *	of a single node.  Because the integer execution units, L1 data cache,
623  *	and some other resources were not shared between the cores, AMD never
624  *	considered this to be SMT.
625  *
626  * ZEN
627  *
628  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
629  *	is called Zeppelin. These modules are similar to the idea of nodes used
630  *	previously. Each of these nodes has two DRAM channels which all of the
631  *	cores in the node can access uniformly. These nodes are linked together
632  *	in the package, creating a NUMA environment.
633  *
634  *	The Zeppelin die itself contains two different 'core complexes'. Each
635  *	core complex consists of four cores which each have two threads, for a
636  *	total of 8 logical CPUs per complex. Unlike other generations,
637  *	where all the logical CPUs in a given node share the L3 cache, here each
638  *	core complex has its own shared L3 cache.
639  *
640  *	A further thing that we need to consider is that in some configurations,
641  *	particularly with the Threadripper line of processors, not every die
642  *	actually has its memory controllers wired up to actual memory channels.
643  *	This means that some cores have memory attached to them and others
644  *	don't.
645  *
646  *	To put Zen in perspective, consider the following images:
647  *
648  *      +--------------------------------------------------------+
649  *      | Core Complex                                           |
650  *      | +-------------------+    +-------------------+  +---+  |
651  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
652  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
653  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
654  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
655  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
656  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
657  *      | +-------------------+    +-------------------+  | C |  |
658  *      | +-------------------+    +-------------------+  | a |  |
659  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
660  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
661  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
662  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
663  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
664  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
665  *      | +-------------------+    +-------------------+  +---+  |
666  *      |                                                        |
667  *	+--------------------------------------------------------+
668  *
669  *  This first image represents a single Zen core complex that consists of four
670  *  cores.
671  *
672  *
673  *	+--------------------------------------------------------+
674  *	| Zeppelin Die                                           |
675  *	|  +--------------------------------------------------+  |
676  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
677  *	|  +--------------------------------------------------+  |
678  *      |                           HH                           |
679  *	|          +-----------+    HH    +-----------+          |
680  *	|          |           |    HH    |           |          |
681  *	|          |    Core   |==========|    Core   |          |
682  *	|          |  Complex  |==========|  Complex  |          |
683  *	|          |           |    HH    |           |          |
684  *	|          +-----------+    HH    +-----------+          |
685  *      |                           HH                           |
686  *	|  +--------------------------------------------------+  |
687  *	|  |                Memory Controller                 |  |
688  *	|  +--------------------------------------------------+  |
689  *      |                                                        |
690  *	+--------------------------------------------------------+
691  *
692  *  This image represents a single Zeppelin Die. Note how both core complexes
693  *  are connected to the same memory controller and I/O units. While each core
694  *  complex has its own L3 cache as seen in the first image, they both have
695  *  uniform access to memory.
696  *
697  *
698  *                      PP                     PP
699  *                      PP                     PP
700  *           +----------PP---------------------PP---------+
701  *           |          PP                     PP         |
702  *           |    +-----------+          +-----------+    |
703  *           |    |           |          |           |    |
704  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
705  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
706  *           |    |           |          |           |    |
707  *           |    +-----------+ooo    ...+-----------+    |
708  *           |          HH      ooo  ...       HH         |
709  *           |          HH        oo..         HH         |
710  *           |          HH        ..oo         HH         |
711  *           |          HH      ...  ooo       HH         |
712  *           |    +-----------+...    ooo+-----------+    |
713  *           |    |           |          |           |    |
714  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
715  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
716  *           |    |           |          |           |    |
717  *           |    +-----------+          +-----------+    |
718  *           |          PP                     PP         |
719  *           +----------PP---------------------PP---------+
720  *                      PP                     PP
721  *                      PP                     PP
722  *
723  *  This image represents a single Zen package. In this example, it has four
724  *  Zeppelin dies, though some configurations only have a single one. In this
725  *  example, each die is directly connected to the next. Also, each die is
726  *  represented as being connected to memory by the 'M' character and connected
727  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
728  *  die is made up of two core complexes, we have multiple different NUMA
729  *  domains that we care about for these systems.
730  *
731  * ZEN 2
732  *
733  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
734  *	each Zeppelin Die had its own I/O units, those have been moved out of
735  *	the core complex die in Zen 2. The actual core complex looks pretty
736  *	similar, but now the die actually looks much simpler:
737  *
738  *      +--------------------------------------------------------+
739  *      | Zen 2 Core Complex Die    HH                           |
740  *      |                           HH                           |
741  *      |          +-----------+    HH    +-----------+          |
742  *      |          |           |    HH    |           |          |
743  *      |          |    Core   |==========|    Core   |          |
744  *      |          |  Complex  |==========|  Complex  |          |
745  *      |          |           |    HH    |           |          |
746  *      |          +-----------+    HH    +-----------+          |
747  *      |                           HH                           |
748  *      |                           HH                           |
749  *      +--------------------------------------------------------+
750  *
751  *	From here, when we add the central I/O die, this changes things a bit.
752  *	Each die is connected to the I/O die, rather than trying to interconnect
753  *	them directly. The following image takes the same Zen 1 image that we
754  *	had earlier and shows what it looks like with the I/O die instead:
755  *
756  *                                 PP    PP
757  *                                 PP    PP
758  *           +---------------------PP----PP---------------------+
759  *           |                     PP    PP                     |
760  *           |  +-----------+      PP    PP      +-----------+  |
761  *           |  |           |      PP    PP      |           |  |
762  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
763  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
764  *           |  |         |o|oooo|          |oooo|o|         |  |
765  *           |  +-----------+    |          |    +-----------+  |
766  *           |                   |   I/O    |                   |
767  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
768  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
769  *           |                   |          |                   |
770  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
771  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
772  *           |                   |          |                   |
773  *           |  +-----------+    |          |    +-----------+  |
774  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
775  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
776  *           |  |    Die    |      PP    PP      |    Die    |  |
777  *           |  |           |      PP    PP      |           |  |
778  *           |  +-----------+      PP    PP      +-----------+  |
779  *           |                     PP    PP                     |
780  *           +---------------------PP----PP---------------------+
781  *                                 PP    PP
782  *                                 PP    PP
783  *
784  *	The above has four core complex dies installed, though the Zen 2 EPYC
785  *	and Threadripper parts allow for up to eight, while the Ryzen parts
786  *	generally only have one to two. The more notable difference here is how
787  *	everything communicates. Note that memory and PCIe come out of the
788  *	central die. This changes the way that one die accesses a resource. It
789  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
790  *	satisfied it locally. In general, this ends up being a better strategy
791  *	for most things, though it is possible to still treat everything in four
792  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
793  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
794  *	now there is only one 'node' present.
795  *
796  * ZEN 3
797  *
798  *	From an architectural perspective, Zen 3 is a much smaller change from
799  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
800  *	its microarchitectural changes. The biggest thing for us is how the die
801  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
802  *	cache. However, in Zen 3, the L3 is now shared between the entire core
803  *	complex die and is no longer partitioned between each core complex. This
804  *	means that all cores on the die can share the same L3 cache. Otherwise,
805  *	the general layout of the overall package with various core complexes
806  *	and an I/O die stays the same. Here's what the Core Complex Die looks
807  *	like in a bit more detail:
808  *
809  *               +-------------------------------------------------+
810  *               | Zen 3 Core Complex Die                          |
811  *               | +-------------------+    +-------------------+  |
812  *               | | Core       +----+ |    | Core       +----+ |  |
813  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
814  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
815  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
816  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
817  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
818  *               | +-------------------+    +-------------------+  |
819  *               | +-------------------+    +-------------------+  |
820  *               | | Core       +----+ |    | Core       +----+ |  |
821  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
822  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
823  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
824  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
825  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
826  *               | +-------------------+    +-------------------+  |
827  *               |                                                 |
828  *               | +--------------------------------------------+  |
829  *               | |                 L3 Cache                   |  |
830  *               | +--------------------------------------------+  |
831  *               |                                                 |
832  *               | +-------------------+    +-------------------+  |
833  *               | | Core       +----+ |    | Core       +----+ |  |
834  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
835  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
836  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
837  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
838  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
839  *               | +-------------------+    +-------------------+  |
840  *               | +-------------------+    +-------------------+  |
841  *               | | Core       +----+ |    | Core       +----+ |  |
842  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
843  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
844  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
845  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
846  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
847  *               | +-------------------+    +-------------------+  |
848  *               +-------------------------------------------------+
849  *
850  *	While it is not pictured, there are connections from the die to the
851  *	broader data fabric and additional functional blocks to support that
852  *	communication and coherency.
853  *
854  * CPUID LEAVES
855  *
856  * There are a few different CPUID leaves that we can use to try and understand
857  * the actual state of the world. As part of the introduction of family 0xf, AMD
858  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
859  * processors that are in the system. Because families before Zen didn't have
860  * SMT, this was always the number of cores that were in the system. However, it
861  * should always be thought of as the number of logical threads to be consistent
862  * between generations. In addition we also get the size of the APIC ID that is
863  * used to represent the number of logical processors. This is important for
864  * deriving topology information.
865  *
866  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
867  * bit between Bulldozer and later families, but it is quite useful in
868  * determining the topology information. Because this information has changed
869  * across family generations, it's worth calling out what these mean
870  * explicitly. The registers have the following meanings:
871  *
872  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
873  *		APIC ID, even though on systems without x2apic support, it will
874  *		be limited to 8 bits.
875  *
876  *	%ebx	On Bulldozer-era systems this contains information about the
877  *		number of cores that are in a compute unit (cores that share
878  *		resources). It also contains a per-package compute unit ID that
879  *		identifies which compute unit the logical CPU is a part of.
880  *
881  *		On Zen-era systems this instead contains the number of threads
882  *		per core and the ID of the core that the logical CPU is a part
883  *		of. Note, this ID is unique only to the package, it is not
884  *		globally unique across the entire system.
885  *
886  *	%ecx	This contains the number of nodes that exist in the package. It
887  *		also contains an ID that identifies which node the logical CPU
888  *		is a part of.
889  *
890  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
891  * cache layout to determine which logical CPUs are sharing which caches.
892  *
893  * illumos Topology
894  * ----------------
895  *
896  * Based on the above we synthesize the information into several different
897  * variables that we store in the 'struct cpuid_info'. We'll go into the details
898  * of what each member is supposed to represent and their uniqueness. In
899  * general, there are two levels of uniqueness that we care about. We care about
900  * an ID that is globally unique. That means that it will be unique across all
901  * entities in the system. For example, the default logical CPU ID is globally
902  * unique. On the other hand, there is some information that we only care about
903  * being unique within the context of a single package / socket. Here are the
904  * variables that we keep track of and their meaning.
905  *
906  * Several of the values that are asking for an identifier, with the exception
907  * of cpi_apicid, are allowed to be synthetic.
908  *
909  *
910  * cpi_apicid
911  *
912  *	This is the value of the CPU's APIC id. This should be the full 32-bit
913  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
914  *	APIC ID. This value is globally unique between all logical CPUs across
915  *	all packages. This is usually required by the APIC.
916  *
917  * cpi_chipid
918  *
919  *	This value indicates the ID of the package that the logical CPU is a
920  *	part of. This value is allowed to be synthetic. It is usually derived by
921  *	taking the CPU's APIC ID and determining how many bits are used to
922  *	represent CPU cores in the package. All logical CPUs that are part of
923  *	the same package must have the same value.
924  *
925  * cpi_coreid
926  *
927  *	This represents the ID of a CPU core. Two logical CPUs should only have
928  *	the same cpi_coreid value if they are part of the same core. These
929  *	values may be synthetic. On systems that support SMT, this value is
930  *	usually derived from the APIC ID, otherwise it is often synthetic and
931  *	just set to the value of the cpu_id in the cpu_t.
932  *
933  * cpi_pkgcoreid
934  *
935  *	This is similar to the cpi_coreid in that logical CPUs that are part of
936  *	the same core should have the same ID. The main difference is that these
937  *	values are only required to be unique to a given socket.
938  *
939  * cpi_clogid
940  *
941  *	This represents the logical ID of a logical CPU. This value should be
942  *	unique within a given socket for each logical CPU. This is allowed to be
943  *	synthetic, though it is usually based off of the CPU's apic ID. The
944  *	broader system expects that logical CPUs that are part of the same core
945  *	have contiguous numbers. For example, with two threads per core, the
946  *	two IDs divided by two should be equal, the first ID modulo two should
947  *	be zero, and the second ID modulo two should be one. So IDs 4 and 5
948  *	indicate two logical CPUs that are part of the same core, but IDs 5 and
949  *	6 represent two logical CPUs that are part of different cores.
950  *
951  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
952  *	from the same source, strictly speaking, they don't have to be and the
953  *	two values should be considered logically independent. One should not
954  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
955  *	some kind of relationship. While this is tempting, we've seen cases on
956  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
957  *
958  * cpi_ncpu_per_chip
959  *
960  *	This value indicates the total number of logical CPUs that exist in the
961  *	physical package. Critically, this is not the number of logical CPUs
962  *	that exist for just the single core.
963  *
964  *	This value should be the same for all logical CPUs in the same package.
965  *
966  * cpi_ncore_per_chip
967  *
968  *	This value indicates the total number of physical CPU cores that exist
969  *	in the package. The system compares this value with cpi_ncpu_per_chip to
970  *	determine if simultaneous multi-threading (SMT) is enabled. When
971  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
972  *	the X86FSET_HTT feature is not set. If this value is greater than one,
973  *	then we consider the processor to have the feature X86FSET_CMP, to
974  *	indicate that there is support for more than one core.
975  *
976  *	This value should be the same for all logical CPUs in the same package.
977  *
978  * cpi_procnodes_per_pkg
979  *
980  *	This value indicates the number of 'nodes' that exist in the package.
981  *	When processors are actually a multi-chip module, this represents the
982  *	number of such modules that exist in the package. Currently, on Intel
983  *	based systems this member is always set to 1.
984  *
985  *	This value should be the same for all logical CPUs in the same package.
986  *
987  * cpi_procnodeid
988  *
989  *	This value indicates the ID of the node that the logical CPU is a part
990  *	of. All logical CPUs that are in the same node must have the same value
991  *	here. This value must be unique across all of the packages in the
992  *	system.  On Intel based systems, this is currently set to the value in
993  *	cpi_chipid because there is only one node.
994  *
995  * cpi_cores_per_compunit
996  *
997  *	This value indicates the number of cores that are part of a compute
998  *	unit. See the AMD topology section for this. This member only has real
999  *	meaning currently for AMD Bulldozer family processors. For all other
1000  *	processors, this should currently be set to 1.
1001  *
1002  * cpi_compunitid
1003  *
1004  *	This indicates the compute unit that the logical CPU belongs to. For
1005  *	processors without AMD Bulldozer-style compute units this should be set
1006  *	to the value of cpi_coreid.
1007  *
1008  * cpi_ncpu_shr_last_cache
1009  *
1010  *	This indicates the number of logical CPUs that are sharing the same last
1011  *	level cache. This value should be the same for all CPUs that are sharing
1012  *	that cache. The last cache refers to the cache that is closest to memory
1013  *	and furthest away from the CPU.
1014  *
1015  * cpi_last_lvl_cacheid
1016  *
1017  *	This indicates the ID of the last cache that the logical CPU uses. This
1018  *	cache is often shared between multiple logical CPUs and is the cache
1019  *	that is closest to memory and furthest away from the CPU. This value
1020  *	should be the same for a group of logical CPUs only if they actually
1021  *	share the same last level cache. IDs should not overlap between
1022  *	packages.
1023  *
1024  * cpi_ncore_bits
1025  *
1026  *	This indicates the number of bits that are required to represent all of
1027  *	the cores in the system. As cores are derived based on their APIC IDs,
1028  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1029  *	this value to be larger than the actual number of IDs that are present
1030  *	in the system. This is used to size tables by the CMI framework. It is
1031  *	only filled in for Intel and AMD CPUs.
1032  *
1033  * cpi_nthread_bits
1034  *
1035  *	This indicates the number of bits required to represent all of the IDs
1036  *	that cover the logical CPUs that exist on a given core. It's OK for this
1037  *	value to be larger than the actual number of IDs that are present in the
1038  *	system.  This is used to size tables by the CMI framework. It is
1039  *	only filled in for Intel and AMD CPUs.
1040  *
1041  * -----------
1042  * Hypervisors
1043  * -----------
1044  *
1045  * If trying to manage the differences between vendors wasn't bad enough, it can
1046  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1047  * the ability to interpose on all cpuid instructions and change them to suit
1048  * their purposes. In general, this is necessary as the hypervisor wants to be
1049  * able to present a more uniform set of features or not necessarily give the
1050  * guest operating system kernel knowledge of all features so it can be
1051  * more easily migrated between systems.
1052  *
1053  * When it comes to trying to determine topology information, this can be a
1054  * double-edged sword. When a hypervisor doesn't actually implement a cpuid
1055  * leaf, it'll often return all zeros. Because of that, you'll often see various
1056  * checks scattered about fields being non-zero before we assume we can use
1057  * them.
1058  *
1059  * When it comes to topology information, the hypervisor is often incentivized
1060  * to lie to you about topology. This is because it doesn't always actually
1061  * guarantee that topology at all. The topology path we take in the system
1062  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1063  * or AMD CPU, then we basically do our normal path. However, when they don't
1064  * use an actual vendor, then that usually turns into multiple one-core CPUs
1065  * that we enumerate that are often on different sockets. The actual behavior
1066  * depends greatly on what the hypervisor actually exposes to us.
1067  *
1068  * --------------------
1069  * Exposing Information
1070  * --------------------
1071  *
1072  * We expose CPUID information in three different forms in the system.
1073  *
1074  * The first is through the x86_featureset variable. This is used in conjunction
1075  * with the is_x86_feature() function. This is queried by x86-specific functions
1076  * to determine which features are or aren't present in the system and to make
1077  * decisions based upon them. For example, users of this include everything from
1078  * parts of the system dedicated to reliability, availability, and
1079  * serviceability (RAS), to making decisions about how to handle security
1080  * mitigations, to various x86-specific drivers. General purpose or
1081  * architecture independent drivers should never be calling this function.
1082  *
1083  * The second means is through the auxiliary vector. The auxiliary vector is a
1084  * series of tagged data that the kernel passes down to a user program when it
1085  * begins executing. This information is used to indicate to programs what
1086  * instruction set extensions are present. For example, information about the
1087  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1088  * since user programs cannot make use of it. However, things like the AVX
1089  * instruction sets are. Programs use this information to make run-time
1090  * decisions about what features they should use. As an example, the run-time
1091  * link-editor (rtld) can relocate different functions depending on the hardware
1092  * support available.
1093  *
1094  * The final form is through a series of accessor functions that all have the
1095  * form cpuid_get*. This is used by a number of different subsystems in the
1096  * kernel to determine more detailed information about what we're running on,
1097  * topology information, etc. Some of these subsystems include processor groups
1098  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1099  * microcode, and performance monitoring. These functions all ASSERT that the
1100  * CPU they're being called on has reached a certain cpuid pass. If the passes
1101  * are rearranged, then this needs to be adjusted.
1102  *
1103  * -----------------------------------------------
1104  * Speculative Execution CPU Side Channel Security
1105  * -----------------------------------------------
1106  *
1107  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1108  * execution in the CPU to create side channels there have been a number of
1109  * different attacks and corresponding issues that the operating system needs to
1110  * mitigate against. The following list is some of the common, but not
1111  * exhaustive, set of issues that we know about and have done some or need to do
1112  * more work in the system to mitigate against:
1113  *
1114  *   - Spectre v1
1115  *   - swapgs (Spectre v1 variant)
1116  *   - Spectre v2
1117  *     - Branch History Injection (BHI).
1118  *   - Meltdown (Spectre v3)
1119  *   - Rogue Register Read (Spectre v3a)
1120  *   - Speculative Store Bypass (Spectre v4)
1121  *   - ret2spec, SpectreRSB
1122  *   - L1 Terminal Fault (L1TF)
1123  *   - Microarchitectural Data Sampling (MDS)
1124  *   - Register File Data Sampling (RFDS)
1125  *
1126  * Each of these requires different sets of mitigations and has different attack
1127  * surfaces. For the most part, this discussion is about protecting the kernel
1128  * from non-kernel executing environments such as user processes and hardware
1129  * virtual machines. Unfortunately, there are a number of user vs. user
1130  * scenarios that exist with these. The rest of this section will describe the
1131  * overall approach that the system has taken to address these as well as their
1132  * shortcomings. Unfortunately, not all of the above have been handled today.
1133  *
1134  * SPECTRE v2, ret2spec, SpectreRSB
1135  *
1136  * The second variant of the spectre attack focuses on performing branch target
1137  * injection. This generally impacts indirect call instructions in the system.
1138  * There are four different ways to mitigate this issue that are commonly
1139  * described today:
1140  *
1141  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1142  *  2. Using Retpolines and RSB Stuffing
1143  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1144  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1145  *
1146  * IBRS uses a feature added to microcode to restrict speculation, among other
1147  * things. This form of mitigation has not been used as it has been generally
1148  * seen as too expensive and requires reactivation upon various transitions in
1149  * the system.
1150  *
1151  * As a less impactful alternative to IBRS, retpolines were developed by
1152  * Google. These basically require one to replace indirect calls with a specific
1153  * trampoline that will cause speculation to fail and break the attack.
1154  * Retpolines require compiler support. We always build with retpolines in the
1155  * external thunk mode. This means that a traditional indirect call is replaced
1156  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1157  * of this is that all indirect function calls are performed through a register.
1158  *
1159  * We have to use a common external location of the thunk and not inline it into
1160  * the callsite so that way we can have a single place to patch these functions.
1161  * As it turns out, we currently have two different forms of retpolines that
1162  * exist in the system:
1163  *
1164  *  1. A full retpoline
1165  *  2. A no-op version
1166  *
1167  * The first one is used in the general case. Historically, there was an
1168  * AMD-specific optimized retpoline variant that was based around using a
1169  * serializing lfence instruction; however, in March 2022 it was announced that
1170  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1171  * use it and it is no longer available in the system.
1172  *
1173  * The third form described above is the most curious. It turns out that the way
1174  * that retpolines are implemented is that they rely on how speculation is
1175  * performed on a 'ret' instruction. Intel has continued to optimize this
1176  * process (which is partly why we need to have return stack buffer stuffing,
1177  * but more on that in a bit) and in processors starting with Cascade Lake
1178  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1179  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1180  *
1181  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1182  * physical core. However, if this is the case, we don't want to use retpolines
1183  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1184  * function (called a thunk) into a jmp instruction. This means that we're still
1185  * paying the cost of an extra jump to the external thunk, but it gives us
1186  * flexibility and the ability to have a single kernel image that works across a
1187  * wide variety of systems and hardware features.
1188  *
1189  * Unfortunately, this alone is insufficient. First, Skylake systems have
1190  * additional speculation for the Return Stack Buffer (RSB) which is used to
1191  * return from call instructions which retpolines take advantage of. However,
1192  * this problem is not just limited to Skylake and is actually more pernicious.
1193  * The SpectreRSB paper introduces several more problems that can arise with
1194  * dealing with this. The RSB can be poisoned just like the indirect branch
1195  * predictor. This means that one needs to clear the RSB when transitioning
1196  * between two different privilege domains. Some examples include:
1197  *
1198  *  - Switching between two different user processes
1199  *  - Going between user land and the kernel
1200  *  - Returning to the kernel from a hardware virtual machine
1201  *
1202  * Mitigating this involves combining a couple of different things. The first is
1203  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1204  * Bridge. When an RSB entry refers to a user address and we're executing in the
1205  * kernel, speculation through it will be stopped when SMEP is enabled. This
1206  * protects against a number of the different cases that we would normally be
1207  * worried about such as when we enter the kernel from user land.
1208  *
1209  * To protect against additional manipulation of the RSB from other contexts
1210  * such as a non-root VMX context attacking the kernel we first look to
1211  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1212  * nothing else that we need to do to protect the kernel at this time.
1213  *
1214  * Unfortunately, not all eIBRS implementations are sufficient to guard
1215  * against RSB manipulations, so we still need to manually overwrite the
1216  * contents of the return stack buffer unless the hardware specifies we are
1217  * covered. We do this through the x86_rsb_stuff() function.  Currently this
1218  * is employed on context switch and vmx_exit. The x86_rsb_stuff() function is
1219  * disabled only when mitigations in general are, or if we have hardware
1220  * indicating no need for post-barrier RSB protections, either in one place
1221  * (old hardware), or on both (newer hardware).
1222  *
1223  * If SMEP is not present, then we would have to stuff the RSB every time we
1224  * transitioned from user mode to the kernel, which isn't very practical right
1225  * now.
1226  *
1227  * To fully protect user to user and vmx to vmx attacks from these classes of
1228  * issues, we would also need to allow them to opt into performing an Indirect
1229  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1230  *
1231  * The fourth form of mitigation here is specific to AMD and is called Automated
1232  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1233  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1234  * (extended feature enable register) MSR. This bit basically says that IBRS
1235  * acts as though it is always active when executing at CPL0 and when executing
1236  * in the 'host' context when SEV-SNP is enabled.
1237  *
1238  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1239  * therefore it is unnecessary. While this handles RSB stuffing attacks from SVM
1240  * to the kernel, we must still consider the remaining cases that exist, just
1241  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1242  * traditional technique to work, this is not true on all CPUs. While a write to
1243  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1244  * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1245  * guard page is present between user and kernel address spaces and SMEP is
1246  * enabled, then there is no need to clear the RSB at all.
1247  *
1248  * By default, the system will enable RSB stuffing and the required variant of
1249  * retpolines and store that information in the x86_spectrev2_mitigation value.
1250  * This will be evaluated after a microcode update as well, though it is
1251  * expected that microcode updates will not take away features. This may mean
1252  * that a late loaded microcode may not end up in the optimal configuration
1253  * (though this should be rare).
1254  *
1255  * Currently we do not build kmdb with retpolines or perform any additional side
1256  * channel security mitigations for it. One complication with kmdb is that it
1257  * requires its own retpoline thunks and it would need to adjust itself based on
1258  * what the kernel does. The threat model of kmdb is more limited and therefore
1259  * it may make more sense to investigate using prediction barriers as the whole
1260  * system is only executing a single instruction at a time while in kmdb.
1261  *
1262  * Branch History Injection (BHI)
1263  *
1264  * BHI is a specific form of SPECTREv2 where an attacker may manipulate branch
1265  * history before transitioning from user to supervisor mode (or from VMX
1266  * non-root/guest to root mode). The attacker can then exploit certain
1267  * compiler-generated code-sequences ("gadgets") to disclose information from
1268  * other contexts or domains.  Recent (late-2023/early-2024) research in
1269  * object code analysis discovered many more potential gadgets than what was
1270  * initially reported (which previously was confined to Linux use of
1271  * unprivileged eBPF).
1272  *
1273  * The BHI threat doesn't exist in processors that predate eIBRS, or in AMD
1274  * ones. Some eIBRS processors have the ability to disable branch history in
1275  * certain (but not all) cases using an MSR write. eIBRS processors that don't
1276  * have the ability to disable must use a software sequence to scrub the
1277  * branch history buffer.
1278  *
1279  * BHI_DIS_S (the aforementioned MSR) protects ring 0 from ring 3 (VMX guest
1280  * or VMX root). It does not protect different user processes from each other,
1281  * or ring 3 VMX guest from ring 3 VMX root or vice versa.
1282  *
1283  * The BHI clearing sequence prevents user exploiting kernel gadgets, and user
1284  * A's use of user B's gadgets.
1285  *
1286  * SMEP and eIBRS are a continuing defense-in-depth measure protecting the
1287  * kernel.
1288  *
1289  * SPECTRE v1, v4
1290  *
1291  * The v1 and v4 variants of spectre are not currently mitigated in the
1292  * system and require other classes of changes to occur in the code.
1293  *
1294  * SPECTRE v1 (SWAPGS VARIANT)
1295  *
1296  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1297  * can generally affect any branch-dependent code. The swapgs issue is one
1298  * variant of this. If we are coming in from userspace, we can have code like
1299  * this:
1300  *
1301  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1302  *	je	1f
1303  *	movq	$0, REGOFF_SAVFP(%rsp)
1304  *	swapgs
1305  *	1:
1306  *	movq	%gs:CPU_THREAD, %rax
1307  *
1308  * If an attacker can cause a mis-speculation of the branch here, we could skip
1309  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1310  * load. If subsequent code can act as the usual Spectre cache gadget, this
1311  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1312  * any use of the %gs override.
1313  *
1314  * The other case is also an issue: if we're coming into a trap from kernel
1315  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1316  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1317  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1318  * case, and the fix is the same in both cases (an lfence at the branch target
1319  * 1: in this example), we'll just do it unconditionally.
1320  *
1321  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1322  * harder for user-space to actually set a useful %gsbase value: although it's
1323  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1324  * mitigate anyway.
1325  *
1326  * MELTDOWN
1327  *
1328  * Meltdown, or spectre v3, allowed a user process to read any data in their
1329  * address space regardless of whether or not the page tables in question
1330  * allowed the user to have the ability to read them. The solution to meltdown
1331  * is kernel page table isolation. In this world, there are two page tables that
1332  * are used for a process, one in user land and one in the kernel. To implement
1333  * this we use per-CPU page tables and switch between the user and kernel
1334  * variants when entering and exiting the kernel.  For more information about
1335  * this process and how the trampolines work, please see the big theory
1336  * statements and additional comments in:
1337  *
1338  *  - uts/i86pc/ml/kpti_trampolines.s
1339  *  - uts/i86pc/vm/hat_i86.c
1340  *
1341  * While Meltdown only impacted Intel systems and there are also Intel systems
1342  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1343  * kernel page table isolation enabled. While this may at first seem weird, an
1344  * important thing to remember is that you can't speculatively read an address
1345  * if it's never in your page table at all. Having user processes without kernel
1346  * pages present provides us with an important layer of defense in the kernel
1347  * against any other side channel attacks that exist and have yet to be
1348  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1349  * default, no matter the x86 system.
1350  *
1351  * L1 TERMINAL FAULT
1352  *
1353  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1354  * execution uses page table entries. Effectively, it is two different problems.
1355  * The first is that it ignores the not present bit in the page table entries
1356  * when performing speculative execution. This means that something can
1357  * speculatively read the listed physical address if it's present in the L1
1358  * cache under certain conditions (see Intel's documentation for the full set of
1359  * conditions). Secondly, this can be used to bypass hardware virtualization
1360  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1361  * instructions.
1362  *
1363  * For the non-hardware virtualized case, this is relatively easy to deal with.
1364  * We must make sure that all unmapped pages have an address of zero. This means
1365  * that they could read the first 4k of physical memory; however, we never use
1366  * that first page in the operating system and always skip putting it in our
1367  * memory map, even if firmware tells us we can use it in our memory map. While
1368  * other systems try to put extra metadata in the address and reserved bits,
1369  * which led to this being problematic in those cases, we do not.
1370  *
1371  * For hardware virtual machines things are more complicated. Because they can
1372  * construct their own page tables, it isn't hard for them to perform this
1373  * attack against any physical address. The one wrinkle is that this physical
1374  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1375  * to flush the L1 data cache. We wrap this up in the function
1376  * spec_uarch_flush(). This function is also used in the mitigation of
1377  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1378  * hypervisors such as KVM or bhyve are responsible for performing this before
1379  * entering the guest.
1380  *
1381  * Because this attack takes place in the L1 cache, there's another wrinkle
1382  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1383  * designs. This means that when a thread enters a hardware virtualized context
1384  * and flushes the L1 data cache, the other thread on the processor may then go
1385  * ahead and put new data in it that can be potentially attacked. While one
1386  * solution is to disable SMT on the system, another option that is available is
1387  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1388  * goes through and makes sure that if a HVM is being scheduled on one thread,
1389  * then the thing on the other thread is from the same hardware virtual machine.
1390  * If an interrupt comes in or the guest exits to the broader system, then the
1391  * other SMT thread will be kicked out.
1392  *
1393  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1394  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1395  * perform L1TF related mitigations.
1396  *
1397  * MICROARCHITECTURAL DATA SAMPLING
1398  *
1399  * Microarchitectural data sampling (MDS) is a combination of four discrete
1400  * vulnerabilities that are similar issues affecting various parts of the CPU's
1401  * microarchitectural implementation around load, store, and fill buffers.
1402  * Specifically it is made up of the following subcomponents:
1403  *
1404  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1405  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1406  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1407  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1408  *
1409  * To begin addressing these, Intel has introduced another feature in microcode
1410  * called MD_CLEAR. This changes the verw instruction to operate in a different
1411  * way. This allows us to execute the verw instruction in a particular way to
1412  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1413  * updated when this microcode is present to flush this state.
1414  *
1415  * Primarily we need to flush this state whenever we transition from the kernel
1416  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1417  * little bit different. Here the structures are statically sized when a logical
1418  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1419  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1420  * mwait, or another ACPI method. To perform these flushes, we call
1421  * x86_md_clear() at all of these transition points.
1422  *
1423  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1424  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1425  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1426  * a no-op.
1427  *
1428  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1429  * particular, everything we've discussed above is only valid for a single
1430  * thread executing on a core. In the case where you have hyper-threading
1431  * present, this attack can be performed between threads. The theoretical fix
1432  * for this is to ensure that both threads are always in the same security
1433  * domain. This means that they are executing in the same ring and mutually
1434  * trust each other. Practically speaking, this would mean that a system call
1435  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1436  * Rather than implement this, we recommend that one disables hyper-threading
1437  * through the use of psradm -aS.
1438  *
1439  * TSX ASYNCHRONOUS ABORT
1440  *
1441  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1442  * behaves like MDS, but leverages Intel's transactional instructions as another
1443  * vector. Effectively, when a transaction hits one of these cases (unmapped
1444  * page, various cache snoop activity, etc.) then the same data can be exposed
1445  * as in the case of MDS. This means that you can attack your twin.
1446  *
1447  * Intel has described that there are two different ways that we can mitigate
1448  * this problem on affected processors:
1449  *
1450  *   1) We can use the same techniques used to deal with MDS. Flushing the
1451  *      microarchitectural buffers and disabling hyperthreading will mitigate
1452  *      this in the same way.
1453  *
1454  *   2) Using microcode to disable TSX.
1455  *
1456  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1457  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1458  * That's OK as we're already doing all such mitigations. On the other hand,
1459  * processors with MDS_NO are all supposed to receive microcode updates that
1460  * enumerate support for disabling TSX. In general, we'd rather use this method
1461  * when available as it doesn't require disabling hyperthreading to be
1462  * effective. Currently we basically are relying on microcode for processors
1463  * that enumerate MDS_NO.
1464  *
1465  * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1466  * Sampling: RFDS. This allows an attacker to sample values that were in any
1467  * of integer, floating point, or vector registers. This was discovered by
1468  * Intel during internal validation work.  The existence of the RFDS_NO
1469  * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1470  * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1471  * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1472  * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1473  * MSR that L1D uses.
1474  *
1475  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1476  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1477  * different powers. The first allows us to cause all transactions to
1478  * immediately abort. The second gives us a means of disabling TSX completely,
1479  * which includes removing it from cpuid. If we have support for this in
1480  * microcode during the first cpuid pass, then we'll disable TSX completely such
1481  * that user land never has a chance to observe the bit. However, if we are late
1482  * loading the microcode, then we must use the functionality to cause
1483  * transactions to automatically abort. This is necessary for user land's sake.
1484  * Once a program sees a cpuid bit, it must not be taken away.
1485  *
1486  * We track whether or not we should do this based on what cpuid pass we're in.
1487  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1488  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1489  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1490  * second time after we do the initial microcode update.  As a result we need to
1491  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1492  * suitable microcode on the current CPU (which happens prior to
1493  * cpuid_pass_ucode()).
1494  *
1495  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1496  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1497  * unfortunate feature in a number of ways, and taking the opportunity to
1498  * finally be able to turn it off is likely to be of benefit in the future.
1499  *
1500  * SUMMARY
1501  *
1502  * The following table attempts to summarize the mitigations for various issues
1503  * and what's done in various places:
1504  *
1505  *  - Spectre v1: Not currently mitigated
1506  *  - swapgs: lfences after swapgs paths
1507  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1508  *  - Meltdown: Kernel Page Table Isolation
1509  *  - Spectre v3a: Updated CPU microcode
1510  *  - Spectre v4: Not currently mitigated
1511  *  - SpectreRSB: SMEP and RSB Stuffing
1512  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1513  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1514  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1515  *  - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1516  *  - BHI: software sequence, and use of BHI_DIS_S if microcode has it.
1517  *
1518  * The following table indicates the x86 feature set bits that indicate that a
1519  * given problem has been solved or a notable feature is present:
1520  *
1521  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1522  *  - MDS_NO: All forms of MDS
1523  *  - TAA_NO: TAA
1524  *  - RFDS_NO: RFDS
1525  *  - BHI_NO: BHI
1526  */
1527 
1528 #include <sys/types.h>
1529 #include <sys/archsystm.h>
1530 #include <sys/x86_archext.h>
1531 #include <sys/kmem.h>
1532 #include <sys/systm.h>
1533 #include <sys/cmn_err.h>
1534 #include <sys/sunddi.h>
1535 #include <sys/sunndi.h>
1536 #include <sys/cpuvar.h>
1537 #include <sys/processor.h>
1538 #include <sys/stdbool.h>
1539 #include <sys/sysmacros.h>
1540 #include <sys/pg.h>
1541 #include <sys/fp.h>
1542 #include <sys/controlregs.h>
1543 #include <sys/bitmap.h>
1544 #include <sys/auxv_386.h>
1545 #include <sys/memnode.h>
1546 #include <sys/pci_cfgspace.h>
1547 #include <sys/comm_page.h>
1548 #include <sys/mach_mmu.h>
1549 #include <sys/ucode.h>
1550 #include <sys/tsc.h>
1551 #include <sys/kobj.h>
1552 #include <sys/asm_misc.h>
1553 #include <sys/bitmap.h>
1554 
1555 #ifdef __xpv
1556 #include <sys/hypervisor.h>
1557 #else
1558 #include <sys/ontrap.h>
1559 #endif
1560 
1561 uint_t x86_vendor = X86_VENDOR_IntelClone;
1562 uint_t x86_type = X86_TYPE_OTHER;
1563 uint_t x86_clflush_size = 0;
1564 
1565 #if defined(__xpv)
1566 int x86_use_pcid = 0;
1567 int x86_use_invpcid = 0;
1568 #else
1569 int x86_use_pcid = -1;
1570 int x86_use_invpcid = -1;
1571 #endif
1572 
1573 typedef enum {
1574 	X86_SPECTREV2_RETPOLINE,
1575 	X86_SPECTREV2_ENHANCED_IBRS,
1576 	X86_SPECTREV2_AUTO_IBRS,
1577 	X86_SPECTREV2_DISABLED
1578 } x86_spectrev2_mitigation_t;
1579 
1580 uint_t x86_disable_spectrev2 = 0;
1581 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1582     X86_SPECTREV2_RETPOLINE;
1583 
1584 /*
1585  * The mitigation status for TAA:
1586  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1587  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1588  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1589  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1590  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1591  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1592  */
1593 typedef enum {
1594 	X86_TAA_NOTHING,
1595 	X86_TAA_DISABLED,
1596 	X86_TAA_MD_CLEAR,
1597 	X86_TAA_TSX_FORCE_ABORT,
1598 	X86_TAA_TSX_DISABLE,
1599 	X86_TAA_HW_MITIGATED
1600 } x86_taa_mitigation_t;
1601 
1602 uint_t x86_disable_taa = 0;
1603 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1604 
1605 uint_t pentiumpro_bug4046376;
1606 
1607 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1608 
1609 static char *x86_feature_names[NUM_X86_FEATURES] = {
1610 	"lgpg",
1611 	"tsc",
1612 	"msr",
1613 	"mtrr",
1614 	"pge",
1615 	"de",
1616 	"cmov",
1617 	"mmx",
1618 	"mca",
1619 	"pae",
1620 	"cv8",
1621 	"pat",
1622 	"sep",
1623 	"sse",
1624 	"sse2",
1625 	"htt",
1626 	"asysc",
1627 	"nx",
1628 	"sse3",
1629 	"cx16",
1630 	"cmp",
1631 	"tscp",
1632 	"mwait",
1633 	"sse4a",
1634 	"cpuid",
1635 	"ssse3",
1636 	"sse4_1",
1637 	"sse4_2",
1638 	"1gpg",
1639 	"clfsh",
1640 	"64",
1641 	"aes",
1642 	"pclmulqdq",
1643 	"xsave",
1644 	"avx",
1645 	"vmx",
1646 	"svm",
1647 	"topoext",
1648 	"f16c",
1649 	"rdrand",
1650 	"x2apic",
1651 	"avx2",
1652 	"bmi1",
1653 	"bmi2",
1654 	"fma",
1655 	"smep",
1656 	"smap",
1657 	"adx",
1658 	"rdseed",
1659 	"mpx",
1660 	"avx512f",
1661 	"avx512dq",
1662 	"avx512pf",
1663 	"avx512er",
1664 	"avx512cd",
1665 	"avx512bw",
1666 	"avx512vl",
1667 	"avx512fma",
1668 	"avx512vbmi",
1669 	"avx512_vpopcntdq",
1670 	"avx512_4vnniw",
1671 	"avx512_4fmaps",
1672 	"xsaveopt",
1673 	"xsavec",
1674 	"xsaves",
1675 	"sha",
1676 	"umip",
1677 	"pku",
1678 	"ospke",
1679 	"pcid",
1680 	"invpcid",
1681 	"ibrs",
1682 	"ibpb",
1683 	"stibp",
1684 	"ssbd",
1685 	"ssbd_virt",
1686 	"rdcl_no",
1687 	"ibrs_all",
1688 	"rsba",
1689 	"ssb_no",
1690 	"stibp_all",
1691 	"flush_cmd",
1692 	"l1d_vmentry_no",
1693 	"fsgsbase",
1694 	"clflushopt",
1695 	"clwb",
1696 	"monitorx",
1697 	"clzero",
1698 	"xop",
1699 	"fma4",
1700 	"tbm",
1701 	"avx512_vnni",
1702 	"amd_pcec",
1703 	"md_clear",
1704 	"mds_no",
1705 	"core_thermal",
1706 	"pkg_thermal",
1707 	"tsx_ctrl",
1708 	"taa_no",
1709 	"ppin",
1710 	"vaes",
1711 	"vpclmulqdq",
1712 	"lfence_serializing",
1713 	"gfni",
1714 	"avx512_vp2intersect",
1715 	"avx512_bitalg",
1716 	"avx512_vbmi2",
1717 	"avx512_bf16",
1718 	"auto_ibrs",
1719 	"rfds_no",
1720 	"rfds_clear",
1721 	"pbrsb_no",
1722 	"bhi_no",
1723 	"bhi_clear"
1724 };
1725 
1726 boolean_t
is_x86_feature(void * featureset,uint_t feature)1727 is_x86_feature(void *featureset, uint_t feature)
1728 {
1729 	ASSERT(feature < NUM_X86_FEATURES);
1730 	return (BT_TEST((ulong_t *)featureset, feature));
1731 }
1732 
1733 void
add_x86_feature(void * featureset,uint_t feature)1734 add_x86_feature(void *featureset, uint_t feature)
1735 {
1736 	ASSERT(feature < NUM_X86_FEATURES);
1737 	BT_SET((ulong_t *)featureset, feature);
1738 }
1739 
1740 void
remove_x86_feature(void * featureset,uint_t feature)1741 remove_x86_feature(void *featureset, uint_t feature)
1742 {
1743 	ASSERT(feature < NUM_X86_FEATURES);
1744 	BT_CLEAR((ulong_t *)featureset, feature);
1745 }
1746 
1747 boolean_t
compare_x86_featureset(void * setA,void * setB)1748 compare_x86_featureset(void *setA, void *setB)
1749 {
1750 	/*
1751 	 * We assume that the unused bits of the bitmap are always zero.
1752 	 */
1753 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1754 		return (B_TRUE);
1755 	} else {
1756 		return (B_FALSE);
1757 	}
1758 }
1759 
1760 void
print_x86_featureset(void * featureset)1761 print_x86_featureset(void *featureset)
1762 {
1763 	uint_t i;
1764 
1765 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1766 		if (is_x86_feature(featureset, i)) {
1767 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1768 			    x86_feature_names[i]);
1769 		}
1770 	}
1771 }
1772 
1773 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1774 static size_t xsave_state_size = 0;
1775 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1776 boolean_t xsave_force_disable = B_FALSE;
1777 extern int disable_smap;
1778 
1779 /*
1780  * This is set to platform type we are running on.
1781  */
1782 static int platform_type = -1;
1783 
1784 #if !defined(__xpv)
1785 /*
1786  * Variable to patch if hypervisor platform detection needs to be
1787  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1788  */
1789 int enable_platform_detection = 1;
1790 #endif
1791 
/*
 * monitor/mwait info.
 *
 * size_actual and buf_actual are the real size and address that were allocated
 * in order to get proper mwait_buf alignment. buf_actual and size_actual
 * should be passed to kmem_free(). Currently kmem_alloc() and mwait happen to
 * both use processor cache-line alignment, but this is not guaranteed in the
 * future.
 */
struct mwait_info {
	size_t		mon_min;	/* min size to avoid missed wakeups */
	size_t		mon_max;	/* size to avoid false wakeups */
	size_t		size_actual;	/* size actually allocated */
	void		*buf_actual;	/* memory actually allocated */
	uint32_t	support;	/* processor support of monitor/mwait */
};
1807 
/*
 * xsave/xrstor info.
 *
 * This structure contains HW feature bits and the size of the xsave save area.
 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
 * (xsave_state) to describe the xsave layout. However, at runtime the
 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
 * xsave_state structure simply represents the legacy layout of the beginning
 * of the xsave area.
 *
 * Each *_size / *_offset pair below describes one state component of the
 * save area.
 */
struct xsave_info {
	uint32_t	xsav_hw_features_low;   /* Supported HW features */
	uint32_t	xsav_hw_features_high;  /* Supported HW features */
	size_t		xsav_max_size;  /* max size save area for HW features */
	size_t		ymm_size;	/* AVX: size of ymm save area */
	size_t		ymm_offset;	/* AVX: offset for ymm save area */
	size_t		bndregs_size;	/* MPX: size of bndregs save area */
	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
	size_t		opmask_size;	/* AVX512: size of opmask save */
	size_t		opmask_offset;	/* AVX512: offset for opmask save */
	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
	size_t		pkru_size;	/* PKRU size */
	size_t		pkru_offset;	/* PKRU offset */
};
1837 
1838 
/*
 * How many cpuid elements are cached in the cpuid_info data structure. Any
 * element beyond these limits is still accessible through the cpuid
 * instruction itself.
 */

#define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
#define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
#define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1848 
1849 /*
1850  * See the big theory statement for a more detailed explanation of what some of
1851  * these members mean.
1852  */
1853 struct cpuid_info {
1854 	uint_t cpi_pass;		/* last pass completed */
1855 	/*
1856 	 * standard function information
1857 	 */
1858 	uint_t cpi_maxeax;		/* fn 0: %eax */
1859 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1860 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1861 
1862 	uint_t cpi_family;		/* fn 1: extended family */
1863 	uint_t cpi_model;		/* fn 1: extended model */
1864 	uint_t cpi_step;		/* fn 1: stepping */
1865 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1866 					/*		AMD: package/socket # */
1867 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1868 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1869 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1870 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1871 	uint_t cpi_ncache;		/* fn 2: number of elements */
1872 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1873 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1874 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1875 					/* Intel fn: 4, AMD fn: 8000001d */
1876 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1877 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1878 	struct cpuid_regs cpi_sub7[2];	/* Leaf 7, sub-leaves 1-2 */
1879 	/*
1880 	 * extended function information
1881 	 */
1882 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1883 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1884 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1885 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1886 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1887 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1888 
1889 	id_t cpi_coreid;		/* same coreid => strands share core */
1890 	int cpi_pkgcoreid;		/* core number within single package */
1891 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1892 					/* Intel: fn 4: %eax[31-26] */
1893 
1894 	/*
1895 	 * These values represent the number of bits that are required to store
1896 	 * information about the number of cores and threads.
1897 	 */
1898 	uint_t cpi_ncore_bits;
1899 	uint_t cpi_nthread_bits;
1900 	/*
1901 	 * supported feature information
1902 	 */
1903 	uint32_t cpi_support[6];
1904 #define	STD_EDX_FEATURES	0
1905 #define	AMD_EDX_FEATURES	1
1906 #define	TM_EDX_FEATURES		2
1907 #define	STD_ECX_FEATURES	3
1908 #define	AMD_ECX_FEATURES	4
1909 #define	STD_EBX_FEATURES	5
1910 	/*
1911 	 * Synthesized information, where known.
1912 	 */
1913 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1914 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1915 	uint32_t cpi_socket;		/* Chip package/socket type */
1916 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1917 
1918 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1919 	uint32_t cpi_apicid;
1920 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1921 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1922 					/* Intel: 1 */
1923 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1924 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1925 
1926 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1927 
1928 	/*
1929 	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1930 	 * eventually leaf 0x1F (Intel).
1931 	 */
1932 	uint_t cpi_topo_nleaves;
1933 	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1934 };
1935 
1936 
1937 static struct cpuid_info cpuid_info0;
1938 
/*
 * Accessors for the cached CPUID leaves. The bit fields are defined by the
 * Intel Application Note AP-485, "Intel Processor Identification and the
 * CPUID Instruction". Every macro takes a pointer to a struct cpuid_info.
 */

/* Leaf 1 %eax: family/model/stepping signature. */
#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)

/* Raw feature words from leaf 1, leaf 0x80000001 and leaf 7. */
#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
#define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
#define	CPI_FEATURES_7_2_EDX(cpi)	((cpi)->cpi_sub7[1].cp_edx)

/*
 * Leaf 1 %ebx is made up of four adjacent byte-wide fields. The CLFLUSH
 * chunk count lives in bits 15:8; it must not start at bit 7, which belongs
 * to the brand ID.
 */
#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 8)
#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)

#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
#define	CPI_XMAXEAX_MAX		0x80000100
#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1969 
/*
 * Function 4 (Deterministic Cache Parameters) macros, as defined by the
 * Intel Application Note AP-485. Each macro takes a pointer to the
 * struct cpuid_regs holding the leaf's output.
 */

/* %eax fields */
#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)

/* Values of CPI_CACHE_TYPE() */
#define	CPI_CACHE_TYPE_DONE	0
#define	CPI_CACHE_TYPE_DATA	1
#define	CPI_CACHE_TYPE_INSTR	2
#define	CPI_CACHE_TYPE_UNIFIED	3

/* Level type in %ecx; cpuid_leafB_getids() applies this to leaf 0xB output */
#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)

/* %ebx fields */
#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)

/* %ecx field */
#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)

/* %edx field */
#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1993 
1994 
/*
 * A couple of shorthand macros to identify "later" P6-family chips
 * like the Pentium M and Core.  First, the "older" P6-based stuff
 * (loosely defined as "pre-Pentium-4"):
 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
 *
 * All of these macros take a pointer to a struct cpuid_info. The argument is
 * parenthesized at every use so that any pointer expression (for example
 * "&info" or "array + n") may be passed safely.
 */
#define	IS_LEGACY_P6(cpi) (			\
	(cpi)->cpi_family == 6 &&		\
		((cpi)->cpi_model == 1 ||	\
		(cpi)->cpi_model == 3 ||	\
		(cpi)->cpi_model == 5 ||	\
		(cpi)->cpi_model == 6 ||	\
		(cpi)->cpi_model == 7 ||	\
		(cpi)->cpi_model == 8 ||	\
		(cpi)->cpi_model == 0xA ||	\
		(cpi)->cpi_model == 0xB)	\
)

/* A "new F6" is everything with family 6 that's not the above */
#define	IS_NEW_F6(cpi) (((cpi)->cpi_family == 6) && !IS_LEGACY_P6(cpi))

/* Extended family/model support */
#define	IS_EXTENDED_MODEL_INTEL(cpi) ((cpi)->cpi_family == 0x6 || \
	(cpi)->cpi_family >= 0xf)
2019 
/*
 * Info for monitor/mwait idle loop.
 *
 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
 * 2006.
 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
 * Documentation Updates" #33633, Rev 2.05, December 2006.
 *
 * The MWAIT_* macros that take "cpi" expect a pointer to a struct cpuid_info
 * and read its cached leaf 1 and leaf 5 registers.
 */
#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
/*
 * Number of sub-cstates for a given c-state. c_state is the bit offset of the
 * four-bit field within leaf 5 %edx; it is parenthesized so that an arbitrary
 * expression can be passed.
 */
#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
	BITX((cpi)->cpi_std[5].cp_edx, (c_state) + 3, (c_state))
2042 
/*
 * XSAVE leaf 0xD enumeration: offset and size used for the ymm component.
 */
#define	CPUID_LEAFD_2_YMM_OFFSET	576
#define	CPUID_LEAFD_2_YMM_SIZE		256

/*
 * Names for the commonly used extended leaves, to cut down on typos.
 */
#define	CPUID_LEAF_EXT_0		0x80000000
#define	CPUID_LEAF_EXT_8		0x80000008
#define	CPUID_LEAF_EXT_1d		0x8000001d
#define	CPUID_LEAF_EXT_1e		0x8000001e
#define	CPUID_LEAF_EXT_21		0x80000021
#define	CPUID_LEAF_EXT_26		0x80000026
2058 
2059 /*
2060  * Functions we consume from cpuid_subr.c;  don't publish these in a header
2061  * file to try and keep people using the expected cpuid_* interfaces.
2062  */
2063 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2064 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2065 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2066 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2067 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2068 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2069 
/*
 * Apply various platform-dependent restrictions where the underlying
 * platform restrictions mean the CPU can be marked as less capable than its
 * cpuid instruction would imply.
 *
 * vendor is the X86_VENDOR_* code of the CPU, eax is the leaf that was
 * queried, and cp points at the registers returned for that leaf, which are
 * modified in place. When __xpv is not defined this expands to nothing.
 */
#if defined(__xpv)
static void
platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
{
	/* Restrictions that are independent of the vendor. */
	switch (eax) {
	case 1: {
		/* MCA is only left visible to the initial domain. */
		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
		    0 : CPUID_INTC_EDX_MCA;
		cp->cp_edx &=
		    ~(mcamask |
		    CPUID_INTC_EDX_PSE |
		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
		    CPUID_AMD_EDX_SYSC |
		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
		break;
	}

	case 0x80000001:
		cp->cp_edx &=
		    ~(CPUID_AMD_EDX_PSE |
		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
		    CPUID_AMD_EDX_TSCP);
		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
		break;
	default:
		break;
	}

	/* Vendor-specific restrictions. */
	switch (vendor) {
	case X86_VENDOR_Intel:
		switch (eax) {
		case 4:
			/*
			 * Zero out the (ncores-per-chip - 1) field, which
			 * occupies %eax[31:26] (see CPI_NUM_CORES). Only bits
			 * 25:0 are preserved.
			 */
			cp->cp_eax &= 0x03ffffff;
			break;
		default:
			break;
		}
		break;
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		switch (eax) {

		case 0x80000001:
			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
			break;

		case CPUID_LEAF_EXT_8:
			/*
			 * Zero out the (ncores-per-chip - 1) field, which
			 * occupies %ecx[7:0].
			 */
			cp->cp_ecx &= 0xffffff00;
			break;
		default:
			break;
		}
		break;
	default:
		break;
	}
}
#else
#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
#endif
2146 
/*
 * Undocumented knobs for patching the results of the cpuid instruction, so
 * that Solaris 10 can be run on future cpus that we don't currently support.
 * They could be set to non-zero values via settings in eeprom.
 */

uint32_t cpuid_feature_ecx_include;
uint32_t cpuid_feature_ecx_exclude;
uint32_t cpuid_feature_edx_include;
uint32_t cpuid_feature_edx_exclude;
2158 
2159 /*
2160  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2161  */
2162 void
cpuid_alloc_space(cpu_t * cpu)2163 cpuid_alloc_space(cpu_t *cpu)
2164 {
2165 	/*
2166 	 * By convention, cpu0 is the boot cpu, which is set up
2167 	 * before memory allocation is available.  All other cpus get
2168 	 * their cpuid_info struct allocated here.
2169 	 */
2170 	ASSERT(cpu->cpu_id != 0);
2171 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2172 	cpu->cpu_m.mcpu_cpi =
2173 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2174 }
2175 
2176 void
cpuid_free_space(cpu_t * cpu)2177 cpuid_free_space(cpu_t *cpu)
2178 {
2179 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2180 	int i;
2181 
2182 	ASSERT(cpi != NULL);
2183 	ASSERT(cpi != &cpuid_info0);
2184 
2185 	/*
2186 	 * Free up any cache leaf related dynamic storage. The first entry was
2187 	 * cached from the standard cpuid storage, so we should not free it.
2188 	 */
2189 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2190 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2191 	if (cpi->cpi_cache_leaf_size > 0)
2192 		kmem_free(cpi->cpi_cache_leaves,
2193 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2194 
2195 	kmem_free(cpi, sizeof (*cpi));
2196 	cpu->cpu_m.mcpu_cpi = NULL;
2197 }
2198 
2199 #if !defined(__xpv)
2200 /*
2201  * Determine the type of the underlying platform. This is used to customize
2202  * initialization of various subsystems (e.g. TSC). determine_platform() must
2203  * only ever be called once to prevent two processors from seeing different
2204  * values of platform_type. Must be called before cpuid_pass_ident(), the
2205  * earliest consumer to execute; the identification pass will call
2206  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2207  */
2208 void
determine_platform(void)2209 determine_platform(void)
2210 {
2211 	struct cpuid_regs cp;
2212 	uint32_t base;
2213 	uint32_t regs[4];
2214 	char *hvstr = (char *)regs;
2215 
2216 	ASSERT(platform_type == -1);
2217 
2218 	platform_type = HW_NATIVE;
2219 
2220 	if (!enable_platform_detection)
2221 		return;
2222 
2223 	/*
2224 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2225 	 * vendor signature, and set platform type accordingly.
2226 	 *
2227 	 * References:
2228 	 * http://lkml.org/lkml/2008/10/1/246
2229 	 * http://kb.vmware.com/kb/1009458
2230 	 */
2231 	cp.cp_eax = 0x1;
2232 	(void) __cpuid_insn(&cp);
2233 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2234 		cp.cp_eax = 0x40000000;
2235 		(void) __cpuid_insn(&cp);
2236 		regs[0] = cp.cp_ebx;
2237 		regs[1] = cp.cp_ecx;
2238 		regs[2] = cp.cp_edx;
2239 		regs[3] = 0;
2240 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2241 			platform_type = HW_XEN_HVM;
2242 			return;
2243 		}
2244 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2245 			platform_type = HW_VMWARE;
2246 			return;
2247 		}
2248 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2249 			platform_type = HW_KVM;
2250 			return;
2251 		}
2252 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2253 			platform_type = HW_BHYVE;
2254 			return;
2255 		}
2256 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2257 			platform_type = HW_MICROSOFT;
2258 			return;
2259 		}
2260 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2261 			platform_type = HW_QEMU_TCG;
2262 			return;
2263 		}
2264 		if (strcmp(hvstr, HVSIG_VIRTUALBOX) == 0) {
2265 			platform_type = HW_VIRTUALBOX;
2266 			return;
2267 		}
2268 		if (strcmp(hvstr, HVSIG_ACRN) == 0) {
2269 			platform_type = HW_ACRN;
2270 			return;
2271 		}
2272 	} else {
2273 		/*
2274 		 * Check older VMware hardware versions. VMware hypervisor is
2275 		 * detected by performing an IN operation to VMware hypervisor
2276 		 * port and checking that value returned in %ebx is VMware
2277 		 * hypervisor magic value.
2278 		 *
2279 		 * References: http://kb.vmware.com/kb/1009458
2280 		 */
2281 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2282 		if (regs[1] == VMWARE_HVMAGIC) {
2283 			platform_type = HW_VMWARE;
2284 			return;
2285 		}
2286 	}
2287 
2288 	/*
2289 	 * Check Xen hypervisor. In a fully virtualized domain,
2290 	 * Xen's pseudo-cpuid function returns a string representing the
2291 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2292 	 * supported cpuid function. We need at least a (base + 2) leaf value
2293 	 * to do what we want to do. Try different base values, since the
2294 	 * hypervisor might use a different one depending on whether Hyper-V
2295 	 * emulation is switched on by default or not.
2296 	 */
2297 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2298 		cp.cp_eax = base;
2299 		(void) __cpuid_insn(&cp);
2300 		regs[0] = cp.cp_ebx;
2301 		regs[1] = cp.cp_ecx;
2302 		regs[2] = cp.cp_edx;
2303 		regs[3] = 0;
2304 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2305 		    cp.cp_eax >= (base + 2)) {
2306 			platform_type &= ~HW_NATIVE;
2307 			platform_type |= HW_XEN_HVM;
2308 			return;
2309 		}
2310 	}
2311 }
2312 
2313 int
get_hwenv(void)2314 get_hwenv(void)
2315 {
2316 	ASSERT(platform_type != -1);
2317 	return (platform_type);
2318 }
2319 
/*
 * In this (non-__xpv) build the answer is always 0.
 */
int
is_controldom(void)
{
	return (0);
}
2325 
2326 #else
2327 
2328 int
get_hwenv(void)2329 get_hwenv(void)
2330 {
2331 	return (HW_XEN_PV);
2332 }
2333 
2334 int
is_controldom(void)2335 is_controldom(void)
2336 {
2337 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2338 }
2339 
2340 #endif	/* __xpv */
2341 
2342 /*
2343  * Gather the extended topology information. This should be the same for both
2344  * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2345  */
2346 static void
cpuid_gather_ext_topo_leaf(struct cpuid_info * cpi,uint32_t leaf)2347 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2348 {
2349 	uint_t i;
2350 
2351 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2352 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2353 
2354 		bzero(regs, sizeof (struct cpuid_regs));
2355 		regs->cp_eax = leaf;
2356 		regs->cp_ecx = i;
2357 
2358 		(void) __cpuid_insn(regs);
2359 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2360 		    CPUID_AMD_8X26_TYPE_DONE) {
2361 			break;
2362 		}
2363 	}
2364 
2365 	cpi->cpi_topo_nleaves = i;
2366 }
2367 
2368 /*
2369  * Make sure that we have gathered all of the CPUID leaves that we might need to
2370  * determine topology. We assume that the standard leaf 1 has already been done
2371  * and that xmaxeax has already been calculated.
2372  */
2373 static void
cpuid_gather_amd_topology_leaves(cpu_t * cpu)2374 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2375 {
2376 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2377 
2378 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2379 		struct cpuid_regs *cp;
2380 
2381 		cp = &cpi->cpi_extd[8];
2382 		cp->cp_eax = CPUID_LEAF_EXT_8;
2383 		(void) __cpuid_insn(cp);
2384 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2385 	}
2386 
2387 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2388 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2389 		struct cpuid_regs *cp;
2390 
2391 		cp = &cpi->cpi_extd[0x1e];
2392 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2393 		(void) __cpuid_insn(cp);
2394 	}
2395 
2396 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2397 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2398 	}
2399 }
2400 
2401 /*
2402  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2403  * it to everything else. If not, and we're on an AMD system where 8000001e is
2404  * valid, then we use that. Othewrise, we fall back to the default value for the
2405  * APIC ID in leaf 1.
2406  */
2407 static uint32_t
cpuid_gather_apicid(struct cpuid_info * cpi)2408 cpuid_gather_apicid(struct cpuid_info *cpi)
2409 {
2410 	/*
2411 	 * Leaf B changes based on the arguments to it. Because we don't cache
2412 	 * it, we need to gather it again.
2413 	 */
2414 	if (cpi->cpi_maxeax >= 0xB) {
2415 		struct cpuid_regs regs;
2416 		struct cpuid_regs *cp;
2417 
2418 		cp = &regs;
2419 		cp->cp_eax = 0xB;
2420 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2421 		(void) __cpuid_insn(cp);
2422 
2423 		if (cp->cp_ebx != 0) {
2424 			return (cp->cp_edx);
2425 		}
2426 	}
2427 
2428 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2429 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2430 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2431 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2432 		return (cpi->cpi_extd[0x1e].cp_eax);
2433 	}
2434 
2435 	return (CPI_APIC_ID(cpi));
2436 }
2437 
2438 /*
2439  * For AMD processors, attempt to calculate the number of chips and cores that
2440  * exist. The way that we do this varies based on the generation, because the
2441  * generations themselves have changed dramatically.
2442  *
2443  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2444  * However, with the advent of family 17h (Zen) it actually tells us the number
2445  * of threads, so we need to look at leaf 0x8000001e if available to determine
2446  * its value. Otherwise, for all prior families, the number of enabled cores is
2447  * the same as threads.
2448  *
2449  * If we do not have leaf 0x80000008, then we assume that this processor does
2450  * not have anything. AMD's older CPUID specification says there's no reason to
2451  * fall back to leaf 1.
2452  *
2453  * In some virtualization cases we will not have leaf 8000001e or it will be
2454  * zero. When that happens we assume the number of threads is one.
2455  */
2456 static void
cpuid_amd_ncores(struct cpuid_info * cpi,uint_t * ncpus,uint_t * ncores)2457 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2458 {
2459 	uint_t nthreads, nthread_per_core;
2460 
2461 	nthreads = nthread_per_core = 1;
2462 
2463 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2464 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2465 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2466 		nthreads = CPI_CPU_COUNT(cpi);
2467 	}
2468 
2469 	/*
2470 	 * For us to have threads, and know about it, we have to be at least at
2471 	 * family 17h and have the cpuid bit that says we have extended
2472 	 * topology.
2473 	 */
2474 	if (cpi->cpi_family >= 0x17 &&
2475 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2476 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2477 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2478 	}
2479 
2480 	*ncpus = nthreads;
2481 	*ncores = nthreads / nthread_per_core;
2482 }
2483 
2484 /*
2485  * Seed the initial values for the cores and threads for an Intel based
2486  * processor. These values will be overwritten if we detect that the processor
2487  * supports CPUID leaf 0xb.
2488  */
2489 static void
cpuid_intel_ncores(struct cpuid_info * cpi,uint_t * ncpus,uint_t * ncores)2490 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2491 {
2492 	/*
2493 	 * Only seed the number of physical cores from the first level leaf 4
2494 	 * information. The number of threads there indicate how many share the
2495 	 * L1 cache, which may or may not have anything to do with the number of
2496 	 * logical CPUs per core.
2497 	 */
2498 	if (cpi->cpi_maxeax >= 4) {
2499 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2500 	} else {
2501 		*ncores = 1;
2502 	}
2503 
2504 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2505 		*ncpus = CPI_CPU_COUNT(cpi);
2506 	} else {
2507 		*ncpus = *ncores;
2508 	}
2509 }
2510 
2511 static boolean_t
cpuid_leafB_getids(cpu_t * cpu)2512 cpuid_leafB_getids(cpu_t *cpu)
2513 {
2514 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2515 	struct cpuid_regs regs;
2516 	struct cpuid_regs *cp;
2517 
2518 	if (cpi->cpi_maxeax < 0xB)
2519 		return (B_FALSE);
2520 
2521 	cp = &regs;
2522 	cp->cp_eax = 0xB;
2523 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2524 
2525 	(void) __cpuid_insn(cp);
2526 
2527 	/*
2528 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2529 	 * indicates that the extended topology enumeration leaf is
2530 	 * available.
2531 	 */
2532 	if (cp->cp_ebx != 0) {
2533 		uint32_t x2apic_id = 0;
2534 		uint_t coreid_shift = 0;
2535 		uint_t ncpu_per_core = 1;
2536 		uint_t chipid_shift = 0;
2537 		uint_t ncpu_per_chip = 1;
2538 		uint_t i;
2539 		uint_t level;
2540 
2541 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2542 			cp->cp_eax = 0xB;
2543 			cp->cp_ecx = i;
2544 
2545 			(void) __cpuid_insn(cp);
2546 			level = CPI_CPU_LEVEL_TYPE(cp);
2547 
2548 			if (level == 1) {
2549 				x2apic_id = cp->cp_edx;
2550 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2551 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2552 			} else if (level == 2) {
2553 				x2apic_id = cp->cp_edx;
2554 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2555 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2556 			}
2557 		}
2558 
2559 		/*
2560 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2561 		 */
2562 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2563 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2564 		    ncpu_per_core;
2565 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2566 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2567 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2568 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2569 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2570 		cpi->cpi_compunitid = cpi->cpi_coreid;
2571 
2572 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2573 			cpi->cpi_nthread_bits = coreid_shift;
2574 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2575 		}
2576 
2577 		return (B_TRUE);
2578 	} else {
2579 		return (B_FALSE);
2580 	}
2581 }
2582 
2583 static void
cpuid_intel_getids(cpu_t * cpu,void * feature)2584 cpuid_intel_getids(cpu_t *cpu, void *feature)
2585 {
2586 	uint_t i;
2587 	uint_t chipid_shift = 0;
2588 	uint_t coreid_shift = 0;
2589 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2590 
2591 	/*
2592 	 * There are no compute units or processor nodes currently on Intel.
2593 	 * Always set these to one.
2594 	 */
2595 	cpi->cpi_procnodes_per_pkg = 1;
2596 	cpi->cpi_cores_per_compunit = 1;
2597 
2598 	/*
2599 	 * If cpuid Leaf B is present, use that to try and get this information.
2600 	 * It will be the most accurate for Intel CPUs.
2601 	 */
2602 	if (cpuid_leafB_getids(cpu))
2603 		return;
2604 
2605 	/*
2606 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2607 	 * and ncore_per_chip. These represent the largest power of two values
2608 	 * that we need to cover all of the IDs in the system. Therefore, we use
2609 	 * those values to seed the number of bits needed to cover information
2610 	 * in the case when leaf B is not available. These values will probably
2611 	 * be larger than required, but that's OK.
2612 	 */
2613 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2614 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2615 
2616 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2617 		chipid_shift++;
2618 
2619 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2620 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2621 
2622 	if (is_x86_feature(feature, X86FSET_CMP)) {
2623 		/*
2624 		 * Multi-core (and possibly multi-threaded)
2625 		 * processors.
2626 		 */
2627 		uint_t ncpu_per_core = 0;
2628 
2629 		if (cpi->cpi_ncore_per_chip == 1)
2630 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2631 		else if (cpi->cpi_ncore_per_chip > 1)
2632 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2633 			    cpi->cpi_ncore_per_chip;
2634 		/*
2635 		 * 8bit APIC IDs on dual core Pentiums
2636 		 * look like this:
2637 		 *
2638 		 * +-----------------------+------+------+
2639 		 * | Physical Package ID   |  MC  |  HT  |
2640 		 * +-----------------------+------+------+
2641 		 * <------- chipid -------->
2642 		 * <------- coreid --------------->
2643 		 *			   <--- clogid -->
2644 		 *			   <------>
2645 		 *			   pkgcoreid
2646 		 *
2647 		 * Where the number of bits necessary to
2648 		 * represent MC and HT fields together equals
2649 		 * to the minimum number of bits necessary to
2650 		 * store the value of cpi->cpi_ncpu_per_chip.
2651 		 * Of those bits, the MC part uses the number
2652 		 * of bits necessary to store the value of
2653 		 * cpi->cpi_ncore_per_chip.
2654 		 */
2655 		for (i = 1; i < ncpu_per_core; i <<= 1)
2656 			coreid_shift++;
2657 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2658 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2659 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2660 		/*
2661 		 * Single-core multi-threaded processors.
2662 		 */
2663 		cpi->cpi_coreid = cpi->cpi_chipid;
2664 		cpi->cpi_pkgcoreid = 0;
2665 	} else {
2666 		/*
2667 		 * Single-core single-thread processors.
2668 		 */
2669 		cpi->cpi_coreid = cpu->cpu_id;
2670 		cpi->cpi_pkgcoreid = 0;
2671 	}
2672 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2673 	cpi->cpi_compunitid = cpi->cpi_coreid;
2674 }
2675 
2676 /*
2677  * Historically, AMD has had CMP chips with only a single thread per core.
2678  * However, starting in family 17h (Zen), this has changed and they now have
2679  * multiple threads. Our internal core id needs to be a unique value.
2680  *
2681  * To determine the core id of an AMD system, if we're from a family before 17h,
2682  * then we just use the cpu id, as that gives us a good value that will be
2683  * unique for each core. If instead, we're on family 17h or later, then we need
2684  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2685  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2686  * We can't use the normal core id in that leaf as it's only unique within the
2687  * socket, which is perfect for cpi_pkgcoreid, but not us.
2688  */
2689 static id_t
cpuid_amd_get_coreid(cpu_t * cpu)2690 cpuid_amd_get_coreid(cpu_t *cpu)
2691 {
2692 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2693 
2694 	if (cpi->cpi_family >= 0x17 &&
2695 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2696 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2697 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2698 		if (nthreads > 1) {
2699 			VERIFY3U(nthreads, ==, 2);
2700 			return (cpi->cpi_apicid >> 1);
2701 		}
2702 	}
2703 
2704 	return (cpu->cpu_id);
2705 }
2706 
2707 /*
2708  * IDs on AMD is a more challenging task. This is notable because of the
2709  * following two facts:
2710  *
2711  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2712  *     also no way to get an actual unique core id from the system. As such, we
2713  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2714  *     however, guarantee that sibling cores of a chip will have sequential
2715  *     coreids starting at a multiple of the number of cores per chip - that is
2716  *     usually the case, but if the APIC IDs have been set up in a different
2717  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2718  *
2719  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2720  *     called compute units. These compute units share the L1I cache, L2 cache,
2721  *     and the FPU. To deal with this, a new topology leaf was added in
2722  *     0x8000001e. However, parts of this leaf have different meanings
2723  *     once we get to family 0x17.
2724  */
2725 
2726 static void
cpuid_amd_getids(cpu_t * cpu,uchar_t * features)2727 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2728 {
2729 	int i, first_half, coreidsz;
2730 	uint32_t nb_caps_reg;
2731 	uint_t node2_1;
2732 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2733 	struct cpuid_regs *cp;
2734 
2735 	/*
2736 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2737 	 * hasn't been stripped by virtualization). We always set the compute
2738 	 * unit id to the same value. Also, initialize the default number of
2739 	 * cores per compute unit and nodes per package. This will be
2740 	 * overwritten when we know information about a particular family.
2741 	 */
2742 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2743 	cpi->cpi_compunitid = cpi->cpi_coreid;
2744 	cpi->cpi_cores_per_compunit = 1;
2745 	cpi->cpi_procnodes_per_pkg = 1;
2746 
2747 	/*
2748 	 * To construct the logical ID, we need to determine how many APIC IDs
2749 	 * are dedicated to the cores and threads. This is provided for us in
2750 	 * 0x80000008. However, if it's not present (say due to virtualization),
2751 	 * then we assume it's one. This should be present on all 64-bit AMD
2752 	 * processors.  It was added in family 0xf (Hammer).
2753 	 */
2754 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2755 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2756 
2757 		/*
2758 		 * In AMD parlance chip is really a node while illumos
2759 		 * uses chip as equivalent to socket/package.
2760 		 */
2761 		if (coreidsz == 0) {
2762 			/* Use legacy method */
2763 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2764 				coreidsz++;
2765 			if (coreidsz == 0)
2766 				coreidsz = 1;
2767 		}
2768 	} else {
2769 		/* Assume single-core part */
2770 		coreidsz = 1;
2771 	}
2772 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2773 
2774 	/*
2775 	 * The package core ID varies depending on the family. While it may be
2776 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2777 	 * this value is the core id in the given node. For non-virtualized
2778 	 * family 17h, we need to take the logical core id and shift off the
2779 	 * threads like we do when getting the core id.  Otherwise, we can use
2780 	 * the clogid as is. When family 17h is virtualized, the clogid should
2781 	 * be sufficient as if we don't have valid data in the leaf, then we
2782 	 * won't think we have SMT, in which case the cpi_clogid should be
2783 	 * sufficient.
2784 	 */
2785 	if (cpi->cpi_family >= 0x17 &&
2786 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2787 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2788 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2789 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2790 		if (nthreads > 1) {
2791 			VERIFY3U(nthreads, ==, 2);
2792 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2793 		} else {
2794 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2795 		}
2796 	} else {
2797 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2798 	}
2799 
2800 	/*
2801 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2802 	 * (bulldozer) or newer, then we can derive all of this from leaf
2803 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2804 	 */
2805 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2806 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2807 		cp = &cpi->cpi_extd[0x1e];
2808 
2809 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2810 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2811 
2812 		/*
2813 		 * For Bulldozer-era CPUs, recalculate the compute unit
2814 		 * information.
2815 		 */
2816 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2817 			cpi->cpi_cores_per_compunit =
2818 			    BITX(cp->cp_ebx, 15, 8) + 1;
2819 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2820 			    (cpi->cpi_ncore_per_chip /
2821 			    cpi->cpi_cores_per_compunit) *
2822 			    (cpi->cpi_procnodeid /
2823 			    cpi->cpi_procnodes_per_pkg);
2824 		}
2825 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2826 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2827 	} else if (cpi->cpi_family == 0x10) {
2828 		/*
2829 		 * See if we are a multi-node processor.
2830 		 * All processors in the system have the same number of nodes
2831 		 */
2832 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2833 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2834 			/* Single-node */
2835 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2836 			    coreidsz);
2837 		} else {
2838 
2839 			/*
2840 			 * Multi-node revision D (2 nodes per package
2841 			 * are supported)
2842 			 */
2843 			cpi->cpi_procnodes_per_pkg = 2;
2844 
2845 			first_half = (cpi->cpi_pkgcoreid <=
2846 			    (cpi->cpi_ncore_per_chip/2 - 1));
2847 
2848 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2849 				/* We are BSP */
2850 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2851 			} else {
2852 
2853 				/* We are AP */
2854 				/* NodeId[2:1] bits to use for reading F3xe8 */
2855 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2856 
2857 				nb_caps_reg =
2858 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2859 
2860 				/*
2861 				 * Check IntNodeNum bit (31:30, but bit 31 is
2862 				 * always 0 on dual-node processors)
2863 				 */
2864 				if (BITX(nb_caps_reg, 30, 30) == 0)
2865 					cpi->cpi_procnodeid = node2_1 +
2866 					    !first_half;
2867 				else
2868 					cpi->cpi_procnodeid = node2_1 +
2869 					    first_half;
2870 			}
2871 		}
2872 	} else {
2873 		cpi->cpi_procnodeid = 0;
2874 	}
2875 
2876 	cpi->cpi_chipid =
2877 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2878 
2879 	cpi->cpi_ncore_bits = coreidsz;
2880 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2881 	    cpi->cpi_ncore_per_chip);
2882 }
2883 
static void
spec_uarch_flush_noop(void)
{
	/* Deliberately empty: used when no flush is to be performed. */
	return;
}
2888 
2889 /*
2890  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2891  * MDS-related micro-architectural state that would normally happen by calling
2892  * x86_md_clear().
2893  */
2894 static void
spec_uarch_flush_msr(void)2895 spec_uarch_flush_msr(void)
2896 {
2897 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2898 }
2899 
2900 /*
2901  * This function points to a function that will flush certain
2902  * micro-architectural state on the processor. This flush is used to mitigate
2903  * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2904  * This function can point to one of three functions:
2905  *
2906  * - A noop which is done because we either are vulnerable, but do not have
2907  *   microcode available to help deal with a fix, or because we aren't
2908  *   vulnerable.
2909  *
2910  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2911  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2912  *   however, it only flushes the MDS related micro-architectural state on the
2913  *   current hyperthread, it does not do anything for the twin.
2914  *
2915  * - x86_md_clear which will flush the MDS related state. This is done when we
2916  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2917  *   (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2918  *   can clear it (RFDS_CLEAR is set).
2919  */
2920 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2921 
2922 static void
cpuid_update_md_clear(cpu_t * cpu,uchar_t * featureset)2923 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2924 {
2925 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2926 
2927 	/* Non-Intel doesn't concern us here. */
2928 	if (cpi->cpi_vendor != X86_VENDOR_Intel)
2929 		return;
2930 
2931 	/*
2932 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2933 	 * has been fixed in hardware, it doesn't cover everything related to
2934 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2935 	 * need to mitigate this.
2936 	 *
2937 	 * We must ALSO check the case of RFDS_NO and if RFDS_CLEAR is set,
2938 	 * because of the small cases of RFDS.
2939 	 */
2940 
2941 	if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2942 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2943 	    (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2944 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2945 		const uint8_t nop = NOP_INSTR;
2946 		uint8_t *md = (uint8_t *)x86_md_clear;
2947 
2948 		*md = nop;
2949 	}
2950 
2951 	membar_producer();
2952 }
2953 
2954 static void
cpuid_update_l1d_flush(cpu_t * cpu,uchar_t * featureset)2955 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2956 {
2957 	boolean_t need_l1d, need_mds, need_rfds;
2958 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2959 
2960 	/*
2961 	 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2962 	 * in hardware, then there's nothing left for us to do for enabling
2963 	 * the flush. We can also go ahead and say that SMT exclusion is
2964 	 * unnecessary.
2965 	 */
2966 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2967 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2968 	    is_x86_feature(featureset, X86FSET_MDS_NO) &&
2969 	    is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2970 		extern int smt_exclusion;
2971 		smt_exclusion = 0;
2972 		spec_uarch_flush = spec_uarch_flush_noop;
2973 		membar_producer();
2974 		return;
2975 	}
2976 
2977 	/*
2978 	 * The locations where we need to perform an L1D flush are required both
2979 	 * for mitigating L1TF and MDS. When verw support is present in
2980 	 * microcode, then the L1D flush will take care of doing that as well.
2981 	 * However, if we have a system where RDCL_NO is present, but we don't
2982 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2983 	 * L1D flush.
2984 	 */
2985 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2986 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2987 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2988 		need_l1d = B_TRUE;
2989 	} else {
2990 		need_l1d = B_FALSE;
2991 	}
2992 
2993 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2994 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2995 		need_mds = B_TRUE;
2996 	} else {
2997 		need_mds = B_FALSE;
2998 	}
2999 
3000 	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
3001 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
3002 		need_rfds = B_TRUE;
3003 	} else {
3004 		need_rfds = B_FALSE;
3005 	}
3006 
3007 	if (need_l1d) {
3008 		/*
3009 		 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
3010 		 * together. If the following VERIFY trips, we need to add
3011 		 * further fixes here.
3012 		 */
3013 		VERIFY(!need_rfds);
3014 		spec_uarch_flush = spec_uarch_flush_msr;
3015 	} else if (need_mds || need_rfds) {
3016 		spec_uarch_flush = x86_md_clear;
3017 	} else {
3018 		/*
3019 		 * We have no hardware mitigations available to us.
3020 		 */
3021 		spec_uarch_flush = spec_uarch_flush_noop;
3022 	}
3023 	membar_producer();
3024 }
3025 
3026 /*
3027  * Branch History Injection (BHI) mitigations.
3028  *
3029  * Intel has provided a software sequence that will scrub the BHB. Like RSB
3030  * (below) we can scribble a return at the beginning to avoid it if the CPU
3031  * is modern enough. We can also scribble a return if the CPU is old enough
3032  * to not have an RSB (pre-eIBRS).
3033  */
typedef enum {
	/* Pre-eIBRS or disabled */
	X86_BHI_TOO_OLD_OR_DISABLED,
	/* AMD, or Intel with BHI_NO set */
	X86_BHI_NEW_ENOUGH,
	/*
	 * BHI_NO == 0, but BHI_DIS_S avail.
	 * NOTE: BHI_DIS_S will still need the software sequence.
	 */
	X86_BHI_DIS_S,
	/* Use software sequence */
	X86_BHI_SOFTWARE_SEQUENCE,
} x86_native_bhi_mitigation_t;

/* The software sequence is the initial setting. */
x86_native_bhi_mitigation_t x86_bhi_mitigation = X86_BHI_SOFTWARE_SEQUENCE;
3043 
3044 static void
cpuid_enable_bhi_dis_s(void)3045 cpuid_enable_bhi_dis_s(void)
3046 {
3047 	uint64_t val;
3048 
3049 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3050 	val |= IA32_SPEC_CTRL_BHI_DIS_S;
3051 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3052 }
3053 
3054 /*
3055  * This function scribbles RET into the first instruction of x86_bhb_clear()
3056  * if SPECTREV2 mitigations are disabled, the CPU is too old, or the CPU is
3057  * new enough to fix (which includes non-Intel CPUs). If the CPU has BHI_DIS_S,
3058  * we enable it but leave the software sequence in place.
3059  */
3060 static x86_native_bhi_mitigation_t
cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit,cpu_t * cpu,uchar_t * featureset)3061 cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit, cpu_t *cpu,
3062     uchar_t *featureset)
3063 {
3064 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3065 	const uint8_t ret = RET_INSTR;
3066 	uint8_t *bhb_clear = (uint8_t *)x86_bhb_clear;
3067 
3068 	ASSERT0(cpu->cpu_id);
3069 
3070 	/* First check for explicitly disabled... */
3071 	if (v2mit == X86_SPECTREV2_DISABLED) {
3072 		*bhb_clear = ret;
3073 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3074 	}
3075 
3076 	/*
3077 	 * Then check for BHI_NO, which means the CPU doesn't have this bug,
3078 	 * or if it's non-Intel, in which case this mitigation mechanism
3079 	 * doesn't apply.
3080 	 */
3081 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
3082 	    is_x86_feature(featureset, X86FSET_BHI_NO)) {
3083 		*bhb_clear = ret;
3084 		return (X86_BHI_NEW_ENOUGH);
3085 	}
3086 
3087 	/*
3088 	 * Now check for the BHI_CTRL MSR, and then set it if available.
3089 	 * We will still need to use the software sequence, however.
3090 	 */
3091 	if (is_x86_feature(featureset, X86FSET_BHI_CTRL)) {
3092 		cpuid_enable_bhi_dis_s();
3093 		return (X86_BHI_DIS_S);
3094 	}
3095 
3096 	/*
3097 	 * Finally, check if we are too old to bother with RSB:
3098 	 */
3099 	if (v2mit == X86_SPECTREV2_RETPOLINE) {
3100 		*bhb_clear = ret;
3101 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3102 	}
3103 
3104 	ASSERT(*bhb_clear != ret);
3105 	return (X86_BHI_SOFTWARE_SEQUENCE);
3106 }
3107 
3108 /*
3109  * We default to enabling Return Stack Buffer (RSB) mitigations.
3110  *
3111  * We used to skip RSB mitigations with Intel eIBRS, but developments around
3112  * post-barrier RSB (PBRSB) guessing suggests we should enable Intel RSB
3113  * mitigations always unless explicitly bypassed, or unless hardware indicates
3114  * the bug has been fixed.
3115  *
3116  * The current decisions for using, or ignoring, a RSB software stuffing
3117  * sequence are expressed by the following table:
3118  *
3119  * +-------+------------+-----------------+--------+
3120  * | eIBRS |  PBRSB_NO  |  context switch | vmexit |
3121  * +-------+------------+-----------------+--------+
3122  * |   Yes |     No     |  stuff          | stuff  |
3123  * |   Yes |     Yes    |  ignore         | ignore |
3124  * |   No  |     No     |  stuff          | ignore |
3125  * +-------+------------+-----------------+--------+
3126  *
3127  * Note that if an Intel CPU has no eIBRS, it will never enumerate PBRSB_NO,
3128  * because machines with no eIBRS do not have a problem with PBRSB overflow.
3129  * See the Intel document cited below for details.
3130  *
3131  * Also note that AMD AUTO_IBRS has no PBRSB problem, so it is not included in
3132  * the table above, and that there is no situation where vmexit stuffing is
3133  * needed, but context-switch stuffing isn't.
3134  */
3135 
3136 /* BEGIN CSTYLED */
3137 /*
3138  * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/post-barrier-return-stack-buffer-predictions.html
3139  */
3140 /* END CSTYLED */
3141 
3142 /*
3143  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
3144  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
3145  * also states that as long as SMEP is enabled and we maintain at least one
3146  * page between the kernel and user space (we have much more of a red zone),
3147  * then we do not need to clear the RSB. We constrain this to only when
3148  * Automatic IBRS is present.
3149  */
3150 static void
cpuid_patch_rsb(x86_spectrev2_mitigation_t mit,bool intel_pbrsb_no)3151 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
3152 {
3153 	const uint8_t ret = RET_INSTR;
3154 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3155 	uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
3156 
3157 	switch (mit) {
3158 	case X86_SPECTREV2_AUTO_IBRS:
3159 	case X86_SPECTREV2_DISABLED:
3160 		/* Don't bother with any RSB stuffing! */
3161 		*stuff = ret;
3162 		*vmx_stuff = ret;
3163 		break;
3164 	case X86_SPECTREV2_RETPOLINE:
3165 		/*
3166 		 * The Intel document on Post-Barrier RSB says that processors
3167 		 * without eIBRS do not have PBRSB problems upon VMEXIT.
3168 		 */
3169 		VERIFY(!intel_pbrsb_no);
3170 		VERIFY3U(*stuff, !=, ret);
3171 		*vmx_stuff = ret;
3172 		break;
3173 	default:
3174 		/*
3175 		 * eIBRS is all that's left.  If CPU claims PBRSB is fixed,
3176 		 * don't use the RSB mitigation in either case.  Otherwise
3177 		 * both vmexit and context-switching require the software
3178 		 * mitigation.
3179 		 */
3180 		if (intel_pbrsb_no) {
3181 			/* CPU claims PBRSB problems are fixed. */
3182 			*stuff = ret;
3183 			*vmx_stuff = ret;
3184 		}
3185 		VERIFY3U(*stuff, ==, *vmx_stuff);
3186 		break;
3187 	}
3188 }
3189 
3190 static void
cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)3191 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3192 {
3193 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3194 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3195 	    "_r14", "_r15" };
3196 	const uint_t nthunks = ARRAY_SIZE(thunks);
3197 	const char *type;
3198 	uint_t i;
3199 
3200 	if (mit == x86_spectrev2_mitigation)
3201 		return;
3202 
3203 	switch (mit) {
3204 	case X86_SPECTREV2_RETPOLINE:
3205 		type = "gen";
3206 		break;
3207 	case X86_SPECTREV2_AUTO_IBRS:
3208 	case X86_SPECTREV2_ENHANCED_IBRS:
3209 	case X86_SPECTREV2_DISABLED:
3210 		type = "jmp";
3211 		break;
3212 	default:
3213 		panic("asked to update retpoline state with unknown state!");
3214 	}
3215 
3216 	for (i = 0; i < nthunks; i++) {
3217 		uintptr_t source, dest;
3218 		int ssize, dsize;
3219 		char sourcebuf[64], destbuf[64];
3220 
3221 		(void) snprintf(destbuf, sizeof (destbuf),
3222 		    "__x86_indirect_thunk%s", thunks[i]);
3223 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
3224 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
3225 
3226 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3227 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3228 		VERIFY3U(source, !=, 0);
3229 		VERIFY3U(dest, !=, 0);
3230 		VERIFY3S(dsize, >=, ssize);
3231 		bcopy((void *)source, (void *)dest, ssize);
3232 	}
3233 }
3234 
3235 static void
cpuid_enable_enhanced_ibrs(void)3236 cpuid_enable_enhanced_ibrs(void)
3237 {
3238 	uint64_t val;
3239 
3240 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3241 	val |= IA32_SPEC_CTRL_IBRS;
3242 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3243 }
3244 
3245 static void
cpuid_enable_auto_ibrs(void)3246 cpuid_enable_auto_ibrs(void)
3247 {
3248 	uint64_t val;
3249 
3250 	val = rdmsr(MSR_AMD_EFER);
3251 	val |= AMD_EFER_AIBRSE;
3252 	wrmsr(MSR_AMD_EFER, val);
3253 }
3254 
3255 /*
3256  * AMD Zen 5 processors are affected by a defect where the 16- and 32-bit
3257  * forms of the RDSEED instruction may return 0 despite indicating success
3258  * (CF=1) - See AMD-SB-7055 / CVE-2025-62626.
3259  *
3260  * This table records the minimum microcode revision for each affected CPU
3261  * at which RDSEED is considered reliable and may be exposed. On all other
3262  * Zen 5 parts, or when running below the listed revision, RDSEED is masked
3263  * from CPUID leaf 7 feature reporting.
3264  *
3265  * The model field is required to distinguish between Krackan and Krackan2,
3266  * which otherwise share the same chip revision identifier.
3267  */
3268 static struct cpuid_fwrev {
3269 	const x86_chiprev_t	cf_chiprev;
3270 	const uint_t		cf_model;
3271 	const uint32_t		cf_minfwrev;
3272 } cpuid_amd_zen5_rdseed_good[] = {
3273 	{ X86_CHIPREV_AMD_TURIN_C1,		0x02,	0x0b00215a },
3274 	{ X86_CHIPREV_AMD_DENSE_TURIN_B0,	0x11,	0x0b101054 },
3275 	{ X86_CHIPREV_AMD_STRIX_B0,		0x24,	0x0b204037 },
3276 	{ X86_CHIPREV_AMD_GRANITE_RIDGE_B0,	0x44,	0x0b404035 },
3277 	{ X86_CHIPREV_AMD_GRANITE_RIDGE_B1,	0x44,	0x0b404108 },
3278 	{ X86_CHIPREV_AMD_KRACKAN_A0,		0x60,	0x0b600037 },
3279 	{ X86_CHIPREV_AMD_KRACKAN_A0,		0x68,	0x0b608038 },
3280 	{ X86_CHIPREV_AMD_STRIX_HALO_A0,	0x70,	0x0b700037 },
3281 	{ X86_CHIPREV_AMD_SHIMADA_PEAK_C1,	0x08,	0x0b008121 },
3282 };
3283 
3284 static void
cpuid_evaluate_amd_rdseed(cpu_t * cpu,uchar_t * featureset)3285 cpuid_evaluate_amd_rdseed(cpu_t *cpu, uchar_t *featureset)
3286 {
3287 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3288 	struct cpuid_regs *ecp = &cpi->cpi_std[7];
3289 	uint32_t rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
3290 
3291 	ASSERT3U(cpi->cpi_vendor, ==, X86_VENDOR_AMD);
3292 	ASSERT(ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED);
3293 
3294 	/* This erratum only applies to the Zen 5 uarch */
3295 	if (uarchrev_uarch(cpi->cpi_uarchrev) != X86_UARCH_AMD_ZEN5)
3296 		return;
3297 
3298 	/*
3299 	 * If the CPU microcode is new enough then this issue is mitigated.
3300 	 * Unfortunately there is not a bit that indicates this so we need to
3301 	 * check the version explicitly against a table of known good versions.
3302 	 */
3303 	for (size_t i = 0; i < ARRAY_SIZE(cpuid_amd_zen5_rdseed_good); i++) {
3304 		const struct cpuid_fwrev *cf = &cpuid_amd_zen5_rdseed_good[i];
3305 
3306 		if (chiprev_matches(cpi->cpi_chiprev, cf->cf_chiprev) &&
3307 		    cpi->cpi_model == cf->cf_model && rev >= cf->cf_minfwrev) {
3308 			/* Mitigated, leave enabled. */
3309 			return;
3310 		}
3311 	}
3312 
3313 	/*
3314 	 * Go ahead and disable RDSEED on this boot.
3315 	 * In addition to removing it from the feature set and cached value, we
3316 	 * also need to remove it from the features returned by CPUID7 so that
3317 	 * userland programs performing their own feature detection will
3318 	 * determine it is not available.
3319 	 */
3320 	if (cpu->cpu_id == 0)
3321 		cmn_err(CE_WARN, "Masking unreliable RDSEED on this hardware");
3322 
3323 	remove_x86_feature(featureset, X86FSET_RDSEED);
3324 	ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
3325 
3326 	/*
3327 	 * Some hypervisors that expose RDSEED do not emulate this MSR and so
3328 	 * we guard against a trap here.
3329 	 */
3330 #ifndef __xpv
3331 	on_trap_data_t otd;
3332 
3333 	if (!on_trap(&otd, OT_DATA_ACCESS)) {
3334 		uint64_t val;
3335 
3336 		val = rdmsr(MSR_AMD_CPUID7_FEATURES);
3337 		val &= ~MSR_AMD_CPUID7_FEATURES_RDSEED;
3338 		wrmsr(MSR_AMD_CPUID7_FEATURES, val);
3339 	}
3340 	no_trap();
3341 #endif
3342 }
3343 
3344 /*
3345  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3346  * we can disable TSX, we do so.
3347  *
3348  * This determination is done only on the boot CPU, potentially after loading
3349  * updated microcode.
3350  */
3351 static void
cpuid_update_tsx(cpu_t * cpu,uchar_t * featureset)3352 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3353 {
3354 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3355 
3356 	VERIFY(cpu->cpu_id == 0);
3357 
3358 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3359 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3360 		return;
3361 	}
3362 
3363 	if (x86_disable_taa) {
3364 		x86_taa_mitigation = X86_TAA_DISABLED;
3365 		return;
3366 	}
3367 
3368 	/*
3369 	 * If we do not have the ability to disable TSX, then our only
3370 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3371 	 * MDS mitigation as described above.  The latter relies upon us having
3372 	 * configured MDS mitigations correctly! This includes disabling SMT if
3373 	 * we want to cross-CPU-thread protection.
3374 	 */
3375 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3376 		/*
3377 		 * It's not clear whether any parts will enumerate TAA_NO
3378 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3379 		 */
3380 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3381 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3382 			return;
3383 		}
3384 
3385 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3386 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3387 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3388 		} else {
3389 			x86_taa_mitigation = X86_TAA_NOTHING;
3390 		}
3391 		return;
3392 	}
3393 
3394 	/*
3395 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3396 	 * enough in boot.
3397 	 *
3398 	 * Otherwise, we'll fall back to causing transactions to abort as our
3399 	 * mitigation. TSX-using code will always take the fallback path.
3400 	 */
3401 	if (cpi->cpi_pass < 4) {
3402 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3403 	} else {
3404 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3405 	}
3406 }
3407 
3408 /*
3409  * As mentioned, we should only touch the MSR when we've got a suitable
3410  * microcode loaded on this CPU.
3411  */
3412 static void
cpuid_apply_tsx(x86_taa_mitigation_t taa,uchar_t * featureset)3413 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3414 {
3415 	uint64_t val;
3416 
3417 	switch (taa) {
3418 	case X86_TAA_TSX_DISABLE:
3419 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3420 			return;
3421 		val = rdmsr(MSR_IA32_TSX_CTRL);
3422 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3423 		wrmsr(MSR_IA32_TSX_CTRL, val);
3424 		break;
3425 	case X86_TAA_TSX_FORCE_ABORT:
3426 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3427 			return;
3428 		val = rdmsr(MSR_IA32_TSX_CTRL);
3429 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3430 		wrmsr(MSR_IA32_TSX_CTRL, val);
3431 		break;
3432 	case X86_TAA_HW_MITIGATED:
3433 	case X86_TAA_MD_CLEAR:
3434 	case X86_TAA_DISABLED:
3435 	case X86_TAA_NOTHING:
3436 		break;
3437 	}
3438 }
3439 
3440 static void
cpuid_scan_security(cpu_t * cpu,uchar_t * featureset)3441 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3442 {
3443 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3444 	x86_spectrev2_mitigation_t v2mit;
3445 
3446 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3447 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3448 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3449 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3450 			add_x86_feature(featureset, X86FSET_IBPB);
3451 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3452 			add_x86_feature(featureset, X86FSET_IBRS);
3453 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3454 			add_x86_feature(featureset, X86FSET_STIBP);
3455 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3456 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3457 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3458 			add_x86_feature(featureset, X86FSET_SSBD);
3459 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3460 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3461 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3462 			add_x86_feature(featureset, X86FSET_SSB_NO);
3463 
3464 		/*
3465 		 * Rather than Enhanced IBRS, AMD has a different feature that
3466 		 * is a bit in EFER that can be enabled and will basically do
3467 		 * the right thing while executing in the kernel.
3468 		 */
3469 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3470 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3471 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3472 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3473 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3474 		}
3475 
3476 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3477 	    cpi->cpi_maxeax >= 7) {
3478 		struct cpuid_regs *ecp;
3479 		ecp = &cpi->cpi_std[7];
3480 
3481 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3482 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3483 		}
3484 
3485 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3486 			add_x86_feature(featureset, X86FSET_IBRS);
3487 			add_x86_feature(featureset, X86FSET_IBPB);
3488 		}
3489 
3490 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3491 			add_x86_feature(featureset, X86FSET_STIBP);
3492 		}
3493 
3494 		/*
3495 		 * Some prediction controls are enumerated by subleaf 2 of
3496 		 * leaf 7.
3497 		 */
3498 		if (CPI_FEATURES_7_2_EDX(cpi) & CPUID_INTC_EDX_7_2_BHI_CTRL) {
3499 			add_x86_feature(featureset, X86FSET_BHI_CTRL);
3500 		}
3501 
3502 		/*
3503 		 * Don't read the arch caps MSR on xpv where we lack the
3504 		 * on_trap().
3505 		 */
3506 #ifndef __xpv
3507 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3508 			on_trap_data_t otd;
3509 
3510 			/*
3511 			 * Be paranoid and assume we'll get a #GP.
3512 			 */
3513 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3514 				uint64_t reg;
3515 
3516 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3517 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3518 					add_x86_feature(featureset,
3519 					    X86FSET_RDCL_NO);
3520 				}
3521 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3522 					add_x86_feature(featureset,
3523 					    X86FSET_IBRS_ALL);
3524 				}
3525 				if (reg & IA32_ARCH_CAP_RSBA) {
3526 					add_x86_feature(featureset,
3527 					    X86FSET_RSBA);
3528 				}
3529 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3530 					add_x86_feature(featureset,
3531 					    X86FSET_L1D_VM_NO);
3532 				}
3533 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3534 					add_x86_feature(featureset,
3535 					    X86FSET_SSB_NO);
3536 				}
3537 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3538 					add_x86_feature(featureset,
3539 					    X86FSET_MDS_NO);
3540 				}
3541 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3542 					add_x86_feature(featureset,
3543 					    X86FSET_TSX_CTRL);
3544 				}
3545 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3546 					add_x86_feature(featureset,
3547 					    X86FSET_TAA_NO);
3548 				}
3549 				if (reg & IA32_ARCH_CAP_RFDS_NO) {
3550 					add_x86_feature(featureset,
3551 					    X86FSET_RFDS_NO);
3552 				}
3553 				if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3554 					add_x86_feature(featureset,
3555 					    X86FSET_RFDS_CLEAR);
3556 				}
3557 				if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3558 					add_x86_feature(featureset,
3559 					    X86FSET_PBRSB_NO);
3560 				}
3561 				if (reg & IA32_ARCH_CAP_BHI_NO) {
3562 					add_x86_feature(featureset,
3563 					    X86FSET_BHI_NO);
3564 				}
3565 			}
3566 			no_trap();
3567 		}
3568 #endif	/* !__xpv */
3569 
3570 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3571 			add_x86_feature(featureset, X86FSET_SSBD);
3572 
3573 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3574 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3575 	}
3576 
3577 	/*
3578 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3579 	 * will have already run this function and determined what we need to
3580 	 * do. This gives us a hook for per-HW thread mitigations such as
3581 	 * enhanced IBRS, or disabling TSX.
3582 	 */
3583 	if (cpu->cpu_id != 0) {
3584 		switch (x86_spectrev2_mitigation) {
3585 		case X86_SPECTREV2_ENHANCED_IBRS:
3586 			cpuid_enable_enhanced_ibrs();
3587 			break;
3588 		case X86_SPECTREV2_AUTO_IBRS:
3589 			cpuid_enable_auto_ibrs();
3590 			break;
3591 		default:
3592 			break;
3593 		}
3594 
3595 		/* If we're committed to BHI_DIS_S, set it for this core. */
3596 		if (x86_bhi_mitigation == X86_BHI_DIS_S)
3597 			cpuid_enable_bhi_dis_s();
3598 
3599 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3600 		return;
3601 	}
3602 
3603 	/*
3604 	 * Go through and initialize various security mechanisms that we should
3605 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3606 	 * TAA.
3607 	 */
3608 
3609 	/*
3610 	 * By default we've come in with retpolines enabled. Check whether we
3611 	 * should disable them or enable enhanced or automatic IBRS.
3612 	 *
3613 	 * Note, we do not allow the use of AMD optimized retpolines as it was
3614 	 * disclosed by AMD in March 2022 that they were still
3615 	 * vulnerable. Prior to that point, we used them.
3616 	 */
3617 	if (x86_disable_spectrev2 != 0) {
3618 		v2mit = X86_SPECTREV2_DISABLED;
3619 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3620 		cpuid_enable_auto_ibrs();
3621 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3622 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3623 		cpuid_enable_enhanced_ibrs();
3624 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3625 	} else {
3626 		v2mit = X86_SPECTREV2_RETPOLINE;
3627 	}
3628 
3629 	cpuid_patch_retpolines(v2mit);
3630 	cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3631 	x86_bhi_mitigation = cpuid_learn_and_patch_bhi(v2mit, cpu, featureset);
3632 	x86_spectrev2_mitigation = v2mit;
3633 	membar_producer();
3634 
3635 	/*
3636 	 * We need to determine what changes are required for mitigating L1TF
3637 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3638 	 * is required.
3639 	 *
3640 	 * If any of these are present, then we need to flush u-arch state at
3641 	 * various points. For MDS, we need to do so whenever we change to a
3642 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3643 	 * flush the L1D cache at VM entry. When we have microcode that handles
3644 	 * MDS, the L1D flush also clears the other u-arch state that the
3645 	 * md_clear does.
3646 	 */
3647 
3648 	/*
3649 	 * Update whether or not we need to be taking explicit action against
3650 	 * MDS or RFDS.
3651 	 */
3652 	cpuid_update_md_clear(cpu, featureset);
3653 
3654 	/*
3655 	 * Determine whether SMT exclusion is required and whether or not we
3656 	 * need to perform an l1d flush.
3657 	 */
3658 	cpuid_update_l1d_flush(cpu, featureset);
3659 
3660 	/*
3661 	 * Determine what our mitigation strategy should be for TAA and then
3662 	 * also apply TAA mitigations.
3663 	 */
3664 	cpuid_update_tsx(cpu, featureset);
3665 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3666 }
3667 
3668 /*
3669  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3670  */
3671 void
setup_xfem(void)3672 setup_xfem(void)
3673 {
3674 	uint64_t flags = XFEATURE_LEGACY_FP;
3675 
3676 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3677 
3678 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3679 		flags |= XFEATURE_SSE;
3680 
3681 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3682 		flags |= XFEATURE_AVX;
3683 
3684 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3685 		flags |= XFEATURE_AVX512;
3686 
3687 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3688 
3689 	xsave_bv_all = flags;
3690 }
3691 
3692 static void
cpuid_basic_topology(cpu_t * cpu,uchar_t * featureset)3693 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3694 {
3695 	struct cpuid_info *cpi;
3696 
3697 	cpi = cpu->cpu_m.mcpu_cpi;
3698 
3699 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3700 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3701 		cpuid_gather_amd_topology_leaves(cpu);
3702 	}
3703 
3704 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3705 
3706 	/*
3707 	 * Before we can calculate the IDs that we should assign to this
3708 	 * processor, we need to understand how many cores and threads it has.
3709 	 */
3710 	switch (cpi->cpi_vendor) {
3711 	case X86_VENDOR_Intel:
3712 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3713 		    &cpi->cpi_ncore_per_chip);
3714 		break;
3715 	case X86_VENDOR_AMD:
3716 	case X86_VENDOR_HYGON:
3717 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3718 		    &cpi->cpi_ncore_per_chip);
3719 		break;
3720 	default:
3721 		/*
3722 		 * If we have some other x86 compatible chip, it's not clear how
3723 		 * they would behave. The most common case is virtualization
3724 		 * today, though there are also 64-bit VIA chips. Assume that
3725 		 * all we can get is the basic Leaf 1 HTT information.
3726 		 */
3727 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3728 			cpi->cpi_ncore_per_chip = 1;
3729 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3730 		}
3731 		break;
3732 	}
3733 
3734 	/*
3735 	 * Based on the calculated number of threads and cores, potentially
3736 	 * assign the HTT and CMT features.
3737 	 */
3738 	if (cpi->cpi_ncore_per_chip > 1) {
3739 		add_x86_feature(featureset, X86FSET_CMP);
3740 	}
3741 
3742 	if (cpi->cpi_ncpu_per_chip > 1 &&
3743 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3744 		add_x86_feature(featureset, X86FSET_HTT);
3745 	}
3746 
3747 	/*
3748 	 * Now that has been set up, we need to go through and calculate all of
3749 	 * the rest of the parameters that exist. If we think the CPU doesn't
3750 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3751 	 * up information in some way. The most likely case for this is
3752 	 * virtualization where we have a lot of partial topology information.
3753 	 */
3754 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3755 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3756 		/*
3757 		 * This is a single core, single-threaded processor.
3758 		 */
3759 		cpi->cpi_procnodes_per_pkg = 1;
3760 		cpi->cpi_cores_per_compunit = 1;
3761 		cpi->cpi_compunitid = 0;
3762 		cpi->cpi_chipid = -1;
3763 		cpi->cpi_clogid = 0;
3764 		cpi->cpi_coreid = cpu->cpu_id;
3765 		cpi->cpi_pkgcoreid = 0;
3766 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3767 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3768 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3769 		} else {
3770 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3771 		}
3772 	} else {
3773 		switch (cpi->cpi_vendor) {
3774 		case X86_VENDOR_Intel:
3775 			cpuid_intel_getids(cpu, featureset);
3776 			break;
3777 		case X86_VENDOR_AMD:
3778 		case X86_VENDOR_HYGON:
3779 			cpuid_amd_getids(cpu, featureset);
3780 			break;
3781 		default:
3782 			/*
3783 			 * In this case, it's hard to say what we should do.
3784 			 * We're going to model them to the OS as single core
3785 			 * threads. We don't have a good identifier for them, so
3786 			 * we're just going to use the cpu id all on a single
3787 			 * chip.
3788 			 *
3789 			 * This case has historically been different from the
3790 			 * case above where we don't have HTT or CMP. While they
3791 			 * could be combined, we've opted to keep it separate to
3792 			 * minimize the risk of topology changes in weird cases.
3793 			 */
3794 			cpi->cpi_procnodes_per_pkg = 1;
3795 			cpi->cpi_cores_per_compunit = 1;
3796 			cpi->cpi_chipid = 0;
3797 			cpi->cpi_coreid = cpu->cpu_id;
3798 			cpi->cpi_clogid = cpu->cpu_id;
3799 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3800 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3801 			cpi->cpi_compunitid = cpi->cpi_coreid;
3802 			break;
3803 		}
3804 	}
3805 }
3806 
3807 /*
3808  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3809  * always gather leaf 6 if it's supported; however, we only look for features on
3810  * Intel systems as AMD does not currently define any of the features we look
3811  * for below.
3812  */
3813 static void
cpuid_basic_thermal(cpu_t * cpu,uchar_t * featureset)3814 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3815 {
3816 	struct cpuid_regs *cp;
3817 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3818 
3819 	if (cpi->cpi_maxeax < 6) {
3820 		return;
3821 	}
3822 
3823 	cp = &cpi->cpi_std[6];
3824 	cp->cp_eax = 6;
3825 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3826 	(void) __cpuid_insn(cp);
3827 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3828 
3829 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3830 		return;
3831 	}
3832 
3833 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3834 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3835 	}
3836 
3837 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3838 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3839 	}
3840 }
3841 
3842 /*
3843  * This is used when we discover that we have AVX support in cpuid. This
3844  * proceeds to scan for the rest of the AVX derived features.
3845  */
3846 static void
cpuid_basic_avx(cpu_t * cpu,uchar_t * featureset)3847 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3848 {
3849 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3850 
3851 	/*
3852 	 * If we don't have AVX, don't bother with most of this.
3853 	 */
3854 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3855 		return;
3856 
3857 	add_x86_feature(featureset, X86FSET_AVX);
3858 
3859 	/*
3860 	 * Intel says we can't check these without also
3861 	 * checking AVX.
3862 	 */
3863 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3864 		add_x86_feature(featureset, X86FSET_F16C);
3865 
3866 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3867 		add_x86_feature(featureset, X86FSET_FMA);
3868 
3869 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3870 		add_x86_feature(featureset, X86FSET_BMI1);
3871 
3872 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3873 		add_x86_feature(featureset, X86FSET_BMI2);
3874 
3875 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3876 		add_x86_feature(featureset, X86FSET_AVX2);
3877 
3878 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3879 		add_x86_feature(featureset, X86FSET_VAES);
3880 
3881 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3882 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3883 
3884 	/*
3885 	 * The rest of the AVX features require AVX512. Do not check them unless
3886 	 * it is present.
3887 	 */
3888 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3889 		return;
3890 	add_x86_feature(featureset, X86FSET_AVX512F);
3891 
3892 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3893 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3894 
3895 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3896 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3897 
3898 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3899 		add_x86_feature(featureset, X86FSET_AVX512PF);
3900 
3901 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3902 		add_x86_feature(featureset, X86FSET_AVX512ER);
3903 
3904 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3905 		add_x86_feature(featureset, X86FSET_AVX512CD);
3906 
3907 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3908 		add_x86_feature(featureset, X86FSET_AVX512BW);
3909 
3910 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3911 		add_x86_feature(featureset, X86FSET_AVX512VL);
3912 
3913 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3914 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3915 
3916 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3917 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3918 
3919 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3920 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3921 
3922 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3923 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3924 
3925 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3926 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3927 
3928 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3929 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3930 
3931 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3932 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3933 
3934 	/*
3935 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3936 	 * we don't need to.
3937 	 */
3938 	if (cpi->cpi_std[7].cp_eax < 1)
3939 		return;
3940 
3941 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3942 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3943 }
3944 
3945 /*
3946  * PPIN is the protected processor inventory number. On AMD this is an actual
3947  * feature bit. However, on Intel systems we need to read the platform
3948  * information MSR if we're on a specific model.
3949  */
3950 #if !defined(__xpv)
3951 static void
cpuid_basic_ppin(cpu_t * cpu,uchar_t * featureset)3952 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3953 {
3954 	on_trap_data_t otd;
3955 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3956 
3957 	switch (cpi->cpi_vendor) {
3958 	case X86_VENDOR_AMD:
3959 		/*
3960 		 * This leaf will have already been gathered in the topology
3961 		 * functions.
3962 		 */
3963 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3964 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3965 				add_x86_feature(featureset, X86FSET_PPIN);
3966 			}
3967 		}
3968 		break;
3969 	case X86_VENDOR_Intel:
3970 		if (cpi->cpi_family != 6)
3971 			break;
3972 		switch (cpi->cpi_model) {
3973 		case INTC_MODEL_IVYBRIDGE_XEON:
3974 		case INTC_MODEL_HASWELL_XEON:
3975 		case INTC_MODEL_BROADWELL_XEON:
3976 		case INTC_MODEL_BROADWELL_XEON_D:
3977 		case INTC_MODEL_SKYLAKE_XEON:
3978 		case INTC_MODEL_ICELAKE_XEON:
3979 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3980 				uint64_t value;
3981 
3982 				value = rdmsr(MSR_PLATFORM_INFO);
3983 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3984 					add_x86_feature(featureset,
3985 					    X86FSET_PPIN);
3986 				}
3987 			}
3988 			no_trap();
3989 			break;
3990 		default:
3991 			break;
3992 		}
3993 		break;
3994 	default:
3995 		break;
3996 	}
3997 }
3998 #endif	/* ! __xpv */
3999 
4000 static void
cpuid_pass_prelude(cpu_t * cpu,void * arg)4001 cpuid_pass_prelude(cpu_t *cpu, void *arg)
4002 {
4003 	uchar_t *featureset = (uchar_t *)arg;
4004 
4005 	/*
4006 	 * We don't run on any processor that doesn't have cpuid, and could not
4007 	 * possibly have arrived here.
4008 	 */
4009 	add_x86_feature(featureset, X86FSET_CPUID);
4010 }
4011 
4012 static void
cpuid_pass_ident(cpu_t * cpu,void * arg __unused)4013 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
4014 {
4015 	struct cpuid_info *cpi;
4016 	struct cpuid_regs *cp;
4017 
4018 	/*
4019 	 * We require that virtual/native detection be complete and that PCI
4020 	 * config space access has been set up; at present there is no reliable
4021 	 * way to determine the latter.
4022 	 */
4023 #if !defined(__xpv)
4024 	ASSERT3S(platform_type, !=, -1);
4025 #endif	/* !__xpv */
4026 
4027 	cpi = cpu->cpu_m.mcpu_cpi;
4028 	ASSERT(cpi != NULL);
4029 
4030 	cp = &cpi->cpi_std[0];
4031 	cp->cp_eax = 0;
4032 	cpi->cpi_maxeax = __cpuid_insn(cp);
4033 	{
4034 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
4035 		*iptr++ = cp->cp_ebx;
4036 		*iptr++ = cp->cp_edx;
4037 		*iptr++ = cp->cp_ecx;
4038 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
4039 	}
4040 
4041 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
4042 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
4043 
4044 	/*
4045 	 * Limit the range in case of weird hardware
4046 	 */
4047 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
4048 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
4049 	if (cpi->cpi_maxeax < 1)
4050 		return;
4051 
4052 	cp = &cpi->cpi_std[1];
4053 	cp->cp_eax = 1;
4054 	(void) __cpuid_insn(cp);
4055 
4056 	/*
4057 	 * Extract identifying constants for easy access.
4058 	 */
4059 	cpi->cpi_model = CPI_MODEL(cpi);
4060 	cpi->cpi_family = CPI_FAMILY(cpi);
4061 
4062 	if (cpi->cpi_family == 0xf)
4063 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
4064 
4065 	/*
4066 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
4067 	 * Intel, and presumably everyone else, uses model == 0xf, as
4068 	 * one would expect (max value means possible overflow).  Sigh.
4069 	 */
4070 
4071 	switch (cpi->cpi_vendor) {
4072 	case X86_VENDOR_Intel:
4073 		if (IS_EXTENDED_MODEL_INTEL(cpi))
4074 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4075 		break;
4076 	case X86_VENDOR_AMD:
4077 		if (CPI_FAMILY(cpi) == 0xf)
4078 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4079 		break;
4080 	case X86_VENDOR_HYGON:
4081 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4082 		break;
4083 	default:
4084 		if (cpi->cpi_model == 0xf)
4085 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
4086 		break;
4087 	}
4088 
4089 	cpi->cpi_step = CPI_STEP(cpi);
4090 	cpi->cpi_brandid = CPI_BRANDID(cpi);
4091 
4092 	/*
4093 	 * Synthesize chip "revision" and socket type
4094 	 */
4095 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4096 	    cpi->cpi_model, cpi->cpi_step);
4097 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4098 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4099 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4100 	    cpi->cpi_model, cpi->cpi_step);
4101 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
4102 	    cpi->cpi_model, cpi->cpi_step);
4103 }
4104 
4105 static void
cpuid_pass_basic(cpu_t * cpu,void * arg)4106 cpuid_pass_basic(cpu_t *cpu, void *arg)
4107 {
4108 	uchar_t *featureset = (uchar_t *)arg;
4109 	uint32_t mask_ecx, mask_edx;
4110 	struct cpuid_info *cpi;
4111 	struct cpuid_regs *cp;
4112 	int xcpuid;
4113 #if !defined(__xpv)
4114 	extern int idle_cpu_prefer_mwait;
4115 #endif
4116 
4117 	cpi = cpu->cpu_m.mcpu_cpi;
4118 	ASSERT(cpi != NULL);
4119 
4120 	if (cpi->cpi_maxeax < 1)
4121 		return;
4122 
4123 	/*
4124 	 * This was filled during the identification pass.
4125 	 */
4126 	cp = &cpi->cpi_std[1];
4127 
4128 	/*
4129 	 * *default* assumptions:
4130 	 * - believe %edx feature word
4131 	 * - ignore %ecx feature word
4132 	 * - 32-bit virtual and physical addressing
4133 	 */
4134 	mask_edx = 0xffffffff;
4135 	mask_ecx = 0;
4136 
4137 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
4138 
4139 	switch (cpi->cpi_vendor) {
4140 	case X86_VENDOR_Intel:
4141 		if (cpi->cpi_family == 5)
4142 			x86_type = X86_TYPE_P5;
4143 		else if (IS_LEGACY_P6(cpi)) {
4144 			x86_type = X86_TYPE_P6;
4145 			pentiumpro_bug4046376 = 1;
4146 			/*
4147 			 * Clear the SEP bit when it was set erroneously
4148 			 */
4149 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
4150 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
4151 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
4152 			x86_type = X86_TYPE_P4;
4153 			/*
4154 			 * We don't currently depend on any of the %ecx
4155 			 * features until Prescott, so we'll only check
4156 			 * this from P4 onwards.  We might want to revisit
4157 			 * that idea later.
4158 			 */
4159 			mask_ecx = 0xffffffff;
4160 		} else if (cpi->cpi_family > 0xf)
4161 			mask_ecx = 0xffffffff;
4162 		/*
4163 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4164 		 * to obtain the monitor linesize.
4165 		 */
4166 		if (cpi->cpi_maxeax < 5)
4167 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4168 		break;
4169 	case X86_VENDOR_IntelClone:
4170 	default:
4171 		break;
4172 	case X86_VENDOR_AMD:
4173 #if defined(OPTERON_ERRATUM_108)
4174 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
4175 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
4176 			cpi->cpi_model = 0xc;
4177 		} else
4178 #endif
4179 		if (cpi->cpi_family == 5) {
4180 			/*
4181 			 * AMD K5 and K6
4182 			 *
4183 			 * These CPUs have an incomplete implementation
4184 			 * of MCA/MCE which we mask away.
4185 			 */
4186 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
4187 
4188 			/*
4189 			 * Model 0 uses the wrong (APIC) bit
4190 			 * to indicate PGE.  Fix it here.
4191 			 */
4192 			if (cpi->cpi_model == 0) {
4193 				if (cp->cp_edx & 0x200) {
4194 					cp->cp_edx &= ~0x200;
4195 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
4196 				}
4197 			}
4198 
4199 			/*
4200 			 * Early models had problems w/ MMX; disable.
4201 			 */
4202 			if (cpi->cpi_model < 6)
4203 				mask_edx &= ~CPUID_INTC_EDX_MMX;
4204 		}
4205 
4206 		/*
4207 		 * For newer families, SSE3 and CX16, at least, are valid;
4208 		 * enable all
4209 		 */
4210 		if (cpi->cpi_family >= 0xf)
4211 			mask_ecx = 0xffffffff;
4212 		/*
4213 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4214 		 * to obtain the monitor linesize.
4215 		 */
4216 		if (cpi->cpi_maxeax < 5)
4217 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4218 
4219 #if !defined(__xpv)
4220 		/*
4221 		 * AMD has not historically used MWAIT in the CPU's idle loop.
4222 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
4223 		 * know for certain that in at least family 17h, per AMD, mwait
4224 		 * is preferred. Families in-between are less certain.
4225 		 */
4226 		if (cpi->cpi_family < 0x17) {
4227 			idle_cpu_prefer_mwait = 0;
4228 		}
4229 #endif
4230 
4231 		break;
4232 	case X86_VENDOR_HYGON:
4233 		/* Enable all for Hygon Dhyana CPU */
4234 		mask_ecx = 0xffffffff;
4235 		break;
4236 	case X86_VENDOR_TM:
4237 		/*
4238 		 * workaround the NT workaround in CMS 4.1
4239 		 */
4240 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
4241 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
4242 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4243 		break;
4244 	case X86_VENDOR_Centaur:
4245 		/*
4246 		 * workaround the NT workarounds again
4247 		 */
4248 		if (cpi->cpi_family == 6)
4249 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4250 		break;
4251 	case X86_VENDOR_Cyrix:
4252 		/*
4253 		 * We rely heavily on the probing in locore
4254 		 * to actually figure out what parts, if any,
4255 		 * of the Cyrix cpuid instruction to believe.
4256 		 */
4257 		switch (x86_type) {
4258 		case X86_TYPE_CYRIX_486:
4259 			mask_edx = 0;
4260 			break;
4261 		case X86_TYPE_CYRIX_6x86:
4262 			mask_edx = 0;
4263 			break;
4264 		case X86_TYPE_CYRIX_6x86L:
4265 			mask_edx =
4266 			    CPUID_INTC_EDX_DE |
4267 			    CPUID_INTC_EDX_CX8;
4268 			break;
4269 		case X86_TYPE_CYRIX_6x86MX:
4270 			mask_edx =
4271 			    CPUID_INTC_EDX_DE |
4272 			    CPUID_INTC_EDX_MSR |
4273 			    CPUID_INTC_EDX_CX8 |
4274 			    CPUID_INTC_EDX_PGE |
4275 			    CPUID_INTC_EDX_CMOV |
4276 			    CPUID_INTC_EDX_MMX;
4277 			break;
4278 		case X86_TYPE_CYRIX_GXm:
4279 			mask_edx =
4280 			    CPUID_INTC_EDX_MSR |
4281 			    CPUID_INTC_EDX_CX8 |
4282 			    CPUID_INTC_EDX_CMOV |
4283 			    CPUID_INTC_EDX_MMX;
4284 			break;
4285 		case X86_TYPE_CYRIX_MediaGX:
4286 			break;
4287 		case X86_TYPE_CYRIX_MII:
4288 		case X86_TYPE_VIA_CYRIX_III:
4289 			mask_edx =
4290 			    CPUID_INTC_EDX_DE |
4291 			    CPUID_INTC_EDX_TSC |
4292 			    CPUID_INTC_EDX_MSR |
4293 			    CPUID_INTC_EDX_CX8 |
4294 			    CPUID_INTC_EDX_PGE |
4295 			    CPUID_INTC_EDX_CMOV |
4296 			    CPUID_INTC_EDX_MMX;
4297 			break;
4298 		default:
4299 			break;
4300 		}
4301 		break;
4302 	}
4303 
4304 #if defined(__xpv)
4305 	/*
4306 	 * Do not support MONITOR/MWAIT under a hypervisor
4307 	 */
4308 	mask_ecx &= ~CPUID_INTC_ECX_MON;
4309 	/*
4310 	 * Do not support XSAVE under a hypervisor for now
4311 	 */
4312 	xsave_force_disable = B_TRUE;
4313 
4314 #endif	/* __xpv */
4315 
4316 	if (xsave_force_disable) {
4317 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4318 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
4319 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
4320 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
4321 	}
4322 
4323 	/*
4324 	 * Now we've figured out the masks that determine
4325 	 * which bits we choose to believe, apply the masks
4326 	 * to the feature words, then map the kernel's view
4327 	 * of these feature words into its feature word.
4328 	 */
4329 	cp->cp_edx &= mask_edx;
4330 	cp->cp_ecx &= mask_ecx;
4331 
4332 	/*
4333 	 * apply any platform restrictions (we don't call this
4334 	 * immediately after __cpuid_insn here, because we need the
4335 	 * workarounds applied above first)
4336 	 */
4337 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4338 
4339 	/*
4340 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4341 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4342 	 * 7 has sub-leaves determined by ecx.
4343 	 */
4344 	if (cpi->cpi_maxeax >= 7) {
4345 		struct cpuid_regs *ecp;
4346 		ecp = &cpi->cpi_std[7];
4347 		ecp->cp_eax = 7;
4348 		ecp->cp_ecx = 0;
4349 		(void) __cpuid_insn(ecp);
4350 
4351 		/*
4352 		 * If XSAVE has been disabled, just ignore all of the
4353 		 * extended-save-area dependent flags here. By removing most of
4354 		 * the leaf 7, sub-leaf 0 flags, that will ensure that we don't
4355 		 * end up looking at additional xsave dependent leaves right
4356 		 * now.
4357 		 */
4358 		if (xsave_force_disable) {
4359 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4360 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4361 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4362 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4363 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4364 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4365 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4366 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4367 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4368 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4369 		}
4370 
4371 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4372 			add_x86_feature(featureset, X86FSET_SMEP);
4373 
4374 		/*
4375 		 * We check disable_smap here in addition to in startup_smap()
4376 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4377 		 * include it in the feature set and thus generate a mismatched
4378 		 * x86 feature set across CPUs.
4379 		 */
4380 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4381 		    disable_smap == 0)
4382 			add_x86_feature(featureset, X86FSET_SMAP);
4383 
4384 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) {
4385 			add_x86_feature(featureset, X86FSET_RDSEED);
4386 			if (cpi->cpi_vendor == X86_VENDOR_AMD)
4387 				cpuid_evaluate_amd_rdseed(cpu, featureset);
4388 		}
4389 
4390 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4391 			add_x86_feature(featureset, X86FSET_ADX);
4392 
4393 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4394 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4395 
4396 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4397 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4398 
4399 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4400 			add_x86_feature(featureset, X86FSET_INVPCID);
4401 
4402 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4403 			add_x86_feature(featureset, X86FSET_UMIP);
4404 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4405 			add_x86_feature(featureset, X86FSET_PKU);
4406 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4407 			add_x86_feature(featureset, X86FSET_OSPKE);
4408 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4409 			add_x86_feature(featureset, X86FSET_GFNI);
4410 
4411 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4412 			add_x86_feature(featureset, X86FSET_CLWB);
4413 
4414 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4415 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4416 				add_x86_feature(featureset, X86FSET_MPX);
4417 		}
4418 
4419 		/*
4420 		 * If we have subleaf 1 or 2 available, grab and store
4421 		 * that. This is used for more AVX and related features.
4422 		 */
4423 		if (ecp->cp_eax >= 1) {
4424 			struct cpuid_regs *c71;
4425 			c71 = &cpi->cpi_sub7[0];
4426 			c71->cp_eax = 7;
4427 			c71->cp_ecx = 1;
4428 			(void) __cpuid_insn(c71);
4429 		}
4430 
4431 		/* Subleaf 2 has certain security indicators in it. */
4432 		if (ecp->cp_eax >= 2) {
4433 			struct cpuid_regs *c72;
4434 			c72 = &cpi->cpi_sub7[1];
4435 			c72->cp_eax = 7;
4436 			c72->cp_ecx = 2;
4437 			(void) __cpuid_insn(c72);
4438 		}
4439 	}
4440 
4441 	/*
4442 	 * fold in overrides from the "eeprom" mechanism
4443 	 */
4444 	cp->cp_edx |= cpuid_feature_edx_include;
4445 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4446 
4447 	cp->cp_ecx |= cpuid_feature_ecx_include;
4448 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4449 
4450 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4451 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4452 	}
4453 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4454 		add_x86_feature(featureset, X86FSET_TSC);
4455 	}
4456 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4457 		add_x86_feature(featureset, X86FSET_MSR);
4458 	}
4459 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4460 		add_x86_feature(featureset, X86FSET_MTRR);
4461 	}
4462 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4463 		add_x86_feature(featureset, X86FSET_PGE);
4464 	}
4465 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4466 		add_x86_feature(featureset, X86FSET_CMOV);
4467 	}
4468 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4469 		add_x86_feature(featureset, X86FSET_MMX);
4470 	}
4471 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4472 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4473 		add_x86_feature(featureset, X86FSET_MCA);
4474 	}
4475 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4476 		add_x86_feature(featureset, X86FSET_PAE);
4477 	}
4478 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4479 		add_x86_feature(featureset, X86FSET_CX8);
4480 	}
4481 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4482 		add_x86_feature(featureset, X86FSET_CX16);
4483 	}
4484 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4485 		add_x86_feature(featureset, X86FSET_PAT);
4486 	}
4487 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4488 		add_x86_feature(featureset, X86FSET_SEP);
4489 	}
4490 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4491 		/*
4492 		 * In our implementation, fxsave/fxrstor
4493 		 * are prerequisites before we'll even
4494 		 * try and do SSE things.
4495 		 */
4496 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4497 			add_x86_feature(featureset, X86FSET_SSE);
4498 		}
4499 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4500 			add_x86_feature(featureset, X86FSET_SSE2);
4501 		}
4502 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4503 			add_x86_feature(featureset, X86FSET_SSE3);
4504 		}
4505 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4506 			add_x86_feature(featureset, X86FSET_SSSE3);
4507 		}
4508 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4509 			add_x86_feature(featureset, X86FSET_SSE4_1);
4510 		}
4511 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4512 			add_x86_feature(featureset, X86FSET_SSE4_2);
4513 		}
4514 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4515 			add_x86_feature(featureset, X86FSET_AES);
4516 		}
4517 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4518 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4519 		}
4520 
4521 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4522 			add_x86_feature(featureset, X86FSET_SHA);
4523 
4524 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4525 			add_x86_feature(featureset, X86FSET_XSAVE);
4526 
4527 			/* We only test AVX & AVX512 when there is XSAVE */
4528 			cpuid_basic_avx(cpu, featureset);
4529 		}
4530 	}
4531 
4532 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4533 		add_x86_feature(featureset, X86FSET_PCID);
4534 	}
4535 
4536 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4537 		add_x86_feature(featureset, X86FSET_X2APIC);
4538 	}
4539 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4540 		add_x86_feature(featureset, X86FSET_DE);
4541 	}
4542 #if !defined(__xpv)
4543 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4544 
4545 		/*
4546 		 * We require the CLFLUSH instruction for erratum workaround
4547 		 * to use MONITOR/MWAIT.
4548 		 */
4549 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4550 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4551 			add_x86_feature(featureset, X86FSET_MWAIT);
4552 		} else {
4553 			extern int idle_cpu_assert_cflush_monitor;
4554 
4555 			/*
4556 			 * All processors we are aware of which have
4557 			 * MONITOR/MWAIT also have CLFLUSH.
4558 			 */
4559 			if (idle_cpu_assert_cflush_monitor) {
4560 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4561 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4562 			}
4563 		}
4564 	}
4565 #endif	/* __xpv */
4566 
4567 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4568 		add_x86_feature(featureset, X86FSET_VMX);
4569 	}
4570 
4571 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4572 		add_x86_feature(featureset, X86FSET_RDRAND);
4573 
4574 	/*
4575 	 * Only need it first time, rest of the cpus would follow suit.
4576 	 * we only capture this for the bootcpu.
4577 	 */
4578 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4579 		add_x86_feature(featureset, X86FSET_CLFSH);
4580 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4581 	}
4582 	if (is_x86_feature(featureset, X86FSET_PAE))
4583 		cpi->cpi_pabits = 36;
4584 
4585 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4586 		struct cpuid_regs r, *ecp;
4587 
4588 		ecp = &r;
4589 		ecp->cp_eax = 0xD;
4590 		ecp->cp_ecx = 1;
4591 		ecp->cp_edx = ecp->cp_ebx = 0;
4592 		(void) __cpuid_insn(ecp);
4593 
4594 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4595 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4596 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4597 			add_x86_feature(featureset, X86FSET_XSAVEC);
4598 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4599 			add_x86_feature(featureset, X86FSET_XSAVES);
4600 
4601 		/*
4602 		 * Zen 2 family processors suffer from erratum 1386 that causes
4603 		 * xsaves to not function correctly in some circumstances. There
4604 		 * are no supervisor states in Zen 2 and earlier. Practically
4605 		 * speaking this has no impact for us as we currently do not
4606 		 * leverage compressed xsave formats. To safeguard against
4607 		 * issues in the future where we may opt to using it, we remove
4608 		 * it from the feature set now. While Matisse has a microcode
4609 		 * update available with a fix, not all Zen 2 CPUs do so it's
4610 		 * simpler for the moment to unconditionally remove it.
4611 		 */
4612 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4613 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4614 			remove_x86_feature(featureset, X86FSET_XSAVES);
4615 		}
4616 	}
4617 
4618 	/*
4619 	 * Work on the "extended" feature information, doing
4620 	 * some basic initialization to be used in the extended pass.
4621 	 */
4622 	xcpuid = 0;
4623 	switch (cpi->cpi_vendor) {
4624 	case X86_VENDOR_Intel:
4625 		/*
4626 		 * On KVM we know we will have proper support for extended
4627 		 * cpuid.
4628 		 */
4629 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4630 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4631 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4632 			xcpuid++;
4633 		break;
4634 	case X86_VENDOR_AMD:
4635 		if (cpi->cpi_family > 5 ||
4636 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4637 			xcpuid++;
4638 		break;
4639 	case X86_VENDOR_Cyrix:
4640 		/*
4641 		 * Only these Cyrix CPUs are -known- to support
4642 		 * extended cpuid operations.
4643 		 */
4644 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4645 		    x86_type == X86_TYPE_CYRIX_GXm)
4646 			xcpuid++;
4647 		break;
4648 	case X86_VENDOR_HYGON:
4649 	case X86_VENDOR_Centaur:
4650 	case X86_VENDOR_TM:
4651 	default:
4652 		xcpuid++;
4653 		break;
4654 	}
4655 
4656 	if (xcpuid) {
4657 		cp = &cpi->cpi_extd[0];
4658 		cp->cp_eax = CPUID_LEAF_EXT_0;
4659 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4660 	}
4661 
4662 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4663 
4664 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4665 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4666 
4667 		switch (cpi->cpi_vendor) {
4668 		case X86_VENDOR_Intel:
4669 		case X86_VENDOR_AMD:
4670 		case X86_VENDOR_HYGON:
4671 			if (cpi->cpi_xmaxeax < 0x80000001)
4672 				break;
4673 			cp = &cpi->cpi_extd[1];
4674 			cp->cp_eax = 0x80000001;
4675 			(void) __cpuid_insn(cp);
4676 
4677 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4678 			    cpi->cpi_family == 5 &&
4679 			    cpi->cpi_model == 6 &&
4680 			    cpi->cpi_step == 6) {
4681 				/*
4682 				 * K6 model 6 uses bit 10 to indicate SYSC
4683 				 * Later models use bit 11. Fix it here.
4684 				 */
4685 				if (cp->cp_edx & 0x400) {
4686 					cp->cp_edx &= ~0x400;
4687 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4688 				}
4689 			}
4690 
4691 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4692 
4693 			/*
4694 			 * Compute the additions to the kernel's feature word.
4695 			 */
4696 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4697 				add_x86_feature(featureset, X86FSET_NX);
4698 			}
4699 
4700 			/*
4701 			 * Regardless whether or not we boot 64-bit,
4702 			 * we should have a way to identify whether
4703 			 * the CPU is capable of running 64-bit.
4704 			 */
4705 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4706 				add_x86_feature(featureset, X86FSET_64);
4707 			}
4708 
4709 			/* 1 GB large page - enable only for 64 bit kernel */
4710 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4711 				add_x86_feature(featureset, X86FSET_1GPG);
4712 			}
4713 
4714 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4715 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4716 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4717 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4718 				add_x86_feature(featureset, X86FSET_SSE4A);
4719 			}
4720 
4721 			/*
4722 			 * It's really tricky to support syscall/sysret in
4723 			 * the i386 kernel; we rely on sysenter/sysexit
4724 			 * instead.  In the amd64 kernel, things are -way-
4725 			 * better.
4726 			 */
4727 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4728 				add_x86_feature(featureset, X86FSET_ASYSC);
4729 			}
4730 
4731 			/*
4732 			 * While we're thinking about system calls, note
4733 			 * that AMD processors don't support sysenter
4734 			 * in long mode at all, so don't try to program them.
4735 			 */
4736 			if (x86_vendor == X86_VENDOR_AMD ||
4737 			    x86_vendor == X86_VENDOR_HYGON) {
4738 				remove_x86_feature(featureset, X86FSET_SEP);
4739 			}
4740 
4741 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4742 				add_x86_feature(featureset, X86FSET_TSCP);
4743 			}
4744 
4745 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4746 				add_x86_feature(featureset, X86FSET_SVM);
4747 			}
4748 
4749 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4750 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4751 			}
4752 
4753 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4754 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4755 			}
4756 
4757 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4758 				add_x86_feature(featureset, X86FSET_XOP);
4759 			}
4760 
4761 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4762 				add_x86_feature(featureset, X86FSET_FMA4);
4763 			}
4764 
4765 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4766 				add_x86_feature(featureset, X86FSET_TBM);
4767 			}
4768 
4769 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4770 				add_x86_feature(featureset, X86FSET_MONITORX);
4771 			}
4772 			break;
4773 		default:
4774 			break;
4775 		}
4776 
4777 		/*
4778 		 * Get CPUID data about processor cores and hyperthreads.
4779 		 */
4780 		switch (cpi->cpi_vendor) {
4781 		case X86_VENDOR_Intel:
4782 			if (cpi->cpi_maxeax >= 4) {
4783 				cp = &cpi->cpi_std[4];
4784 				cp->cp_eax = 4;
4785 				cp->cp_ecx = 0;
4786 				(void) __cpuid_insn(cp);
4787 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4788 			}
4789 			/*FALLTHROUGH*/
4790 		case X86_VENDOR_AMD:
4791 		case X86_VENDOR_HYGON:
4792 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4793 				break;
4794 			cp = &cpi->cpi_extd[8];
4795 			cp->cp_eax = CPUID_LEAF_EXT_8;
4796 			(void) __cpuid_insn(cp);
4797 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4798 			    cp);
4799 
4800 			/*
4801 			 * AMD uses ebx for some extended functions.
4802 			 */
4803 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4804 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4805 				/*
4806 				 * While we're here, check for the AMD "Error
4807 				 * Pointer Zero/Restore" feature. This can be
4808 				 * used to setup the FP save handlers
4809 				 * appropriately.
4810 				 */
4811 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4812 					cpi->cpi_fp_amd_save = 0;
4813 				} else {
4814 					cpi->cpi_fp_amd_save = 1;
4815 				}
4816 
4817 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4818 					add_x86_feature(featureset,
4819 					    X86FSET_CLZERO);
4820 				}
4821 			}
4822 
4823 			/*
4824 			 * Virtual and physical address limits from
4825 			 * cpuid override previously guessed values.
4826 			 */
4827 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4828 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4829 			break;
4830 		default:
4831 			break;
4832 		}
4833 
4834 		/*
4835 		 * Get CPUID data about TSC Invariance in Deep C-State.
4836 		 */
4837 		switch (cpi->cpi_vendor) {
4838 		case X86_VENDOR_Intel:
4839 		case X86_VENDOR_AMD:
4840 		case X86_VENDOR_HYGON:
4841 			if (cpi->cpi_maxeax >= 7) {
4842 				cp = &cpi->cpi_extd[7];
4843 				cp->cp_eax = 0x80000007;
4844 				cp->cp_ecx = 0;
4845 				(void) __cpuid_insn(cp);
4846 			}
4847 			break;
4848 		default:
4849 			break;
4850 		}
4851 	}
4852 
4853 	/*
4854 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4855 	 * run and thus gathered some of its dependent leaves.
4856 	 */
4857 	cpuid_basic_topology(cpu, featureset);
4858 	cpuid_basic_thermal(cpu, featureset);
4859 #if !defined(__xpv)
4860 	cpuid_basic_ppin(cpu, featureset);
4861 #endif
4862 
4863 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4864 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4865 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4866 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4867 			/* Special handling for AMD FP not necessary. */
4868 			cpi->cpi_fp_amd_save = 0;
4869 		} else {
4870 			cpi->cpi_fp_amd_save = 1;
4871 		}
4872 	}
4873 
4874 	/*
4875 	 * Check (and potentially set) if lfence is serializing.
4876 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4877 	 */
4878 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4879 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4880 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4881 		/*
4882 		 * The AMD white paper Software Techniques For Managing
4883 		 * Speculation on AMD Processors details circumstances for when
4884 		 * lfence instructions are serializing.
4885 		 *
4886 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4887 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4888 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4889 		 * committed to supporting that MSR on all later CPUs.
4890 		 */
4891 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4892 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4893 		} else if (cpi->cpi_family >= 0x10) {
4894 #if !defined(__xpv)
4895 			uint64_t val;
4896 
4897 			/*
4898 			 * Be careful when attempting to enable the bit, and
4899 			 * verify that it was actually set in case we are
4900 			 * running in a hypervisor which is less than faithful
4901 			 * about its emulation of this feature.
4902 			 */
4903 			on_trap_data_t otd;
4904 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4905 				val = rdmsr(MSR_AMD_DE_CFG);
4906 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4907 				wrmsr(MSR_AMD_DE_CFG, val);
4908 				val = rdmsr(MSR_AMD_DE_CFG);
4909 			} else {
4910 				val = 0;
4911 			}
4912 			no_trap();
4913 
4914 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4915 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4916 			}
4917 #endif
4918 		}
4919 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4920 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4921 		/*
4922 		 * Documentation and other OSes indicate that lfence is always
4923 		 * serializing on Intel CPUs.
4924 		 */
4925 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4926 	}
4927 
4928 
4929 	/*
4930 	 * Check the processor leaves that are used for security features. Grab
4931 	 * any additional processor-specific leaves that we may not have yet.
4932 	 */
4933 	switch (cpi->cpi_vendor) {
4934 	case X86_VENDOR_AMD:
4935 	case X86_VENDOR_HYGON:
4936 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4937 			cp = &cpi->cpi_extd[0x21];
4938 			cp->cp_eax = CPUID_LEAF_EXT_21;
4939 			cp->cp_ecx = 0;
4940 			(void) __cpuid_insn(cp);
4941 		}
4942 		break;
4943 	default:
4944 		break;
4945 	}
4946 
4947 	cpuid_scan_security(cpu, featureset);
4948 }
4949 
4950 /*
4951  * Make copies of the cpuid table entries we depend on, in
4952  * part for ease of parsing now, in part so that we have only
4953  * one place to correct any of it, in part for ease of
4954  * later export to userland, and in part so we can look at
4955  * this stuff in a crash dump.
4956  */
4957 
4958 static void
cpuid_pass_extended(cpu_t * cpu,void * _arg __unused)4959 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4960 {
4961 	uint_t n, nmax;
4962 	int i;
4963 	struct cpuid_regs *cp;
4964 	uint8_t *dp;
4965 	uint32_t *iptr;
4966 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4967 
4968 	if (cpi->cpi_maxeax < 1)
4969 		return;
4970 
4971 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4972 		nmax = NMAX_CPI_STD;
4973 	/*
4974 	 * (We already handled n == 0 and n == 1 in the basic pass)
4975 	 */
4976 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4977 		/*
4978 		 * leaves 6 and 7 were handled in the basic pass
4979 		 */
4980 		if (n == 6 || n == 7)
4981 			continue;
4982 
4983 		cp->cp_eax = n;
4984 
4985 		/*
4986 		 * CPUID function 4 expects %ecx to be initialized
4987 		 * with an index which indicates which cache to return
4988 		 * information about. The OS is expected to call function 4
4989 		 * with %ecx set to 0, 1, 2, ... until it returns with
4990 		 * EAX[4:0] set to 0, which indicates there are no more
4991 		 * caches.
4992 		 *
4993 		 * Here, populate cpi_std[4] with the information returned by
4994 		 * function 4 when %ecx == 0, and do the rest in a later pass
4995 		 * when dynamic memory allocation becomes available.
4996 		 *
4997 		 * Note: we need to explicitly initialize %ecx here, since
4998 		 * function 4 may have been previously invoked.
4999 		 */
5000 		if (n == 4)
5001 			cp->cp_ecx = 0;
5002 
5003 		(void) __cpuid_insn(cp);
5004 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
5005 		switch (n) {
5006 		case 2:
5007 			/*
5008 			 * "the lower 8 bits of the %eax register
5009 			 * contain a value that identifies the number
5010 			 * of times the cpuid [instruction] has to be
5011 			 * executed to obtain a complete image of the
5012 			 * processor's caching systems."
5013 			 *
5014 			 * How *do* they make this stuff up?
5015 			 */
5016 			cpi->cpi_ncache = sizeof (*cp) *
5017 			    BITX(cp->cp_eax, 7, 0);
5018 			if (cpi->cpi_ncache == 0)
5019 				break;
5020 			cpi->cpi_ncache--;	/* skip count byte */
5021 
5022 			/*
5023 			 * Well, for now, rather than attempt to implement
5024 			 * this slightly dubious algorithm, we just look
5025 			 * at the first 15 ..
5026 			 */
5027 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
5028 				cpi->cpi_ncache = sizeof (*cp) - 1;
5029 
5030 			dp = cpi->cpi_cacheinfo;
5031 			if (BITX(cp->cp_eax, 31, 31) == 0) {
5032 				uint8_t *p = (void *)&cp->cp_eax;
5033 				for (i = 1; i < 4; i++)
5034 					if (p[i] != 0)
5035 						*dp++ = p[i];
5036 			}
5037 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
5038 				uint8_t *p = (void *)&cp->cp_ebx;
5039 				for (i = 0; i < 4; i++)
5040 					if (p[i] != 0)
5041 						*dp++ = p[i];
5042 			}
5043 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
5044 				uint8_t *p = (void *)&cp->cp_ecx;
5045 				for (i = 0; i < 4; i++)
5046 					if (p[i] != 0)
5047 						*dp++ = p[i];
5048 			}
5049 			if (BITX(cp->cp_edx, 31, 31) == 0) {
5050 				uint8_t *p = (void *)&cp->cp_edx;
5051 				for (i = 0; i < 4; i++)
5052 					if (p[i] != 0)
5053 						*dp++ = p[i];
5054 			}
5055 			break;
5056 
5057 		case 3:	/* Processor serial number, if PSN supported */
5058 			break;
5059 
5060 		case 4:	/* Deterministic cache parameters */
5061 			break;
5062 
5063 		case 5:	/* Monitor/Mwait parameters */
5064 		{
5065 			size_t mwait_size;
5066 
5067 			/*
5068 			 * check cpi_mwait.support which was set in
5069 			 * cpuid_pass_basic()
5070 			 */
5071 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
5072 				break;
5073 
5074 			/*
5075 			 * Protect ourself from insane mwait line size.
5076 			 * Workaround for incomplete hardware emulator(s).
5077 			 */
5078 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
5079 			if (mwait_size < sizeof (uint32_t) ||
5080 			    !ISP2(mwait_size)) {
5081 #if DEBUG
5082 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
5083 				    "size %ld", cpu->cpu_id, (long)mwait_size);
5084 #endif
5085 				break;
5086 			}
5087 
5088 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
5089 			cpi->cpi_mwait.mon_max = mwait_size;
5090 			if (MWAIT_EXTENSION(cpi)) {
5091 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
5092 				if (MWAIT_INT_ENABLE(cpi))
5093 					cpi->cpi_mwait.support |=
5094 					    MWAIT_ECX_INT_ENABLE;
5095 			}
5096 			break;
5097 		}
5098 		default:
5099 			break;
5100 		}
5101 	}
5102 
5103 	/*
5104 	 * XSAVE enumeration
5105 	 */
5106 	if (cpi->cpi_maxeax >= 0xD) {
5107 		struct cpuid_regs regs;
5108 		boolean_t cpuid_d_valid = B_TRUE;
5109 
5110 		cp = &regs;
5111 		cp->cp_eax = 0xD;
5112 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
5113 
5114 		(void) __cpuid_insn(cp);
5115 
5116 		/*
5117 		 * Sanity checks for debug
5118 		 */
5119 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
5120 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
5121 			cpuid_d_valid = B_FALSE;
5122 		}
5123 
5124 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
5125 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
5126 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
5127 
5128 		/*
5129 		 * If the hw supports AVX, get the size and offset in the save
5130 		 * area for the ymm state.
5131 		 */
5132 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
5133 			cp->cp_eax = 0xD;
5134 			cp->cp_ecx = 2;
5135 			cp->cp_edx = cp->cp_ebx = 0;
5136 
5137 			(void) __cpuid_insn(cp);
5138 
5139 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
5140 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
5141 				cpuid_d_valid = B_FALSE;
5142 			}
5143 
5144 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
5145 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
5146 		}
5147 
5148 		/*
5149 		 * If the hw supports MPX, get the size and offset in the
5150 		 * save area for BNDREGS and BNDCSR.
5151 		 */
5152 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
5153 			cp->cp_eax = 0xD;
5154 			cp->cp_ecx = 3;
5155 			cp->cp_edx = cp->cp_ebx = 0;
5156 
5157 			(void) __cpuid_insn(cp);
5158 
5159 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
5160 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
5161 
5162 			cp->cp_eax = 0xD;
5163 			cp->cp_ecx = 4;
5164 			cp->cp_edx = cp->cp_ebx = 0;
5165 
5166 			(void) __cpuid_insn(cp);
5167 
5168 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
5169 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
5170 		}
5171 
5172 		/*
5173 		 * If the hw supports AVX512, get the size and offset in the
5174 		 * save area for the opmask registers and zmm state.
5175 		 */
5176 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
5177 			cp->cp_eax = 0xD;
5178 			cp->cp_ecx = 5;
5179 			cp->cp_edx = cp->cp_ebx = 0;
5180 
5181 			(void) __cpuid_insn(cp);
5182 
5183 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
5184 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
5185 
5186 			cp->cp_eax = 0xD;
5187 			cp->cp_ecx = 6;
5188 			cp->cp_edx = cp->cp_ebx = 0;
5189 
5190 			(void) __cpuid_insn(cp);
5191 
5192 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
5193 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
5194 
5195 			cp->cp_eax = 0xD;
5196 			cp->cp_ecx = 7;
5197 			cp->cp_edx = cp->cp_ebx = 0;
5198 
5199 			(void) __cpuid_insn(cp);
5200 
5201 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
5202 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
5203 		}
5204 
5205 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_PKRU) {
5206 			cp->cp_eax = 0xD;
5207 			cp->cp_ecx = 9;
5208 			cp->cp_edx = cp->cp_ebx = 0;
5209 
5210 			(void) __cpuid_insn(cp);
5211 
5212 			cpi->cpi_xsave.pkru_size = cp->cp_eax;
5213 			cpi->cpi_xsave.pkru_offset = cp->cp_ebx;
5214 		}
5215 
5216 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
5217 			xsave_state_size = 0;
5218 		} else if (cpuid_d_valid) {
5219 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
5220 		} else {
5221 			/* Broken CPUID 0xD, probably in HVM */
5222 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
5223 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
5224 			    ", ymm_size = %d, ymm_offset = %d\n",
5225 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
5226 			    cpi->cpi_xsave.xsav_hw_features_high,
5227 			    (int)cpi->cpi_xsave.xsav_max_size,
5228 			    (int)cpi->cpi_xsave.ymm_size,
5229 			    (int)cpi->cpi_xsave.ymm_offset);
5230 
5231 			if (xsave_state_size != 0) {
5232 				/*
5233 				 * This must be a non-boot CPU. We cannot
5234 				 * continue, because boot cpu has already
5235 				 * enabled XSAVE.
5236 				 */
5237 				ASSERT(cpu->cpu_id != 0);
5238 				cmn_err(CE_PANIC, "cpu%d: we have already "
5239 				    "enabled XSAVE on boot cpu, cannot "
5240 				    "continue.", cpu->cpu_id);
5241 			} else {
5242 				/*
5243 				 * If we reached here on the boot CPU, it's also
5244 				 * almost certain that we'll reach here on the
5245 				 * non-boot CPUs. When we're here on a boot CPU
5246 				 * we should disable the feature, on a non-boot
5247 				 * CPU we need to confirm that we have.
5248 				 */
5249 				if (cpu->cpu_id == 0) {
5250 					remove_x86_feature(x86_featureset,
5251 					    X86FSET_XSAVE);
5252 					remove_x86_feature(x86_featureset,
5253 					    X86FSET_AVX);
5254 					remove_x86_feature(x86_featureset,
5255 					    X86FSET_F16C);
5256 					remove_x86_feature(x86_featureset,
5257 					    X86FSET_BMI1);
5258 					remove_x86_feature(x86_featureset,
5259 					    X86FSET_BMI2);
5260 					remove_x86_feature(x86_featureset,
5261 					    X86FSET_FMA);
5262 					remove_x86_feature(x86_featureset,
5263 					    X86FSET_AVX2);
5264 					remove_x86_feature(x86_featureset,
5265 					    X86FSET_MPX);
5266 					remove_x86_feature(x86_featureset,
5267 					    X86FSET_AVX512F);
5268 					remove_x86_feature(x86_featureset,
5269 					    X86FSET_AVX512DQ);
5270 					remove_x86_feature(x86_featureset,
5271 					    X86FSET_AVX512PF);
5272 					remove_x86_feature(x86_featureset,
5273 					    X86FSET_AVX512ER);
5274 					remove_x86_feature(x86_featureset,
5275 					    X86FSET_AVX512CD);
5276 					remove_x86_feature(x86_featureset,
5277 					    X86FSET_AVX512BW);
5278 					remove_x86_feature(x86_featureset,
5279 					    X86FSET_AVX512VL);
5280 					remove_x86_feature(x86_featureset,
5281 					    X86FSET_AVX512FMA);
5282 					remove_x86_feature(x86_featureset,
5283 					    X86FSET_AVX512VBMI);
5284 					remove_x86_feature(x86_featureset,
5285 					    X86FSET_AVX512VNNI);
5286 					remove_x86_feature(x86_featureset,
5287 					    X86FSET_AVX512VPOPCDQ);
5288 					remove_x86_feature(x86_featureset,
5289 					    X86FSET_AVX512NNIW);
5290 					remove_x86_feature(x86_featureset,
5291 					    X86FSET_AVX512FMAPS);
5292 					remove_x86_feature(x86_featureset,
5293 					    X86FSET_VAES);
5294 					remove_x86_feature(x86_featureset,
5295 					    X86FSET_VPCLMULQDQ);
5296 					remove_x86_feature(x86_featureset,
5297 					    X86FSET_GFNI);
5298 					remove_x86_feature(x86_featureset,
5299 					    X86FSET_AVX512_VP2INT);
5300 					remove_x86_feature(x86_featureset,
5301 					    X86FSET_AVX512_BITALG);
5302 					remove_x86_feature(x86_featureset,
5303 					    X86FSET_AVX512_VBMI2);
5304 					remove_x86_feature(x86_featureset,
5305 					    X86FSET_AVX512_BF16);
5306 
5307 					xsave_force_disable = B_TRUE;
5308 				} else {
5309 					VERIFY(is_x86_feature(x86_featureset,
5310 					    X86FSET_XSAVE) == B_FALSE);
5311 				}
5312 			}
5313 		}
5314 	}
5315 
5316 
5317 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5318 		return;
5319 
5320 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5321 		nmax = NMAX_CPI_EXTD;
5322 	/*
5323 	 * Copy the extended properties, fixing them as we go. While we start at
5324 	 * 2 because we've already handled a few cases in the basic pass, the
5325 	 * rest we let ourselves just grab again (e.g. 0x8, 0x21).
5326 	 */
5327 	iptr = (void *)cpi->cpi_brandstr;
5328 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5329 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5330 		(void) __cpuid_insn(cp);
5331 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5332 		    cp);
5333 		switch (n) {
5334 		case 2:
5335 		case 3:
5336 		case 4:
5337 			/*
5338 			 * Extract the brand string
5339 			 */
5340 			*iptr++ = cp->cp_eax;
5341 			*iptr++ = cp->cp_ebx;
5342 			*iptr++ = cp->cp_ecx;
5343 			*iptr++ = cp->cp_edx;
5344 			break;
5345 		case 5:
5346 			switch (cpi->cpi_vendor) {
5347 			case X86_VENDOR_AMD:
5348 				/*
5349 				 * The Athlon and Duron were the first
5350 				 * parts to report the sizes of the
5351 				 * TLB for large pages. Before then,
5352 				 * we don't trust the data.
5353 				 */
5354 				if (cpi->cpi_family < 6 ||
5355 				    (cpi->cpi_family == 6 &&
5356 				    cpi->cpi_model < 1))
5357 					cp->cp_eax = 0;
5358 				break;
5359 			default:
5360 				break;
5361 			}
5362 			break;
5363 		case 6:
5364 			switch (cpi->cpi_vendor) {
5365 			case X86_VENDOR_AMD:
5366 				/*
5367 				 * The Athlon and Duron were the first
5368 				 * AMD parts with L2 TLB's.
5369 				 * Before then, don't trust the data.
5370 				 */
5371 				if (cpi->cpi_family < 6 ||
5372 				    (cpi->cpi_family == 6 &&
5373 				    cpi->cpi_model < 1))
5374 					cp->cp_eax = cp->cp_ebx = 0;
5375 				/*
5376 				 * AMD Duron rev A0 reports L2
5377 				 * cache size incorrectly as 1K
5378 				 * when it is really 64K
5379 				 */
5380 				if (cpi->cpi_family == 6 &&
5381 				    cpi->cpi_model == 3 &&
5382 				    cpi->cpi_step == 0) {
5383 					cp->cp_ecx &= 0xffff;
5384 					cp->cp_ecx |= 0x400000;
5385 				}
5386 				break;
5387 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5388 				/*
5389 				 * VIA C3 processors are a bit messed
5390 				 * up w.r.t. encoding cache sizes in %ecx
5391 				 */
5392 				if (cpi->cpi_family != 6)
5393 					break;
5394 				/*
5395 				 * model 7 and 8 were incorrectly encoded
5396 				 *
5397 				 * xxx is model 8 really broken?
5398 				 */
5399 				if (cpi->cpi_model == 7 ||
5400 				    cpi->cpi_model == 8)
5401 					cp->cp_ecx =
5402 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5403 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5404 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5405 					    BITX(cp->cp_ecx, 7, 0);
5406 				/*
5407 				 * model 9 stepping 1 has wrong associativity
5408 				 */
5409 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5410 					cp->cp_ecx |= 8 << 12;
5411 				break;
5412 			case X86_VENDOR_Intel:
5413 				/*
5414 				 * Extended L2 Cache features function.
5415 				 * First appeared on Prescott.
5416 				 */
5417 			default:
5418 				break;
5419 			}
5420 			break;
5421 		default:
5422 			break;
5423 		}
5424 	}
5425 }
5426 
5427 static const char *
intel_cpubrand(const struct cpuid_info * cpi)5428 intel_cpubrand(const struct cpuid_info *cpi)
5429 {
5430 	int i;
5431 
5432 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5433 
5434 	switch (cpi->cpi_family) {
5435 	case 5:
5436 		return ("Intel Pentium(r)");
5437 	case 6:
5438 		switch (cpi->cpi_model) {
5439 			uint_t celeron, xeon;
5440 			const struct cpuid_regs *cp;
5441 		case 0:
5442 		case 1:
5443 		case 2:
5444 			return ("Intel Pentium(r) Pro");
5445 		case 3:
5446 		case 4:
5447 			return ("Intel Pentium(r) II");
5448 		case 6:
5449 			return ("Intel Celeron(r)");
5450 		case 5:
5451 		case 7:
5452 			celeron = xeon = 0;
5453 			cp = &cpi->cpi_std[2];	/* cache info */
5454 
5455 			for (i = 1; i < 4; i++) {
5456 				uint_t tmp;
5457 
5458 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5459 				if (tmp == 0x40)
5460 					celeron++;
5461 				if (tmp >= 0x44 && tmp <= 0x45)
5462 					xeon++;
5463 			}
5464 
5465 			for (i = 0; i < 2; i++) {
5466 				uint_t tmp;
5467 
5468 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5469 				if (tmp == 0x40)
5470 					celeron++;
5471 				else if (tmp >= 0x44 && tmp <= 0x45)
5472 					xeon++;
5473 			}
5474 
5475 			for (i = 0; i < 4; i++) {
5476 				uint_t tmp;
5477 
5478 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5479 				if (tmp == 0x40)
5480 					celeron++;
5481 				else if (tmp >= 0x44 && tmp <= 0x45)
5482 					xeon++;
5483 			}
5484 
5485 			for (i = 0; i < 4; i++) {
5486 				uint_t tmp;
5487 
5488 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5489 				if (tmp == 0x40)
5490 					celeron++;
5491 				else if (tmp >= 0x44 && tmp <= 0x45)
5492 					xeon++;
5493 			}
5494 
5495 			if (celeron)
5496 				return ("Intel Celeron(r)");
5497 			if (xeon)
5498 				return (cpi->cpi_model == 5 ?
5499 				    "Intel Pentium(r) II Xeon(tm)" :
5500 				    "Intel Pentium(r) III Xeon(tm)");
5501 			return (cpi->cpi_model == 5 ?
5502 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5503 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5504 		default:
5505 			break;
5506 		}
5507 	default:
5508 		break;
5509 	}
5510 
5511 	/* BrandID is present if the field is nonzero */
5512 	if (cpi->cpi_brandid != 0) {
5513 		static const struct {
5514 			uint_t bt_bid;
5515 			const char *bt_str;
5516 		} brand_tbl[] = {
5517 			{ 0x1,	"Intel(r) Celeron(r)" },
5518 			{ 0x2,	"Intel(r) Pentium(r) III" },
5519 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5520 			{ 0x4,	"Intel(r) Pentium(r) III" },
5521 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5522 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5523 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5524 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5525 			{ 0xa,	"Intel(r) Celeron(r)" },
5526 			{ 0xb,	"Intel(r) Xeon(tm)" },
5527 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5528 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5529 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5530 			{ 0x11, "Mobile Genuine Intel(r)" },
5531 			{ 0x12, "Intel(r) Celeron(r) M" },
5532 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5533 			{ 0x14, "Intel(r) Celeron(r)" },
5534 			{ 0x15, "Mobile Genuine Intel(r)" },
5535 			{ 0x16,	"Intel(r) Pentium(r) M" },
5536 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5537 		};
5538 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5539 		uint_t sgn;
5540 
5541 		sgn = (cpi->cpi_family << 8) |
5542 		    (cpi->cpi_model << 4) | cpi->cpi_step;
5543 
5544 		for (i = 0; i < btblmax; i++)
5545 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5546 				break;
5547 		if (i < btblmax) {
5548 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5549 				return ("Intel(r) Celeron(r)");
5550 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5551 				return ("Intel(r) Xeon(tm) MP");
5552 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5553 				return ("Intel(r) Xeon(tm)");
5554 			return (brand_tbl[i].bt_str);
5555 		}
5556 	}
5557 
5558 	return (NULL);
5559 }
5560 
5561 static const char *
amd_cpubrand(const struct cpuid_info * cpi)5562 amd_cpubrand(const struct cpuid_info *cpi)
5563 {
5564 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5565 
5566 	switch (cpi->cpi_family) {
5567 	case 5:
5568 		switch (cpi->cpi_model) {
5569 		case 0:
5570 		case 1:
5571 		case 2:
5572 		case 3:
5573 		case 4:
5574 		case 5:
5575 			return ("AMD-K5(r)");
5576 		case 6:
5577 		case 7:
5578 			return ("AMD-K6(r)");
5579 		case 8:
5580 			return ("AMD-K6(r)-2");
5581 		case 9:
5582 			return ("AMD-K6(r)-III");
5583 		default:
5584 			return ("AMD (family 5)");
5585 		}
5586 	case 6:
5587 		switch (cpi->cpi_model) {
5588 		case 1:
5589 			return ("AMD-K7(tm)");
5590 		case 0:
5591 		case 2:
5592 		case 4:
5593 			return ("AMD Athlon(tm)");
5594 		case 3:
5595 		case 7:
5596 			return ("AMD Duron(tm)");
5597 		case 6:
5598 		case 8:
5599 		case 10:
5600 			/*
5601 			 * Use the L2 cache size to distinguish
5602 			 */
5603 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5604 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5605 		default:
5606 			return ("AMD (family 6)");
5607 		}
5608 	default:
5609 		break;
5610 	}
5611 
5612 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5613 	    cpi->cpi_brandid != 0) {
5614 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5615 		case 3:
5616 			return ("AMD Opteron(tm) UP 1xx");
5617 		case 4:
5618 			return ("AMD Opteron(tm) DP 2xx");
5619 		case 5:
5620 			return ("AMD Opteron(tm) MP 8xx");
5621 		default:
5622 			return ("AMD Opteron(tm)");
5623 		}
5624 	}
5625 
5626 	return (NULL);
5627 }
5628 
5629 static const char *
cyrix_cpubrand(struct cpuid_info * cpi,uint_t type)5630 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5631 {
5632 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5633 
5634 	switch (type) {
5635 	case X86_TYPE_CYRIX_6x86:
5636 		return ("Cyrix 6x86");
5637 	case X86_TYPE_CYRIX_6x86L:
5638 		return ("Cyrix 6x86L");
5639 	case X86_TYPE_CYRIX_6x86MX:
5640 		return ("Cyrix 6x86MX");
5641 	case X86_TYPE_CYRIX_GXm:
5642 		return ("Cyrix GXm");
5643 	case X86_TYPE_CYRIX_MediaGX:
5644 		return ("Cyrix MediaGX");
5645 	case X86_TYPE_CYRIX_MII:
5646 		return ("Cyrix M2");
5647 	case X86_TYPE_VIA_CYRIX_III:
5648 		return ("VIA Cyrix M3");
5649 	default:
5650 		/*
5651 		 * Have another wild guess ..
5652 		 */
5653 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5654 			return ("Cyrix 5x86");
5655 		else if (cpi->cpi_family == 5) {
5656 			switch (cpi->cpi_model) {
5657 			case 2:
5658 				return ("Cyrix 6x86");	/* Cyrix M1 */
5659 			case 4:
5660 				return ("Cyrix MediaGX");
5661 			default:
5662 				break;
5663 			}
5664 		} else if (cpi->cpi_family == 6) {
5665 			switch (cpi->cpi_model) {
5666 			case 0:
5667 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5668 			case 5:
5669 			case 6:
5670 			case 7:
5671 			case 8:
5672 			case 9:
5673 				return ("VIA C3");
5674 			default:
5675 				break;
5676 			}
5677 		}
5678 		break;
5679 	}
5680 	return (NULL);
5681 }
5682 
5683 /*
5684  * This only gets called in the case that the CPU extended
5685  * feature brand string (0x80000002, 0x80000003, 0x80000004)
5686  * aren't available, or contain null bytes for some reason.
5687  */
5688 static void
fabricate_brandstr(struct cpuid_info * cpi)5689 fabricate_brandstr(struct cpuid_info *cpi)
5690 {
5691 	const char *brand = NULL;
5692 
5693 	switch (cpi->cpi_vendor) {
5694 	case X86_VENDOR_Intel:
5695 		brand = intel_cpubrand(cpi);
5696 		break;
5697 	case X86_VENDOR_AMD:
5698 		brand = amd_cpubrand(cpi);
5699 		break;
5700 	case X86_VENDOR_Cyrix:
5701 		brand = cyrix_cpubrand(cpi, x86_type);
5702 		break;
5703 	case X86_VENDOR_NexGen:
5704 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5705 			brand = "NexGen Nx586";
5706 		break;
5707 	case X86_VENDOR_Centaur:
5708 		if (cpi->cpi_family == 5)
5709 			switch (cpi->cpi_model) {
5710 			case 4:
5711 				brand = "Centaur C6";
5712 				break;
5713 			case 8:
5714 				brand = "Centaur C2";
5715 				break;
5716 			case 9:
5717 				brand = "Centaur C3";
5718 				break;
5719 			default:
5720 				break;
5721 			}
5722 		break;
5723 	case X86_VENDOR_Rise:
5724 		if (cpi->cpi_family == 5 &&
5725 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5726 			brand = "Rise mP6";
5727 		break;
5728 	case X86_VENDOR_SiS:
5729 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5730 			brand = "SiS 55x";
5731 		break;
5732 	case X86_VENDOR_TM:
5733 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5734 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5735 		break;
5736 	case X86_VENDOR_NSC:
5737 	case X86_VENDOR_UMC:
5738 	default:
5739 		break;
5740 	}
5741 	if (brand) {
5742 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5743 		return;
5744 	}
5745 
5746 	/*
5747 	 * If all else fails ...
5748 	 */
5749 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5750 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5751 	    cpi->cpi_model, cpi->cpi_step);
5752 }
5753 
5754 /*
5755  * This routine is called just after kernel memory allocation
5756  * becomes available on cpu0, and as part of mp_startup() on
5757  * the other cpus.
5758  *
5759  * Fixup the brand string, and collect any information from cpuid
5760  * that requires dynamically allocated storage to represent.
5761  */
5762 
5763 static void
cpuid_pass_dynamic(cpu_t * cpu,void * _arg __unused)5764 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5765 {
5766 	int	i, max, shft, level, size;
5767 	struct cpuid_regs regs;
5768 	struct cpuid_regs *cp;
5769 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5770 
5771 	/*
5772 	 * Deterministic cache parameters
5773 	 *
5774 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5775 	 * values that are present are currently defined to be the same. This
5776 	 * means we can use the same logic to parse it as long as we use the
5777 	 * appropriate leaf to get the data. If you're updating this, make sure
5778 	 * you're careful about which vendor supports which aspect.
5779 	 *
5780 	 * Take this opportunity to detect the number of threads sharing the
5781 	 * last level cache, and construct a corresponding cache id. The
5782 	 * respective cpuid_info members are initialized to the default case of
5783 	 * "no last level cache sharing".
5784 	 */
5785 	cpi->cpi_ncpu_shr_last_cache = 1;
5786 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5787 
5788 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5789 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5790 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5791 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5792 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5793 		uint32_t leaf;
5794 
5795 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5796 			leaf = 4;
5797 		} else {
5798 			leaf = CPUID_LEAF_EXT_1d;
5799 		}
5800 
5801 		/*
5802 		 * Find the # of elements (size) returned by the leaf and along
5803 		 * the way detect last level cache sharing details.
5804 		 */
5805 		bzero(&regs, sizeof (regs));
5806 		cp = &regs;
5807 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5808 			cp->cp_eax = leaf;
5809 			cp->cp_ecx = i;
5810 
5811 			(void) __cpuid_insn(cp);
5812 
5813 			if (CPI_CACHE_TYPE(cp) == 0)
5814 				break;
5815 			level = CPI_CACHE_LVL(cp);
5816 			if (level > max) {
5817 				max = level;
5818 				cpi->cpi_ncpu_shr_last_cache =
5819 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5820 			}
5821 		}
5822 		cpi->cpi_cache_leaf_size = size = i;
5823 
5824 		/*
5825 		 * Allocate the cpi_cache_leaves array. The first element
5826 		 * references the regs for the corresponding leaf with %ecx set
5827 		 * to 0. This was gathered in cpuid_pass_extended().
5828 		 */
5829 		if (size > 0) {
5830 			cpi->cpi_cache_leaves =
5831 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5832 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5833 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5834 			} else {
5835 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5836 			}
5837 
5838 			/*
5839 			 * Allocate storage to hold the additional regs
5840 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5841 			 *
5842 			 * The regs for the leaf, %ecx == 0 has already
5843 			 * been allocated as indicated above.
5844 			 */
5845 			for (i = 1; i < size; i++) {
5846 				cp = cpi->cpi_cache_leaves[i] =
5847 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5848 				cp->cp_eax = leaf;
5849 				cp->cp_ecx = i;
5850 
5851 				(void) __cpuid_insn(cp);
5852 			}
5853 		}
5854 		/*
5855 		 * Determine the number of bits needed to represent
5856 		 * the number of CPUs sharing the last level cache.
5857 		 *
5858 		 * Shift off that number of bits from the APIC id to
5859 		 * derive the cache id.
5860 		 */
5861 		shft = 0;
5862 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5863 			shft++;
5864 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5865 	}
5866 
5867 	/*
5868 	 * Now fixup the brand string
5869 	 */
5870 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5871 		fabricate_brandstr(cpi);
5872 	} else {
5873 
5874 		/*
5875 		 * If we successfully extracted a brand string from the cpuid
5876 		 * instruction, clean it up by removing leading spaces and
5877 		 * similar junk.
5878 		 */
5879 		if (cpi->cpi_brandstr[0]) {
5880 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5881 			char *src, *dst;
5882 
5883 			dst = src = (char *)cpi->cpi_brandstr;
5884 			src[maxlen - 1] = '\0';
5885 			/*
5886 			 * strip leading spaces
5887 			 */
5888 			while (*src == ' ')
5889 				src++;
5890 			/*
5891 			 * Remove any 'Genuine' or "Authentic" prefixes
5892 			 */
5893 			if (strncmp(src, "Genuine ", 8) == 0)
5894 				src += 8;
5895 			if (strncmp(src, "Authentic ", 10) == 0)
5896 				src += 10;
5897 
5898 			/*
5899 			 * Now do an in-place copy.
5900 			 * Map (R) to (r) and (TM) to (tm).
5901 			 * The era of teletypes is long gone, and there's
5902 			 * -really- no need to shout.
5903 			 */
5904 			while (*src != '\0') {
5905 				if (src[0] == '(') {
5906 					if (strncmp(src + 1, "R)", 2) == 0) {
5907 						(void) strncpy(dst, "(r)", 3);
5908 						src += 3;
5909 						dst += 3;
5910 						continue;
5911 					}
5912 					if (strncmp(src + 1, "TM)", 3) == 0) {
5913 						(void) strncpy(dst, "(tm)", 4);
5914 						src += 4;
5915 						dst += 4;
5916 						continue;
5917 					}
5918 				}
5919 				*dst++ = *src++;
5920 			}
5921 			*dst = '\0';
5922 
5923 			/*
5924 			 * Finally, remove any trailing spaces
5925 			 */
5926 			while (--dst > cpi->cpi_brandstr)
5927 				if (*dst == ' ')
5928 					*dst = '\0';
5929 				else
5930 					break;
5931 		} else
5932 			fabricate_brandstr(cpi);
5933 	}
5934 }
5935 
/*
 * One entry in a table associating a kernel x86 feature with the aux
 * vector (hwcap) bit that advertises it.
 */
typedef struct {
	/* AV_386_* bit to OR into the hwcap word when the feature is set */
	uint32_t avm_av;
	/* X86FSET_* feature tested with is_x86_feature() */
	uint32_t avm_feat;
} av_feat_map_t;
5940 
5941 /*
5942  * These arrays are used to map features that we should add based on x86
5943  * features that are present. As a large number depend on kernel features,
5944  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5945  * There is an array of these for each hwcap word. Some features aren't tracked
5946  * in the kernel x86 featureset and that's ok. They will not show up in here.
5947  */
5948 static const av_feat_map_t x86fset_to_av1[] = {
5949 	{ AV_386_CX8, X86FSET_CX8 },
5950 	{ AV_386_SEP, X86FSET_SEP },
5951 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5952 	{ AV_386_CMOV, X86FSET_CMOV },
5953 	{ AV_386_FXSR, X86FSET_SSE },
5954 	{ AV_386_SSE, X86FSET_SSE },
5955 	{ AV_386_SSE2, X86FSET_SSE2 },
5956 	{ AV_386_SSE3, X86FSET_SSE3 },
5957 	{ AV_386_CX16, X86FSET_CX16 },
5958 	{ AV_386_TSCP, X86FSET_TSCP },
5959 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5960 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5961 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5962 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5963 	{ AV_386_AES, X86FSET_AES },
5964 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5965 	{ AV_386_XSAVE, X86FSET_XSAVE },
5966 	{ AV_386_AVX, X86FSET_AVX },
5967 	{ AV_386_VMX, X86FSET_VMX },
5968 	{ AV_386_AMD_SVM, X86FSET_SVM }
5969 };
5970 
5971 static const av_feat_map_t x86fset_to_av2[] = {
5972 	{ AV_386_2_F16C, X86FSET_F16C },
5973 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5974 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5975 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5976 	{ AV_386_2_FMA, X86FSET_FMA },
5977 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5978 	{ AV_386_2_ADX, X86FSET_ADX },
5979 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5980 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5981 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5982 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5983 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5984 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5985 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5986 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5987 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5988 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5989 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5990 	{ AV_386_2_SHA, X86FSET_SHA },
5991 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5992 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5993 	{ AV_386_2_CLWB, X86FSET_CLWB },
5994 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5995 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5996 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5997 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5998 	{ AV_386_2_VAES, X86FSET_VAES },
5999 	{ AV_386_2_GFNI, X86FSET_GFNI },
6000 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
6001 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
6002 };
6003 
6004 static const av_feat_map_t x86fset_to_av3[] = {
6005 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
6006 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
6007 };
6008 
6009 /*
6010  * This routine is called out of bind_hwcap() much later in the life
6011  * of the kernel (post_startup()).  The job of this routine is to resolve
6012  * the hardware feature support and kernel support for those features into
6013  * what we're actually going to tell applications via the aux vector.
6014  *
6015  * Most of the aux vector is derived from the x86_featureset array vector where
6016  * a given feature indicates that an aux vector should be plumbed through. This
6017  * allows the kernel to use one tracking mechanism for these based on whether or
6018  * not it has the required hardware support (most often xsave). Most newer
6019  * features are added there in case we need them in the kernel. Otherwise,
6020  * features are evaluated based on looking at the cpuid features that remain. If
6021  * you find yourself wanting to clear out cpuid features for some reason, they
6022  * should instead be driven by the feature set so we have a consistent view.
6023  */
6024 
6025 static void
cpuid_pass_resolve(cpu_t * cpu,void * arg)6026 cpuid_pass_resolve(cpu_t *cpu, void *arg)
6027 {
6028 	uint_t *hwcap_out = (uint_t *)arg;
6029 	struct cpuid_info *cpi;
6030 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
6031 
6032 	cpi = cpu->cpu_m.mcpu_cpi;
6033 
6034 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
6035 		if (is_x86_feature(x86_featureset,
6036 		    x86fset_to_av1[i].avm_feat)) {
6037 			hwcap_flags |= x86fset_to_av1[i].avm_av;
6038 		}
6039 	}
6040 
6041 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
6042 		if (is_x86_feature(x86_featureset,
6043 		    x86fset_to_av2[i].avm_feat)) {
6044 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
6045 		}
6046 	}
6047 
6048 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
6049 		if (is_x86_feature(x86_featureset,
6050 		    x86fset_to_av3[i].avm_feat)) {
6051 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
6052 		}
6053 	}
6054 
6055 	/*
6056 	 * From here on out we're working through features that don't have
6057 	 * corresponding kernel feature flags for various reasons that are
6058 	 * mostly just due to the historical implementation.
6059 	 */
6060 	if (cpi->cpi_maxeax >= 1) {
6061 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
6062 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
6063 
6064 		*edx = CPI_FEATURES_EDX(cpi);
6065 		*ecx = CPI_FEATURES_ECX(cpi);
6066 
6067 		/*
6068 		 * [no explicit support required beyond x87 fp context]
6069 		 */
6070 		if (!fpu_exists)
6071 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
6072 
6073 		/*
6074 		 * Now map the supported feature vector to things that we
6075 		 * think userland will care about.
6076 		 */
6077 		if (*ecx & CPUID_INTC_ECX_MOVBE)
6078 			hwcap_flags |= AV_386_MOVBE;
6079 
6080 		if (*ecx & CPUID_INTC_ECX_POPCNT)
6081 			hwcap_flags |= AV_386_POPCNT;
6082 		if (*edx & CPUID_INTC_EDX_FPU)
6083 			hwcap_flags |= AV_386_FPU;
6084 		if (*edx & CPUID_INTC_EDX_MMX)
6085 			hwcap_flags |= AV_386_MMX;
6086 		if (*edx & CPUID_INTC_EDX_TSC)
6087 			hwcap_flags |= AV_386_TSC;
6088 	}
6089 
6090 	/*
6091 	 * Check a few miscellaneous features.
6092 	 */
6093 	if (cpi->cpi_xmaxeax < 0x80000001)
6094 		goto resolve_done;
6095 
6096 	switch (cpi->cpi_vendor) {
6097 		uint32_t *edx, *ecx;
6098 
6099 	case X86_VENDOR_Intel:
6100 		/*
6101 		 * Seems like Intel duplicated what we necessary
6102 		 * here to make the initial crop of 64-bit OS's work.
6103 		 * Hopefully, those are the only "extended" bits
6104 		 * they'll add.
6105 		 */
6106 		/*FALLTHROUGH*/
6107 
6108 	case X86_VENDOR_AMD:
6109 	case X86_VENDOR_HYGON:
6110 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
6111 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
6112 
6113 		*edx = CPI_FEATURES_XTD_EDX(cpi);
6114 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
6115 
6116 		/*
6117 		 * [no explicit support required beyond
6118 		 * x87 fp context and exception handlers]
6119 		 */
6120 		if (!fpu_exists)
6121 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
6122 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
6123 
6124 		/*
6125 		 * Now map the supported feature vector to
6126 		 * things that we think userland will care about.
6127 		 */
6128 		if (*edx & CPUID_AMD_EDX_MMXamd)
6129 			hwcap_flags |= AV_386_AMD_MMX;
6130 		if (*edx & CPUID_AMD_EDX_3DNow)
6131 			hwcap_flags |= AV_386_AMD_3DNow;
6132 		if (*edx & CPUID_AMD_EDX_3DNowx)
6133 			hwcap_flags |= AV_386_AMD_3DNowx;
6134 
6135 		switch (cpi->cpi_vendor) {
6136 		case X86_VENDOR_AMD:
6137 		case X86_VENDOR_HYGON:
6138 			if (*ecx & CPUID_AMD_ECX_AHF64)
6139 				hwcap_flags |= AV_386_AHF;
6140 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6141 				hwcap_flags |= AV_386_AMD_LZCNT;
6142 			break;
6143 
6144 		case X86_VENDOR_Intel:
6145 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6146 				hwcap_flags |= AV_386_AMD_LZCNT;
6147 			/*
6148 			 * Aarrgh.
6149 			 * Intel uses a different bit in the same word.
6150 			 */
6151 			if (*ecx & CPUID_INTC_ECX_AHF64)
6152 				hwcap_flags |= AV_386_AHF;
6153 			break;
6154 		default:
6155 			break;
6156 		}
6157 		break;
6158 
6159 	default:
6160 		break;
6161 	}
6162 
6163 resolve_done:
6164 	if (hwcap_out != NULL) {
6165 		hwcap_out[0] = hwcap_flags;
6166 		hwcap_out[1] = hwcap_flags_2;
6167 		hwcap_out[2] = hwcap_flags_3;
6168 	}
6169 }
6170 
6171 
6172 /*
6173  * Simulate the cpuid instruction using the data we previously
6174  * captured about this CPU.  We try our best to return the truth
6175  * about the hardware, independently of kernel support.
6176  */
6177 uint32_t
cpuid_insn(cpu_t * cpu,struct cpuid_regs * cp)6178 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
6179 {
6180 	struct cpuid_info *cpi;
6181 	struct cpuid_regs *xcp;
6182 
6183 	if (cpu == NULL)
6184 		cpu = CPU;
6185 	cpi = cpu->cpu_m.mcpu_cpi;
6186 
6187 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6188 
6189 	/*
6190 	 * CPUID data is cached in two separate places: cpi_std for standard
6191 	 * CPUID leaves , and cpi_extd for extended CPUID leaves.
6192 	 */
6193 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
6194 		xcp = &cpi->cpi_std[cp->cp_eax];
6195 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
6196 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
6197 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
6198 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
6199 	} else {
6200 		/*
6201 		 * The caller is asking for data from an input parameter which
6202 		 * the kernel has not cached.  In this case we go fetch from
6203 		 * the hardware and return the data directly to the user.
6204 		 */
6205 		return (__cpuid_insn(cp));
6206 	}
6207 
6208 	cp->cp_eax = xcp->cp_eax;
6209 	cp->cp_ebx = xcp->cp_ebx;
6210 	cp->cp_ecx = xcp->cp_ecx;
6211 	cp->cp_edx = xcp->cp_edx;
6212 	return (cp->cp_eax);
6213 }
6214 
6215 boolean_t
cpuid_checkpass(const cpu_t * const cpu,const cpuid_pass_t pass)6216 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
6217 {
6218 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
6219 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
6220 }
6221 
6222 int
cpuid_getbrandstr(cpu_t * cpu,char * s,size_t n)6223 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
6224 {
6225 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6226 
6227 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
6228 }
6229 
6230 int
cpuid_is_cmt(cpu_t * cpu)6231 cpuid_is_cmt(cpu_t *cpu)
6232 {
6233 	if (cpu == NULL)
6234 		cpu = CPU;
6235 
6236 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6237 
6238 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
6239 }
6240 
6241 /*
6242  * AMD and Intel both implement the 64-bit variant of the syscall
6243  * instruction (syscallq), so if there's -any- support for syscall,
6244  * cpuid currently says "yes, we support this".
6245  *
6246  * However, Intel decided to -not- implement the 32-bit variant of the
6247  * syscall instruction, so we provide a predicate to allow our caller
6248  * to test that subtlety here.
6249  *
6250  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
6251  *	even in the case where the hardware would in fact support it.
6252  */
6253 /*ARGSUSED*/
6254 int
cpuid_syscall32_insn(cpu_t * cpu)6255 cpuid_syscall32_insn(cpu_t *cpu)
6256 {
6257 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
6258 
6259 #if !defined(__xpv)
6260 	if (cpu == NULL)
6261 		cpu = CPU;
6262 
6263 	/*CSTYLED*/
6264 	{
6265 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6266 
6267 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
6268 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
6269 		    cpi->cpi_xmaxeax >= 0x80000001 &&
6270 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
6271 			return (1);
6272 	}
6273 #endif
6274 	return (0);
6275 }
6276 
6277 int
cpuid_getidstr(cpu_t * cpu,char * s,size_t n)6278 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
6279 {
6280 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6281 
6282 	static const char fmt[] =
6283 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
6284 	static const char fmt_ht[] =
6285 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6286 
6287 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6288 
6289 	if (cpuid_is_cmt(cpu))
6290 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6291 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6292 		    cpi->cpi_family, cpi->cpi_model,
6293 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6294 	return (snprintf(s, n, fmt,
6295 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6296 	    cpi->cpi_family, cpi->cpi_model,
6297 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6298 }
6299 
6300 const char *
cpuid_getvendorstr(cpu_t * cpu)6301 cpuid_getvendorstr(cpu_t *cpu)
6302 {
6303 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6304 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6305 }
6306 
6307 uint_t
cpuid_getvendor(cpu_t * cpu)6308 cpuid_getvendor(cpu_t *cpu)
6309 {
6310 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6311 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6312 }
6313 
6314 uint_t
cpuid_getfamily(cpu_t * cpu)6315 cpuid_getfamily(cpu_t *cpu)
6316 {
6317 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6318 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
6319 }
6320 
6321 uint_t
cpuid_getmodel(cpu_t * cpu)6322 cpuid_getmodel(cpu_t *cpu)
6323 {
6324 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6325 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
6326 }
6327 
6328 uint_t
cpuid_get_ncpu_per_chip(cpu_t * cpu)6329 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6330 {
6331 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6332 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6333 }
6334 
6335 uint_t
cpuid_get_ncore_per_chip(cpu_t * cpu)6336 cpuid_get_ncore_per_chip(cpu_t *cpu)
6337 {
6338 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6339 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6340 }
6341 
6342 uint_t
cpuid_get_ncpu_sharing_last_cache(cpu_t * cpu)6343 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6344 {
6345 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6346 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6347 }
6348 
6349 id_t
cpuid_get_last_lvl_cacheid(cpu_t * cpu)6350 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6351 {
6352 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6353 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6354 }
6355 
6356 uint_t
cpuid_getstep(cpu_t * cpu)6357 cpuid_getstep(cpu_t *cpu)
6358 {
6359 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6360 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
6361 }
6362 
6363 uint_t
cpuid_getsig(struct cpu * cpu)6364 cpuid_getsig(struct cpu *cpu)
6365 {
6366 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6367 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6368 }
6369 
6370 x86_chiprev_t
cpuid_getchiprev(struct cpu * cpu)6371 cpuid_getchiprev(struct cpu *cpu)
6372 {
6373 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6374 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6375 }
6376 
6377 const char *
cpuid_getchiprevstr(struct cpu * cpu)6378 cpuid_getchiprevstr(struct cpu *cpu)
6379 {
6380 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6381 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6382 }
6383 
6384 uint32_t
cpuid_getsockettype(struct cpu * cpu)6385 cpuid_getsockettype(struct cpu *cpu)
6386 {
6387 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6388 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6389 }
6390 
6391 const char *
cpuid_getsocketstr(cpu_t * cpu)6392 cpuid_getsocketstr(cpu_t *cpu)
6393 {
6394 	static const char *socketstr = NULL;
6395 	struct cpuid_info *cpi;
6396 
6397 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6398 	cpi = cpu->cpu_m.mcpu_cpi;
6399 
6400 	/* Assume that socket types are the same across the system */
6401 	if (socketstr == NULL)
6402 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6403 		    cpi->cpi_model, cpi->cpi_step);
6404 
6405 
6406 	return (socketstr);
6407 }
6408 
6409 x86_uarchrev_t
cpuid_getuarchrev(cpu_t * cpu)6410 cpuid_getuarchrev(cpu_t *cpu)
6411 {
6412 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6413 }
6414 
6415 int
cpuid_get_chipid(cpu_t * cpu)6416 cpuid_get_chipid(cpu_t *cpu)
6417 {
6418 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6419 
6420 	if (cpuid_is_cmt(cpu))
6421 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6422 	return (cpu->cpu_id);
6423 }
6424 
6425 id_t
cpuid_get_coreid(cpu_t * cpu)6426 cpuid_get_coreid(cpu_t *cpu)
6427 {
6428 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6429 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6430 }
6431 
6432 int
cpuid_get_pkgcoreid(cpu_t * cpu)6433 cpuid_get_pkgcoreid(cpu_t *cpu)
6434 {
6435 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6436 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6437 }
6438 
6439 int
cpuid_get_clogid(cpu_t * cpu)6440 cpuid_get_clogid(cpu_t *cpu)
6441 {
6442 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6443 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6444 }
6445 
6446 int
cpuid_get_cacheid(cpu_t * cpu)6447 cpuid_get_cacheid(cpu_t *cpu)
6448 {
6449 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6450 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6451 }
6452 
6453 uint_t
cpuid_get_procnodeid(cpu_t * cpu)6454 cpuid_get_procnodeid(cpu_t *cpu)
6455 {
6456 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6457 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6458 }
6459 
6460 uint_t
cpuid_get_procnodes_per_pkg(cpu_t * cpu)6461 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6462 {
6463 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6464 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6465 }
6466 
6467 uint_t
cpuid_get_compunitid(cpu_t * cpu)6468 cpuid_get_compunitid(cpu_t *cpu)
6469 {
6470 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6471 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6472 }
6473 
6474 uint_t
cpuid_get_cores_per_compunit(cpu_t * cpu)6475 cpuid_get_cores_per_compunit(cpu_t *cpu)
6476 {
6477 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6478 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6479 }
6480 
6481 uint32_t
cpuid_get_apicid(cpu_t * cpu)6482 cpuid_get_apicid(cpu_t *cpu)
6483 {
6484 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6485 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6486 		return (UINT32_MAX);
6487 	} else {
6488 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6489 	}
6490 }
6491 
6492 void
cpuid_get_addrsize(cpu_t * cpu,uint_t * pabits,uint_t * vabits)6493 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6494 {
6495 	struct cpuid_info *cpi;
6496 
6497 	if (cpu == NULL)
6498 		cpu = CPU;
6499 	cpi = cpu->cpu_m.mcpu_cpi;
6500 
6501 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6502 
6503 	if (pabits)
6504 		*pabits = cpi->cpi_pabits;
6505 	if (vabits)
6506 		*vabits = cpi->cpi_vabits;
6507 }
6508 
6509 /*
6510  * Export information about known offsets to the kernel. We only care about
6511  * things we have actually enabled support for in %xcr0.
6512  */
6513 void
cpuid_get_xsave_info(uint64_t bit,size_t * sizep,size_t * offp)6514 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6515 {
6516 	size_t size, off;
6517 
6518 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6519 
6520 	if (sizep == NULL)
6521 		sizep = &size;
6522 	if (offp == NULL)
6523 		offp = &off;
6524 
6525 	switch (bit) {
6526 	case XFEATURE_LEGACY_FP:
6527 	case XFEATURE_SSE:
6528 		*sizep = sizeof (struct fxsave_state);
6529 		*offp = 0;
6530 		break;
6531 	case XFEATURE_AVX:
6532 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6533 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6534 		break;
6535 	case XFEATURE_AVX512_OPMASK:
6536 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6537 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6538 		break;
6539 	case XFEATURE_AVX512_ZMM:
6540 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6541 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6542 		break;
6543 	case XFEATURE_AVX512_HI_ZMM:
6544 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6545 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6546 		break;
6547 	default:
6548 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6549 	}
6550 }
6551 
6552 /*
6553  * Use our supported-features indicators (xsave_bv_all) to return the XSAVE
6554  * size of our supported-features that need saving. Some CPUs' maximum save
6555  * size (stored in cpuid_info0.cpi_xsave.xsav_max_size) includes
6556  * unsupported-by-us features (e.g. Intel AMX) which we MAY be able to safely
6557  * dismiss if the supported XSAVE data's offset + length are before the
6558  * unsupported feature.
6559  */
6560 size_t
cpuid_get_xsave_size(void)6561 cpuid_get_xsave_size(void)
6562 {
6563 	size_t furthest_out = sizeof (struct xsave_state);
6564 	uint_t shift = 0;
6565 
6566 	VERIFY(xsave_bv_all != 0);
6567 
6568 	for (uint64_t current = xsave_bv_all; current != 0;
6569 	    current >>= 1, shift++) {
6570 		uint64_t testbit = 1UL << shift;
6571 		size_t size, offset;
6572 
6573 		if ((testbit & xsave_bv_all) == 0)
6574 			continue;
6575 
6576 		cpuid_get_xsave_info(testbit, &size, &offset);
6577 		furthest_out = MAX(furthest_out, offset + size);
6578 	}
6579 
6580 	return (furthest_out);
6581 }
6582 
6583 /*
6584  * Return true if the CPUs on this system require 'pointer clearing' for the
6585  * floating point error pointer exception handling. In the past, this has been
6586  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6587  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6588  * feature bit and is reflected in the cpi_fp_amd_save member.
6589  */
6590 boolean_t
cpuid_need_fp_excp_handling(void)6591 cpuid_need_fp_excp_handling(void)
6592 {
6593 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6594 	    cpuid_info0.cpi_fp_amd_save != 0);
6595 }
6596 
6597 /*
6598  * Returns the number of data TLB entries for a corresponding
6599  * pagesize.  If it can't be computed, or isn't known, the
6600  * routine returns zero.  If you ask about an architecturally
6601  * impossible pagesize, the routine will panic (so that the
6602  * hat implementor knows that things are inconsistent.)
6603  */
6604 uint_t
cpuid_get_dtlb_nent(cpu_t * cpu,size_t pagesize)6605 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6606 {
6607 	struct cpuid_info *cpi;
6608 	uint_t dtlb_nent = 0;
6609 
6610 	if (cpu == NULL)
6611 		cpu = CPU;
6612 	cpi = cpu->cpu_m.mcpu_cpi;
6613 
6614 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6615 
6616 	/*
6617 	 * Check the L2 TLB info
6618 	 */
6619 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6620 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6621 
6622 		switch (pagesize) {
6623 
6624 		case 4 * 1024:
6625 			/*
6626 			 * All zero in the top 16 bits of the register
6627 			 * indicates a unified TLB. Size is in low 16 bits.
6628 			 */
6629 			if ((cp->cp_ebx & 0xffff0000) == 0)
6630 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6631 			else
6632 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6633 			break;
6634 
6635 		case 2 * 1024 * 1024:
6636 			if ((cp->cp_eax & 0xffff0000) == 0)
6637 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6638 			else
6639 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6640 			break;
6641 
6642 		default:
6643 			panic("unknown L2 pagesize");
6644 			/*NOTREACHED*/
6645 		}
6646 	}
6647 
6648 	if (dtlb_nent != 0)
6649 		return (dtlb_nent);
6650 
6651 	/*
6652 	 * No L2 TLB support for this size, try L1.
6653 	 */
6654 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6655 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6656 
6657 		switch (pagesize) {
6658 		case 4 * 1024:
6659 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6660 			break;
6661 		case 2 * 1024 * 1024:
6662 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6663 			break;
6664 		default:
6665 			panic("unknown L1 d-TLB pagesize");
6666 			/*NOTREACHED*/
6667 		}
6668 	}
6669 
6670 	return (dtlb_nent);
6671 }
6672 
/*
 * Return 0 if the erratum is not present or not applicable, positive
 * if it is, and negative if the status of the erratum is unknown.
 *
 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
 * Processors" #25759, Rev 3.57, August 2005
 *
 * cpu		CPU to examine; dereferenced unconditionally, so it must
 *		not be NULL.
 * erratum	erratum number to look up.  Numbers without a case below
 *		yield -1 (unknown).
 *
 * Non-AMD CPUs and AMD families 4, 5 and 6 always yield 0.
 */
int
cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	uint_t eax;

	/*
	 * Bail out if this CPU isn't an AMD CPU, or if it's
	 * a legacy (32-bit) AMD CPU.
	 */
	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
	    cpi->cpi_family == 6) {
		return (0);
	}

	/* Raw %eax saved in cpi_std[1]; matched verbatim by the macros below */
	eax = cpi->cpi_std[1].cp_eax;

	/*
	 * Each predicate below matches 'eax' against an explicit list of
	 * values.  The names look like AMD silicon revision names (e.g.
	 * SH-B0, DH-CG) -- presumably as used in the revision guide cited
	 * above; confirm there before adding entries.
	 *
	 * NOTE(review): none of these macros is #undef'd within this
	 * function, so short names such as B, CG, D0 and EX stay defined for
	 * whatever follows in the file.
	 */
#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
#define	SH_B3(eax)	(eax == 0xf51)
#define	B(eax)		(SH_B0(eax) || SH_B3(eax))

#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)

#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))

#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))

#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
#define	BH_E4(eax)	(eax == 0x20fb1)
#define	SH_E5(eax)	(eax == 0x20f42)
#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
			    DH_E6(eax) || JH_E6(eax))

#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
#define	DR_B0(eax)	(eax == 0x100f20)
#define	DR_B1(eax)	(eax == 0x100f21)
#define	DR_BA(eax)	(eax == 0x100f2a)
#define	DR_B2(eax)	(eax == 0x100f22)
#define	DR_B3(eax)	(eax == 0x100f23)
#define	RB_C0(eax)	(eax == 0x100f40)

	/*
	 * Every case returns the value of a boolean expression (0 or 1);
	 * only the default case returns -1.
	 */
	switch (erratum) {
	case 1:
		return (cpi->cpi_family < 0x10);
	case 51:	/* what does the asterisk mean? */
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 52:
		return (B(eax));
	case 57:
		return (cpi->cpi_family <= 0x11);
	case 58:
		return (B(eax));
	case 60:
		return (cpi->cpi_family <= 0x11);
	case 61:
	case 62:
	case 63:
	case 64:
	case 65:
	case 66:
	case 68:
	case 69:
	case 70:
	case 71:
		return (B(eax));
	case 72:
		return (SH_B0(eax));
	case 74:
		return (B(eax));
	case 75:
		return (cpi->cpi_family < 0x10);
	case 76:
		return (B(eax));
	case 77:
		return (cpi->cpi_family <= 0x11);
	case 78:
		return (B(eax) || SH_C0(eax));
	case 79:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 80:
	case 81:
	case 82:
		return (B(eax));
	case 83:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 85:
		return (cpi->cpi_family < 0x10);
	case 86:
		return (SH_C0(eax) || CG(eax));
	case 88:
		return (B(eax) || SH_C0(eax));
	case 89:
		return (cpi->cpi_family < 0x10);
	case 90:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 91:
	case 92:
		return (B(eax) || SH_C0(eax));
	case 93:
		return (SH_C0(eax));
	case 94:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 95:
		return (B(eax) || SH_C0(eax));
	case 96:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 97:
	case 98:
		return (SH_C0(eax) || CG(eax));
	case 99:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 100:
		return (B(eax) || SH_C0(eax));
	case 101:
	case 103:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 104:
		return (SH_C0(eax) || CG(eax) || D0(eax));
	case 105:
	case 106:
	case 107:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 108:
		return (DH_CG(eax));
	case 109:
		return (SH_C0(eax) || CG(eax) || D0(eax));
	case 110:
		return (D0(eax) || EX(eax));
	case 111:
		return (CG(eax));
	case 112:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 113:
		return (eax == 0x20fc0);
	case 114:
		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
	case 115:
		return (SH_E0(eax) || JH_E1(eax));
	case 116:
		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
	case 117:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 118:
		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
		    JH_E6(eax));
	case 121:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 122:
		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
	case 123:
		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
	case 131:
		return (cpi->cpi_family < 0x10);
	/*
	 * NOTE(review): 6336786 is not an AMD erratum number; presumably an
	 * internal bug ID reused as a key -- confirm against callers.
	 */
	case 6336786:

		/*
		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
		 * if this is a K8 family or newer processor. We're testing for
		 * this 'erratum' to determine whether or not we have a constant
		 * TSC.
		 *
		 * Our current fix for this is to disable the C1-Clock ramping.
		 * However, this doesn't work on newer processor families nor
		 * does it work when virtualized as those devices don't exist.
		 */
		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
			return (0);
		}

		if (CPI_FAMILY(cpi) == 0xf) {
			/*
			 * Issue CPUID leaf 0x80000007 directly; the
			 * 'erratum' is reported as present when bit 8 of
			 * %edx is clear.
			 */
			struct cpuid_regs regs;
			regs.cp_eax = 0x80000007;
			(void) __cpuid_insn(&regs);
			return (!(regs.cp_edx & 0x100));
		}
		return (0);
	case 147:
		/*
		 * This erratum (K8 #147) is not present on family 10 and newer.
		 */
		if (cpi->cpi_family >= 0x10) {
			return (0);
		}
		/*
		 * Repack the signature: bits 27:20 and 11:8 are summed into
		 * bits 15:8, bits 19:16 land in bits 7:4 and bits 7:4 land in
		 * bits 3:0; the stepping (bits 3:0) is dropped.  With the
		 * usual leaf 1 layout that is (family << 8) | model, so the
		 * test selects parts below family 0xf, model 0x40.
		 */
		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);

	/*
	 * NOTE(review): like 6336786 above, 6671130 is presumably a bug ID
	 * rather than an AMD erratum number -- confirm against callers.
	 */
	case 6671130:
		/*
		 * check for processors (pre-Shanghai) that do not provide
		 * optimal management of 1gb ptes in its tlb.
		 */
		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);

	case 298:
		/* NOTE(review): DR_B3 is defined above but not listed here. */
		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
		    DR_B2(eax) || RB_C0(eax));

	case 721:
		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);

	default:
		/* Erratum not known to this table. */
		return (-1);

	}
}
6898 
6899 /*
6900  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6901  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6902  */
6903 int
osvw_opteron_erratum(cpu_t * cpu,uint_t erratum)6904 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6905 {
6906 	struct cpuid_info	*cpi;
6907 	uint_t			osvwid;
6908 	static int		osvwfeature = -1;
6909 	uint64_t		osvwlength;
6910 
6911 
6912 	cpi = cpu->cpu_m.mcpu_cpi;
6913 
6914 	/* confirm OSVW supported */
6915 	if (osvwfeature == -1) {
6916 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6917 	} else {
6918 		/* assert that osvw feature setting is consistent on all cpus */
6919 		ASSERT(osvwfeature ==
6920 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6921 	}
6922 	if (!osvwfeature)
6923 		return (-1);
6924 
6925 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6926 
6927 	switch (erratum) {
6928 	case 298:	/* osvwid is 0 */
6929 		osvwid = 0;
6930 		if (osvwlength <= (uint64_t)osvwid) {
6931 			/* osvwid 0 is unknown */
6932 			return (-1);
6933 		}
6934 
6935 		/*
6936 		 * Check the OSVW STATUS MSR to determine the state
6937 		 * of the erratum where:
6938 		 *   0 - fixed by HW
6939 		 *   1 - BIOS has applied the workaround when BIOS
6940 		 *   workaround is available. (Or for other errata,
6941 		 *   OS workaround is required.)
6942 		 * For a value of 1, caller will confirm that the
6943 		 * erratum 298 workaround has indeed been applied by BIOS.
6944 		 *
6945 		 * A 1 may be set in cpus that have a HW fix
6946 		 * in a mixed cpu system. Regarding erratum 298:
6947 		 *   In a multiprocessor platform, the workaround above
6948 		 *   should be applied to all processors regardless of
6949 		 *   silicon revision when an affected processor is
6950 		 *   present.
6951 		 */
6952 
6953 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6954 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6955 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6956 
6957 	default:
6958 		return (-1);
6959 	}
6960 }
6961 
/*
 * Property-name suffixes.  add_cache_prop() joins one of these to a cache or
 * TLB label to form the property name "<label>-<suffix>".
 */
static const char assoc_str[] = "associativity";
static const char line_str[] = "line-size";
static const char size_str[] = "size";
6965 
6966 static void
add_cache_prop(dev_info_t * devi,const char * label,const char * type,uint32_t val)6967 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6968     uint32_t val)
6969 {
6970 	char buf[128];
6971 
6972 	/*
6973 	 * ndi_prop_update_int() is used because it is desirable for
6974 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6975 	 */
6976 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6977 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6978 }
6979 
/*
 * Intel-style cache/tlb description
 *
 * Standard cpuid level 2 gives a randomly ordered
 * selection of tags that index into a table that describes
 * cache and tlb properties.
 *
 * The strings below are the labels stored in that table (ct_label).
 * add_cacheent_props() uses them as the prefix of the property names it
 * creates, so they are externally visible.
 */

static const char l1_icache_str[] = "l1-icache";
static const char l1_dcache_str[] = "l1-dcache";
static const char l2_cache_str[] = "l2-cache";
static const char l3_cache_str[] = "l3-cache";
static const char itlb4k_str[] = "itlb-4K";
static const char dtlb4k_str[] = "dtlb-4K";
static const char itlb2M_str[] = "itlb-2M";
static const char itlb4M_str[] = "itlb-4M";
static const char dtlb4M_str[] = "dtlb-4M";
static const char dtlb24_str[] = "dtlb0-2M-4M";
static const char itlb424_str[] = "itlb-4K-2M-4M";
static const char itlb24_str[] = "itlb-2M-4M";
static const char dtlb44_str[] = "dtlb-4K-4M";
static const char sl1_dcache_str[] = "sectored-l1-dcache";
static const char sl2_cache_str[] = "sectored-l2-cache";
static const char itrace_str[] = "itrace-cache";
static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
7006 
/*
 * One cache or TLB description, keyed by a one-byte descriptor code.
 *
 * intel_ctab is searched with find_cacheent(), which stops at the first
 * entry whose code is <= the code being looked up; the table must therefore
 * stay sorted by descending ct_code and end with an all-zero entry.
 */
static const struct cachetab {
	uint8_t		ct_code;	/* descriptor byte; 0 ends a table */
	uint8_t		ct_assoc;	/* associativity */
	uint16_t	ct_line_size;	/* 0: no line-size property is added */
	size_t		ct_size;	/* presumably bytes (caches) or */
					/* entry count (TLBs) -- confirm */
	const char	*ct_label;	/* property-name prefix */
} intel_ctab[] = {
	/*
	 * maintain descending order!
	 *
	 * Codes ignored - Reason
	 * ----------------------
	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
	 * f0H/f1H - Currently we do not interpret prefetch size by design
	 */
	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
	{ 0xc0, 4, 0, 8, dtlb44_str },
	{ 0xba, 4, 0, 64, dtlb4k_str },
	{ 0xb4, 4, 0, 256, dtlb4k_str },
	{ 0xb3, 4, 0, 128, dtlb4k_str },
	{ 0xb2, 4, 0, 64, itlb4k_str },
	{ 0xb0, 4, 0, 128, itlb4k_str },
	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
	{ 0x86, 4, 64, 512*1024, l2_cache_str},
	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
	{ 0x83, 8, 32, 512*1024, l2_cache_str},
	{ 0x82, 8, 32, 256*1024, l2_cache_str},
	{ 0x80, 8, 64, 512*1024, l2_cache_str},
	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
	{ 0x73, 8, 0, 64*1024, itrace_str},
	{ 0x72, 8, 0, 32*1024, itrace_str},
	{ 0x71, 8, 0, 16*1024, itrace_str},
	{ 0x70, 8, 0, 12*1024, itrace_str},
	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
	{ 0x5d, 0, 0, 256, dtlb44_str},
	{ 0x5c, 0, 0, 128, dtlb44_str},
	{ 0x5b, 0, 0, 64, dtlb44_str},
	{ 0x5a, 4, 0, 32, dtlb24_str},
	{ 0x59, 0, 0, 16, dtlb4k_str},
	{ 0x57, 4, 0, 16, dtlb4k_str},
	{ 0x56, 4, 0, 16, dtlb4M_str},
	{ 0x55, 0, 0, 7, itlb24_str},
	{ 0x52, 0, 0, 256, itlb424_str},
	{ 0x51, 0, 0, 128, itlb424_str},
	{ 0x50, 0, 0, 64, itlb424_str},
	{ 0x4f, 0, 0, 32, itlb4k_str},
	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
	/* 0x49: only used when intel_walk_cacheinfo() can't use cpuid fn 4 */
	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
	{ 0x43, 4, 32, 512*1024, l2_cache_str},
	{ 0x42, 4, 32, 256*1024, l2_cache_str},
	{ 0x41, 4, 32, 128*1024, l2_cache_str},
	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
	{ 0x30, 8, 64, 32*1024, l1_icache_str},
	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
	{ 0x0b, 4, 0, 4, itlb4M_str},
	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
	{ 0x08, 4, 32, 16*1024, l1_icache_str},
	{ 0x06, 4, 32, 8*1024, l1_icache_str},
	{ 0x05, 4, 0, 32, dtlb4M_str},
	{ 0x04, 4, 0, 8, dtlb4M_str},
	{ 0x03, 4, 0, 64, dtlb4k_str},
	{ 0x02, 4, 0, 2, itlb4M_str},
	{ 0x01, 4, 0, 32, itlb4k_str},
	{ 0 }	/* terminator: ct_code == 0 */
};
7115 
7116 static const struct cachetab cyrix_ctab[] = {
7117 	{ 0x70, 4, 0, 32, "tlb-4K" },
7118 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
7119 	{ 0 }
7120 };
7121 
7122 /*
7123  * Search a cache table for a matching entry
7124  */
7125 static const struct cachetab *
find_cacheent(const struct cachetab * ct,uint_t code)7126 find_cacheent(const struct cachetab *ct, uint_t code)
7127 {
7128 	if (code != 0) {
7129 		for (; ct->ct_code != 0; ct++)
7130 			if (ct->ct_code <= code)
7131 				break;
7132 		if (ct->ct_code == code)
7133 			return (ct);
7134 	}
7135 	return (NULL);
7136 }
7137 
7138 /*
7139  * Populate cachetab entry with L2 or L3 cache-information using
7140  * cpuid function 4. This function is called from intel_walk_cacheinfo()
7141  * when descriptor 0x49 is encountered. It returns 0 if no such cache
7142  * information is found.
7143  */
7144 static int
intel_cpuid_4_cache_info(struct cachetab * ct,struct cpuid_info * cpi)7145 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
7146 {
7147 	uint32_t level, i;
7148 	int ret = 0;
7149 
7150 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
7151 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
7152 
7153 		if (level == 2 || level == 3) {
7154 			ct->ct_assoc =
7155 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
7156 			ct->ct_line_size =
7157 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
7158 			ct->ct_size = ct->ct_assoc *
7159 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
7160 			    ct->ct_line_size *
7161 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
7162 
7163 			if (level == 2) {
7164 				ct->ct_label = l2_cache_str;
7165 			} else if (level == 3) {
7166 				ct->ct_label = l3_cache_str;
7167 			}
7168 			ret = 1;
7169 		}
7170 	}
7171 
7172 	return (ret);
7173 }
7174 
7175 /*
7176  * Walk the cacheinfo descriptor, applying 'func' to every valid element
7177  * The walk is terminated if the walker returns non-zero.
7178  */
7179 static void
intel_walk_cacheinfo(struct cpuid_info * cpi,void * arg,int (* func)(void *,const struct cachetab *))7180 intel_walk_cacheinfo(struct cpuid_info *cpi,
7181     void *arg, int (*func)(void *, const struct cachetab *))
7182 {
7183 	const struct cachetab *ct;
7184 	struct cachetab des_49_ct, des_b1_ct;
7185 	uint8_t *dp;
7186 	int i;
7187 
7188 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7189 		return;
7190 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7191 		/*
7192 		 * For overloaded descriptor 0x49 we use cpuid function 4
7193 		 * if supported by the current processor, to create
7194 		 * cache information.
7195 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
7196 		 * to disambiguate the cache information.
7197 		 */
7198 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
7199 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
7200 				ct = &des_49_ct;
7201 		} else if (*dp == 0xb1) {
7202 			des_b1_ct.ct_code = 0xb1;
7203 			des_b1_ct.ct_assoc = 4;
7204 			des_b1_ct.ct_line_size = 0;
7205 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
7206 				des_b1_ct.ct_size = 8;
7207 				des_b1_ct.ct_label = itlb2M_str;
7208 			} else {
7209 				des_b1_ct.ct_size = 4;
7210 				des_b1_ct.ct_label = itlb4M_str;
7211 			}
7212 			ct = &des_b1_ct;
7213 		} else {
7214 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
7215 				continue;
7216 			}
7217 		}
7218 
7219 		if (func(arg, ct) != 0) {
7220 			break;
7221 		}
7222 	}
7223 }
7224 
7225 /*
7226  * (Like the Intel one, except for Cyrix CPUs)
7227  */
7228 static void
cyrix_walk_cacheinfo(struct cpuid_info * cpi,void * arg,int (* func)(void *,const struct cachetab *))7229 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
7230     void *arg, int (*func)(void *, const struct cachetab *))
7231 {
7232 	const struct cachetab *ct;
7233 	uint8_t *dp;
7234 	int i;
7235 
7236 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7237 		return;
7238 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7239 		/*
7240 		 * Search Cyrix-specific descriptor table first ..
7241 		 */
7242 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
7243 			if (func(arg, ct) != 0)
7244 				break;
7245 			continue;
7246 		}
7247 		/*
7248 		 * .. else fall back to the Intel one
7249 		 */
7250 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
7251 			if (func(arg, ct) != 0)
7252 				break;
7253 			continue;
7254 		}
7255 	}
7256 }
7257 
7258 /*
7259  * A cacheinfo walker that adds associativity, line-size, and size properties
7260  * to the devinfo node it is passed as an argument.
7261  */
7262 static int
add_cacheent_props(void * arg,const struct cachetab * ct)7263 add_cacheent_props(void *arg, const struct cachetab *ct)
7264 {
7265 	dev_info_t *devi = arg;
7266 
7267 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
7268 	if (ct->ct_line_size != 0)
7269 		add_cache_prop(devi, ct->ct_label, line_str,
7270 		    ct->ct_line_size);
7271 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
7272 	return (0);
7273 }
7274 
7275 
/*
 * Property-name suffix used in place of "associativity" when the hardware
 * reports a fully associative cache or TLB; the property value is then 1.
 */
static const char fully_assoc[] = "fully-associative?";
7277 
7278 /*
7279  * AMD style cache/tlb description
7280  *
7281  * Extended functions 5 and 6 directly describe properties of
7282  * tlbs and various cache levels.
7283  */
7284 static void
add_amd_assoc(dev_info_t * devi,const char * label,uint_t assoc)7285 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7286 {
7287 	switch (assoc) {
7288 	case 0:	/* reserved; ignore */
7289 		break;
7290 	default:
7291 		add_cache_prop(devi, label, assoc_str, assoc);
7292 		break;
7293 	case 0xff:
7294 		add_cache_prop(devi, label, fully_assoc, 1);
7295 		break;
7296 	}
7297 }
7298 
7299 static void
add_amd_tlb(dev_info_t * devi,const char * label,uint_t assoc,uint_t size)7300 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7301 {
7302 	if (size == 0)
7303 		return;
7304 	add_cache_prop(devi, label, size_str, size);
7305 	add_amd_assoc(devi, label, assoc);
7306 }
7307 
7308 static void
add_amd_cache(dev_info_t * devi,const char * label,uint_t size,uint_t assoc,uint_t lines_per_tag,uint_t line_size)7309 add_amd_cache(dev_info_t *devi, const char *label,
7310     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7311 {
7312 	if (size == 0 || line_size == 0)
7313 		return;
7314 	add_amd_assoc(devi, label, assoc);
7315 	/*
7316 	 * Most AMD parts have a sectored cache. Multiple cache lines are
7317 	 * associated with each tag. A sector consists of all cache lines
7318 	 * associated with a tag. For example, the AMD K6-III has a sector
7319 	 * size of 2 cache lines per tag.
7320 	 */
7321 	if (lines_per_tag != 0)
7322 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7323 	add_cache_prop(devi, label, line_str, line_size);
7324 	add_cache_prop(devi, label, size_str, size * 1024);
7325 }
7326 
7327 static void
add_amd_l2_assoc(dev_info_t * devi,const char * label,uint_t assoc)7328 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7329 {
7330 	switch (assoc) {
7331 	case 0:	/* off */
7332 		break;
7333 	case 1:
7334 	case 2:
7335 	case 4:
7336 		add_cache_prop(devi, label, assoc_str, assoc);
7337 		break;
7338 	case 6:
7339 		add_cache_prop(devi, label, assoc_str, 8);
7340 		break;
7341 	case 8:
7342 		add_cache_prop(devi, label, assoc_str, 16);
7343 		break;
7344 	case 0xf:
7345 		add_cache_prop(devi, label, fully_assoc, 1);
7346 		break;
7347 	default: /* reserved; ignore */
7348 		break;
7349 	}
7350 }
7351 
7352 static void
add_amd_l2_tlb(dev_info_t * devi,const char * label,uint_t assoc,uint_t size)7353 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7354 {
7355 	if (size == 0 || assoc == 0)
7356 		return;
7357 	add_amd_l2_assoc(devi, label, assoc);
7358 	add_cache_prop(devi, label, size_str, size);
7359 }
7360 
7361 static void
add_amd_l2_cache(dev_info_t * devi,const char * label,uint_t size,uint_t assoc,uint_t lines_per_tag,uint_t line_size)7362 add_amd_l2_cache(dev_info_t *devi, const char *label,
7363     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7364 {
7365 	if (size == 0 || assoc == 0 || line_size == 0)
7366 		return;
7367 	add_amd_l2_assoc(devi, label, assoc);
7368 	if (lines_per_tag != 0)
7369 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7370 	add_cache_prop(devi, label, line_str, line_size);
7371 	add_cache_prop(devi, label, size_str, size * 1024);
7372 }
7373 
7374 static void
amd_cache_info(struct cpuid_info * cpi,dev_info_t * devi)7375 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7376 {
7377 	struct cpuid_regs *cp;
7378 
7379 	if (cpi->cpi_xmaxeax < 0x80000005)
7380 		return;
7381 	cp = &cpi->cpi_extd[5];
7382 
7383 	/*
7384 	 * 4M/2M L1 TLB configuration
7385 	 *
7386 	 * We report the size for 2M pages because AMD uses two
7387 	 * TLB entries for one 4M page.
7388 	 */
7389 	add_amd_tlb(devi, "dtlb-2M",
7390 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7391 	add_amd_tlb(devi, "itlb-2M",
7392 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7393 
7394 	/*
7395 	 * 4K L1 TLB configuration
7396 	 */
7397 
7398 	switch (cpi->cpi_vendor) {
7399 		uint_t nentries;
7400 	case X86_VENDOR_TM:
7401 		if (cpi->cpi_family >= 5) {
7402 			/*
7403 			 * Crusoe processors have 256 TLB entries, but
7404 			 * cpuid data format constrains them to only
7405 			 * reporting 255 of them.
7406 			 */
7407 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7408 				nentries = 256;
7409 			/*
7410 			 * Crusoe processors also have a unified TLB
7411 			 */
7412 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7413 			    nentries);
7414 			break;
7415 		}
7416 		/*FALLTHROUGH*/
7417 	default:
7418 		add_amd_tlb(devi, itlb4k_str,
7419 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7420 		add_amd_tlb(devi, dtlb4k_str,
7421 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7422 		break;
7423 	}
7424 
7425 	/*
7426 	 * data L1 cache configuration
7427 	 */
7428 
7429 	add_amd_cache(devi, l1_dcache_str,
7430 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7431 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7432 
7433 	/*
7434 	 * code L1 cache configuration
7435 	 */
7436 
7437 	add_amd_cache(devi, l1_icache_str,
7438 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7439 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7440 
7441 	if (cpi->cpi_xmaxeax < 0x80000006)
7442 		return;
7443 	cp = &cpi->cpi_extd[6];
7444 
7445 	/* Check for a unified L2 TLB for large pages */
7446 
7447 	if (BITX(cp->cp_eax, 31, 16) == 0)
7448 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7449 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7450 	else {
7451 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7452 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7453 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7454 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7455 	}
7456 
7457 	/* Check for a unified L2 TLB for 4K pages */
7458 
7459 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7460 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7461 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7462 	} else {
7463 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7464 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7465 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7466 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7467 	}
7468 
7469 	add_amd_l2_cache(devi, l2_cache_str,
7470 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7471 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7472 }
7473 
7474 /*
7475  * There are two basic ways that the x86 world describes it cache
7476  * and tlb architecture - Intel's way and AMD's way.
7477  *
7478  * Return which flavor of cache architecture we should use
7479  */
7480 static int
x86_which_cacheinfo(struct cpuid_info * cpi)7481 x86_which_cacheinfo(struct cpuid_info *cpi)
7482 {
7483 	switch (cpi->cpi_vendor) {
7484 	case X86_VENDOR_Intel:
7485 		if (cpi->cpi_maxeax >= 2)
7486 			return (X86_VENDOR_Intel);
7487 		break;
7488 	case X86_VENDOR_AMD:
7489 		/*
7490 		 * The K5 model 1 was the first part from AMD that reported
7491 		 * cache sizes via extended cpuid functions.
7492 		 */
7493 		if (cpi->cpi_family > 5 ||
7494 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7495 			return (X86_VENDOR_AMD);
7496 		break;
7497 	case X86_VENDOR_HYGON:
7498 		return (X86_VENDOR_AMD);
7499 	case X86_VENDOR_TM:
7500 		if (cpi->cpi_family >= 5)
7501 			return (X86_VENDOR_AMD);
7502 		/*FALLTHROUGH*/
7503 	default:
7504 		/*
7505 		 * If they have extended CPU data for 0x80000005
7506 		 * then we assume they have AMD-format cache
7507 		 * information.
7508 		 *
7509 		 * If not, and the vendor happens to be Cyrix,
7510 		 * then try our-Cyrix specific handler.
7511 		 *
7512 		 * If we're not Cyrix, then assume we're using Intel's
7513 		 * table-driven format instead.
7514 		 */
7515 		if (cpi->cpi_xmaxeax >= 0x80000005)
7516 			return (X86_VENDOR_AMD);
7517 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7518 			return (X86_VENDOR_Cyrix);
7519 		else if (cpi->cpi_maxeax >= 2)
7520 			return (X86_VENDOR_Intel);
7521 		break;
7522 	}
7523 	return (-1);
7524 }
7525 
7526 void
cpuid_set_cpu_properties(void * dip,processorid_t cpu_id,struct cpuid_info * cpi)7527 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7528     struct cpuid_info *cpi)
7529 {
7530 	dev_info_t *cpu_devi;
7531 	int create;
7532 
7533 	cpu_devi = (dev_info_t *)dip;
7534 
7535 	/* device_type */
7536 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7537 	    "device_type", "cpu");
7538 
7539 	/* reg */
7540 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7541 	    "reg", cpu_id);
7542 
7543 	/* cpu-mhz, and clock-frequency */
7544 	if (cpu_freq > 0) {
7545 		long long mul;
7546 
7547 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7548 		    "cpu-mhz", cpu_freq);
7549 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7550 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7551 			    "clock-frequency", (int)mul);
7552 	}
7553 
7554 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7555 
7556 	/* vendor-id */
7557 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7558 	    "vendor-id", cpi->cpi_vendorstr);
7559 
7560 	if (cpi->cpi_maxeax == 0) {
7561 		return;
7562 	}
7563 
7564 	/*
7565 	 * family, model, and step
7566 	 */
7567 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7568 	    "family", CPI_FAMILY(cpi));
7569 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7570 	    "cpu-model", CPI_MODEL(cpi));
7571 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7572 	    "stepping-id", CPI_STEP(cpi));
7573 
7574 	/* type */
7575 	switch (cpi->cpi_vendor) {
7576 	case X86_VENDOR_Intel:
7577 		create = 1;
7578 		break;
7579 	default:
7580 		create = 0;
7581 		break;
7582 	}
7583 	if (create)
7584 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7585 		    "type", CPI_TYPE(cpi));
7586 
7587 	/* ext-family */
7588 	switch (cpi->cpi_vendor) {
7589 	case X86_VENDOR_Intel:
7590 	case X86_VENDOR_AMD:
7591 		create = cpi->cpi_family >= 0xf;
7592 		break;
7593 	case X86_VENDOR_HYGON:
7594 		create = 1;
7595 		break;
7596 	default:
7597 		create = 0;
7598 		break;
7599 	}
7600 	if (create)
7601 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7602 		    "ext-family", CPI_FAMILY_XTD(cpi));
7603 
7604 	/* ext-model */
7605 	switch (cpi->cpi_vendor) {
7606 	case X86_VENDOR_Intel:
7607 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7608 		break;
7609 	case X86_VENDOR_AMD:
7610 		create = CPI_FAMILY(cpi) == 0xf;
7611 		break;
7612 	case X86_VENDOR_HYGON:
7613 		create = 1;
7614 		break;
7615 	default:
7616 		create = 0;
7617 		break;
7618 	}
7619 	if (create)
7620 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7621 		    "ext-model", CPI_MODEL_XTD(cpi));
7622 
7623 	/* generation */
7624 	switch (cpi->cpi_vendor) {
7625 	case X86_VENDOR_AMD:
7626 	case X86_VENDOR_HYGON:
7627 		/*
7628 		 * AMD K5 model 1 was the first part to support this
7629 		 */
7630 		create = cpi->cpi_xmaxeax >= 0x80000001;
7631 		break;
7632 	default:
7633 		create = 0;
7634 		break;
7635 	}
7636 	if (create)
7637 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7638 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7639 
7640 	/* brand-id */
7641 	switch (cpi->cpi_vendor) {
7642 	case X86_VENDOR_Intel:
7643 		/*
7644 		 * brand id first appeared on Pentium III Xeon model 8,
7645 		 * and Celeron model 8 processors and Opteron
7646 		 */
7647 		create = cpi->cpi_family > 6 ||
7648 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7649 		break;
7650 	case X86_VENDOR_AMD:
7651 		create = cpi->cpi_family >= 0xf;
7652 		break;
7653 	case X86_VENDOR_HYGON:
7654 		create = 1;
7655 		break;
7656 	default:
7657 		create = 0;
7658 		break;
7659 	}
7660 	if (create && cpi->cpi_brandid != 0) {
7661 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7662 		    "brand-id", cpi->cpi_brandid);
7663 	}
7664 
7665 	/* chunks, and apic-id */
7666 	switch (cpi->cpi_vendor) {
7667 		/*
7668 		 * first available on Pentium IV and Opteron (K8)
7669 		 */
7670 	case X86_VENDOR_Intel:
7671 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7672 		break;
7673 	case X86_VENDOR_AMD:
7674 		create = cpi->cpi_family >= 0xf;
7675 		break;
7676 	case X86_VENDOR_HYGON:
7677 		create = 1;
7678 		break;
7679 	default:
7680 		create = 0;
7681 		break;
7682 	}
7683 	if (create) {
7684 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7685 		    "chunks", CPI_CHUNKS(cpi));
7686 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7687 		    "apic-id", cpi->cpi_apicid);
7688 		if (cpi->cpi_chipid >= 0) {
7689 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7690 			    "chip#", cpi->cpi_chipid);
7691 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7692 			    "clog#", cpi->cpi_clogid);
7693 		}
7694 	}
7695 
7696 	/* cpuid-features */
7697 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7698 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7699 
7700 
7701 	/* cpuid-features-ecx */
7702 	switch (cpi->cpi_vendor) {
7703 	case X86_VENDOR_Intel:
7704 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7705 		break;
7706 	case X86_VENDOR_AMD:
7707 		create = cpi->cpi_family >= 0xf;
7708 		break;
7709 	case X86_VENDOR_HYGON:
7710 		create = 1;
7711 		break;
7712 	default:
7713 		create = 0;
7714 		break;
7715 	}
7716 	if (create)
7717 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7718 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7719 
7720 	/* ext-cpuid-features */
7721 	switch (cpi->cpi_vendor) {
7722 	case X86_VENDOR_Intel:
7723 	case X86_VENDOR_AMD:
7724 	case X86_VENDOR_HYGON:
7725 	case X86_VENDOR_Cyrix:
7726 	case X86_VENDOR_TM:
7727 	case X86_VENDOR_Centaur:
7728 		create = cpi->cpi_xmaxeax >= 0x80000001;
7729 		break;
7730 	default:
7731 		create = 0;
7732 		break;
7733 	}
7734 	if (create) {
7735 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7736 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7737 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7738 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7739 	}
7740 
7741 	/*
7742 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7743 	 * model 1, and Cyrix GXm.  On earlier models we try and
7744 	 * simulate something similar .. so this string should always
7745 	 * same -something- about the processor, however lame.
7746 	 */
7747 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7748 	    "brand-string", cpi->cpi_brandstr);
7749 
7750 	/*
7751 	 * Finally, cache and tlb information
7752 	 */
7753 	switch (x86_which_cacheinfo(cpi)) {
7754 	case X86_VENDOR_Intel:
7755 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7756 		break;
7757 	case X86_VENDOR_Cyrix:
7758 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7759 		break;
7760 	case X86_VENDOR_AMD:
7761 		amd_cache_info(cpi, cpu_devi);
7762 		break;
7763 	default:
7764 		break;
7765 	}
7766 }
7767 
/*
 * Request/result record for an L2 cache query.  The three pointers are the
 * caller's optional output locations; any of them may be NULL, in which
 * case that value is not reported.
 */
struct l2info {
	int *l2i_csz;		/* out: cache size, or NULL */
	int *l2i_lsz;		/* out: line size, or NULL */
	int *l2i_assoc;		/* out: associativity, or NULL */
	int l2i_ret;		/* cache size once an L2 has been found */
};
7774 
7775 /*
7776  * A cacheinfo walker that fetches the size, line-size and associativity
7777  * of the L2 cache
7778  */
7779 static int
intel_l2cinfo(void * arg,const struct cachetab * ct)7780 intel_l2cinfo(void *arg, const struct cachetab *ct)
7781 {
7782 	struct l2info *l2i = arg;
7783 	int *ip;
7784 
7785 	if (ct->ct_label != l2_cache_str &&
7786 	    ct->ct_label != sl2_cache_str)
7787 		return (0);	/* not an L2 -- keep walking */
7788 
7789 	if ((ip = l2i->l2i_csz) != NULL)
7790 		*ip = ct->ct_size;
7791 	if ((ip = l2i->l2i_lsz) != NULL)
7792 		*ip = ct->ct_line_size;
7793 	if ((ip = l2i->l2i_assoc) != NULL)
7794 		*ip = ct->ct_assoc;
7795 	l2i->l2i_ret = ct->ct_size;
7796 	return (1);		/* was an L2 -- terminate walk */
7797 }
7798 
/*
 * AMD L2/L3 Cache and TLB Associativity Field Definition:
 *
 *	For the L1 cache and tlb the 8 bit associativity field is the
 *	associativity itself.  For the L2 cache and tlb the field is only
 *	4 bits wide and is instead an encoding: it is used as an index into
 *	amd_afd[] to obtain the associativity.  An entry of -1 means the
 *	encoding is undefined; an entry of 0 means fully associative.
 */

static int amd_afd[] = {
	-1,	/* 0x0: undefined */
	1,	/* 0x1 */
	2,	/* 0x2 */
	-1,	/* 0x3: undefined */
	4,	/* 0x4 */
	-1,	/* 0x5: undefined */
	8,	/* 0x6 */
	-1,	/* 0x7: undefined */
	16,	/* 0x8 */
	-1,	/* 0x9: undefined */
	32,	/* 0xa */
	48,	/* 0xb */
	64,	/* 0xc */
	96,	/* 0xd */
	128,	/* 0xe */
	0	/* 0xf: fully associative */
};
7811 
7812 static void
amd_l2cacheinfo(struct cpuid_info * cpi,struct l2info * l2i)7813 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7814 {
7815 	struct cpuid_regs *cp;
7816 	uint_t size, assoc;
7817 	int i;
7818 	int *ip;
7819 
7820 	if (cpi->cpi_xmaxeax < 0x80000006)
7821 		return;
7822 	cp = &cpi->cpi_extd[6];
7823 
7824 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7825 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7826 		uint_t cachesz = size * 1024;
7827 		assoc = amd_afd[i];
7828 
7829 		ASSERT(assoc != -1);
7830 
7831 		if ((ip = l2i->l2i_csz) != NULL)
7832 			*ip = cachesz;
7833 		if ((ip = l2i->l2i_lsz) != NULL)
7834 			*ip = BITX(cp->cp_ecx, 7, 0);
7835 		if ((ip = l2i->l2i_assoc) != NULL)
7836 			*ip = assoc;
7837 		l2i->l2i_ret = cachesz;
7838 	}
7839 }
7840 
7841 int
getl2cacheinfo(cpu_t * cpu,int * csz,int * lsz,int * assoc)7842 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7843 {
7844 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7845 	struct l2info __l2info, *l2i = &__l2info;
7846 
7847 	l2i->l2i_csz = csz;
7848 	l2i->l2i_lsz = lsz;
7849 	l2i->l2i_assoc = assoc;
7850 	l2i->l2i_ret = -1;
7851 
7852 	switch (x86_which_cacheinfo(cpi)) {
7853 	case X86_VENDOR_Intel:
7854 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7855 		break;
7856 	case X86_VENDOR_Cyrix:
7857 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7858 		break;
7859 	case X86_VENDOR_AMD:
7860 		amd_l2cacheinfo(cpi, l2i);
7861 		break;
7862 	default:
7863 		break;
7864 	}
7865 	return (l2i->l2i_ret);
7866 }
7867 
7868 #if !defined(__xpv)
7869 
7870 uint32_t *
cpuid_mwait_alloc(cpu_t * cpu)7871 cpuid_mwait_alloc(cpu_t *cpu)
7872 {
7873 	uint32_t	*ret;
7874 	size_t		mwait_size;
7875 
7876 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7877 
7878 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7879 	if (mwait_size == 0)
7880 		return (NULL);
7881 
7882 	/*
7883 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7884 	 * allocations.  mwait_size is currently cache line sized.  Neither
7885 	 * of these implementation details are guarantied to be true in the
7886 	 * future.
7887 	 *
7888 	 * First try allocating mwait_size as kmem_alloc() currently returns
7889 	 * correctly aligned memory.  If kmem_alloc() does not return
7890 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7891 	 *
7892 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7893 	 * decide to free this memory.
7894 	 */
7895 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7896 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7897 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7898 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7899 		*ret = MWAIT_RUNNING;
7900 		return (ret);
7901 	} else {
7902 		kmem_free(ret, mwait_size);
7903 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7904 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7905 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7906 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7907 		*ret = MWAIT_RUNNING;
7908 		return (ret);
7909 	}
7910 }
7911 
7912 void
cpuid_mwait_free(cpu_t * cpu)7913 cpuid_mwait_free(cpu_t *cpu)
7914 {
7915 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7916 		return;
7917 	}
7918 
7919 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7920 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7921 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7922 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7923 	}
7924 
7925 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7926 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7927 }
7928 
7929 void
patch_tsc_read(int flag)7930 patch_tsc_read(int flag)
7931 {
7932 	size_t cnt;
7933 
7934 	switch (flag) {
7935 	case TSC_NONE:
7936 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7937 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7938 		break;
7939 	case TSC_RDTSC_LFENCE:
7940 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7941 		(void) memcpy((void *)tsc_read,
7942 		    (void *)&_tsc_lfence_start, cnt);
7943 		break;
7944 	case TSC_TSCP:
7945 		cnt = &_tscp_end - &_tscp_start;
7946 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7947 		break;
7948 	default:
7949 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7950 		cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
7951 		break;
7952 	}
7953 	tsc_type = flag;
7954 }
7955 
7956 int
cpuid_deep_cstates_supported(void)7957 cpuid_deep_cstates_supported(void)
7958 {
7959 	struct cpuid_info *cpi;
7960 	struct cpuid_regs regs;
7961 
7962 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7963 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7964 
7965 	cpi = CPU->cpu_m.mcpu_cpi;
7966 
7967 	switch (cpi->cpi_vendor) {
7968 	case X86_VENDOR_Intel:
7969 	case X86_VENDOR_AMD:
7970 	case X86_VENDOR_HYGON:
7971 		if (cpi->cpi_xmaxeax < 0x80000007)
7972 			return (0);
7973 
7974 		/*
7975 		 * Does TSC run at a constant rate in all C-states?
7976 		 */
7977 		regs.cp_eax = 0x80000007;
7978 		(void) __cpuid_insn(&regs);
7979 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7980 
7981 	default:
7982 		return (0);
7983 	}
7984 }
7985 
7986 #endif	/* !__xpv */
7987 
7988 void
post_startup_cpu_fixups(void)7989 post_startup_cpu_fixups(void)
7990 {
7991 #ifndef __xpv
7992 	/*
7993 	 * Some AMD processors support C1E state. Entering this state will
7994 	 * cause the local APIC timer to stop, which we can't deal with at
7995 	 * this time.
7996 	 */
7997 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7998 		on_trap_data_t otd;
7999 		uint64_t reg;
8000 
8001 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
8002 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
8003 			/* Disable C1E state if it is enabled by BIOS */
8004 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
8005 			    AMD_ACTONCMPHALT_MASK) {
8006 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
8007 				    AMD_ACTONCMPHALT_SHIFT);
8008 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
8009 			}
8010 		}
8011 		no_trap();
8012 	}
8013 #endif	/* !__xpv */
8014 }
8015 
8016 void
enable_pcid(void)8017 enable_pcid(void)
8018 {
8019 	if (x86_use_pcid == -1)
8020 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
8021 
8022 	if (x86_use_invpcid == -1) {
8023 		x86_use_invpcid = is_x86_feature(x86_featureset,
8024 		    X86FSET_INVPCID);
8025 	}
8026 
8027 	if (!x86_use_pcid)
8028 		return;
8029 
8030 	/*
8031 	 * Intel say that on setting PCIDE, it immediately starts using the PCID
8032 	 * bits; better make sure there's nothing there.
8033 	 */
8034 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
8035 
8036 	setcr4(getcr4() | CR4_PCIDE);
8037 }
8038 
8039 /*
8040  * Setup necessary registers to enable XSAVE feature on this processor.
8041  * This function needs to be called early enough, so that no xsave/xrstor
8042  * ops will execute on the processor before the MSRs are properly set up.
8043  *
8044  * Current implementation has the following assumption:
8045  * - cpuid_pass_basic() is done, so that X86 features are known.
8046  * - fpu_probe() is done, so that fp_save_mech is chosen.
8047  */
8048 void
xsave_setup_msr(cpu_t * cpu)8049 xsave_setup_msr(cpu_t *cpu)
8050 {
8051 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
8052 	ASSERT(fp_save_mech == FP_XSAVE);
8053 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
8054 
8055 	/* Enable OSXSAVE in CR4. */
8056 	setcr4(getcr4() | CR4_OSXSAVE);
8057 	/*
8058 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
8059 	 * correct value.
8060 	 */
8061 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
8062 	setup_xfem();
8063 }
8064 
8065 /*
8066  * Starting with the Westmere processor the local
8067  * APIC timer will continue running in all C-states,
8068  * including the deepest C-states.
8069  */
8070 int
cpuid_arat_supported(void)8071 cpuid_arat_supported(void)
8072 {
8073 	struct cpuid_info *cpi;
8074 	struct cpuid_regs regs;
8075 
8076 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8077 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8078 
8079 	cpi = CPU->cpu_m.mcpu_cpi;
8080 
8081 	switch (cpi->cpi_vendor) {
8082 	case X86_VENDOR_Intel:
8083 	case X86_VENDOR_AMD:
8084 	case X86_VENDOR_HYGON:
8085 		/*
8086 		 * Always-running Local APIC Timer is
8087 		 * indicated by CPUID.6.EAX[2].
8088 		 */
8089 		if (cpi->cpi_maxeax >= 6) {
8090 			regs.cp_eax = 6;
8091 			(void) cpuid_insn(NULL, &regs);
8092 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
8093 		} else {
8094 			return (0);
8095 		}
8096 	default:
8097 		return (0);
8098 	}
8099 }
8100 
8101 /*
8102  * Check support for Intel ENERGY_PERF_BIAS feature
8103  */
8104 int
cpuid_iepb_supported(struct cpu * cp)8105 cpuid_iepb_supported(struct cpu *cp)
8106 {
8107 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
8108 	struct cpuid_regs regs;
8109 
8110 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
8111 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8112 
8113 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
8114 		return (0);
8115 	}
8116 
8117 	/*
8118 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
8119 	 * capability bit CPUID.6.ECX.3
8120 	 */
8121 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
8122 		return (0);
8123 
8124 	regs.cp_eax = 0x6;
8125 	(void) cpuid_insn(NULL, &regs);
8126 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
8127 }
8128 
8129 /*
8130  * Check support for TSC deadline timer
8131  *
8132  * TSC deadline timer provides a superior software programming
8133  * model over local APIC timer that eliminates "time drifts".
8134  * Instead of specifying a relative time, software specifies an
8135  * absolute time as the target at which the processor should
8136  * generate a timer event.
8137  */
8138 int
cpuid_deadline_tsc_supported(void)8139 cpuid_deadline_tsc_supported(void)
8140 {
8141 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
8142 	struct cpuid_regs regs;
8143 
8144 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8145 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8146 
8147 	switch (cpi->cpi_vendor) {
8148 	case X86_VENDOR_Intel:
8149 		if (cpi->cpi_maxeax >= 1) {
8150 			regs.cp_eax = 1;
8151 			(void) cpuid_insn(NULL, &regs);
8152 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
8153 		} else {
8154 			return (0);
8155 		}
8156 	default:
8157 		return (0);
8158 	}
8159 }
8160 
8161 #if !defined(__xpv)
8162 /*
8163  * Patch in versions of bcopy for high performance Intel Nhm processors
8164  * and later...
8165  */
8166 void
patch_memops(uint_t vendor)8167 patch_memops(uint_t vendor)
8168 {
8169 	size_t cnt, i;
8170 	caddr_t to, from;
8171 
8172 	if ((vendor == X86_VENDOR_Intel) &&
8173 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
8174 		cnt = &bcopy_patch_end - &bcopy_patch_start;
8175 		to = &bcopy_ck_size;
8176 		from = &bcopy_patch_start;
8177 		for (i = 0; i < cnt; i++) {
8178 			*to++ = *from++;
8179 		}
8180 	}
8181 }
8182 #endif  /*  !__xpv */
8183 
8184 /*
8185  * We're being asked to tell the system how many bits are required to represent
8186  * the various thread and strand IDs. While it's tempting to derive this based
8187  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
8188  * correct. Instead, this needs to be based on the number of bits that the APIC
8189  * allows for these different configurations. We only update these to a larger
8190  * value if we find one.
8191  */
8192 void
cpuid_get_ext_topo(cpu_t * cpu,uint_t * core_nbits,uint_t * strand_nbits)8193 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
8194 {
8195 	struct cpuid_info *cpi;
8196 
8197 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8198 	cpi = cpu->cpu_m.mcpu_cpi;
8199 
8200 	if (cpi->cpi_ncore_bits > *core_nbits) {
8201 		*core_nbits = cpi->cpi_ncore_bits;
8202 	}
8203 
8204 	if (cpi->cpi_nthread_bits > *strand_nbits) {
8205 		*strand_nbits = cpi->cpi_nthread_bits;
8206 	}
8207 }
8208 
8209 void
cpuid_pass_ucode(cpu_t * cpu,uchar_t * fset)8210 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
8211 {
8212 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
8213 	struct cpuid_regs cp;
8214 
8215 	/*
8216 	 * Reread the CPUID portions that we need for various security
8217 	 * information.
8218 	 */
8219 	switch (cpi->cpi_vendor) {
8220 	case X86_VENDOR_Intel:
8221 		/*
8222 		 * Check if we now have leaf 7 available to us.
8223 		 */
8224 		if (cpi->cpi_maxeax < 7) {
8225 			bzero(&cp, sizeof (cp));
8226 			cp.cp_eax = 0;
8227 			cpi->cpi_maxeax = __cpuid_insn(&cp);
8228 			if (cpi->cpi_maxeax < 7)
8229 				break;
8230 		}
8231 
8232 		bzero(&cp, sizeof (cp));
8233 		cp.cp_eax = 7;
8234 		cp.cp_ecx = 0;
8235 		(void) __cpuid_insn(&cp);
8236 		cpi->cpi_std[7] = cp;
8237 		break;
8238 
8239 	case X86_VENDOR_AMD:
8240 	case X86_VENDOR_HYGON:
8241 		/* No xcpuid support */
8242 		if (cpi->cpi_family < 5 ||
8243 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
8244 			break;
8245 
8246 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
8247 			bzero(&cp, sizeof (cp));
8248 			cp.cp_eax = CPUID_LEAF_EXT_0;
8249 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
8250 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
8251 				break;
8252 		}
8253 
8254 		/*
8255 		 * Most AMD features are in leaf 8. Automatic IBRS was added in
8256 		 * leaf 0x21. So we also check that.
8257 		 */
8258 		bzero(&cp, sizeof (cp));
8259 		cp.cp_eax = CPUID_LEAF_EXT_8;
8260 		(void) __cpuid_insn(&cp);
8261 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
8262 		cpi->cpi_extd[8] = cp;
8263 
8264 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21)
8265 			break;
8266 
8267 		bzero(&cp, sizeof (cp));
8268 		cp.cp_eax = CPUID_LEAF_EXT_21;
8269 		(void) __cpuid_insn(&cp);
8270 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
8271 		cpi->cpi_extd[0x21] = cp;
8272 		break;
8273 
8274 	default:
8275 		/*
8276 		 * Nothing to do here. Return an empty set which has already
8277 		 * been zeroed for us.
8278 		 */
8279 		return;
8280 	}
8281 
8282 	cpuid_scan_security(cpu, fset);
8283 }
8284 
8285 /* ARGSUSED */
8286 static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0,xc_arg_t arg1,xc_arg_t arg2)8287 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
8288 {
8289 	uchar_t *fset;
8290 	boolean_t first_pass = (boolean_t)arg1;
8291 
8292 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
8293 	if (first_pass && CPU->cpu_id != 0)
8294 		return (0);
8295 	if (!first_pass && CPU->cpu_id == 0)
8296 		return (0);
8297 	cpuid_pass_ucode(CPU, fset);
8298 
8299 	return (0);
8300 }
8301 
8302 /*
8303  * After a microcode update where the version has changed, then we need to
8304  * rescan CPUID. To do this we check every CPU to make sure that they have the
8305  * same microcode. Then we perform a cross call to all such CPUs. It's the
8306  * caller's job to make sure that no one else can end up doing an update while
8307  * this is going on.
8308  *
8309  * We assume that the system is microcode capable if we're called.
8310  */
8311 void
cpuid_post_ucodeadm(void)8312 cpuid_post_ucodeadm(void)
8313 {
8314 	uint32_t rev;
8315 	int i;
8316 	struct cpu *cpu;
8317 	cpuset_t cpuset;
8318 	void *argdata;
8319 	uchar_t *f0;
8320 
8321 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
8322 
8323 	mutex_enter(&cpu_lock);
8324 	cpu = cpu_get(0);
8325 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
8326 	CPUSET_ONLY(cpuset, 0);
8327 	for (i = 1; i < max_ncpus; i++) {
8328 		if ((cpu = cpu_get(i)) == NULL)
8329 			continue;
8330 
8331 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
8332 			panic("post microcode update CPU %d has differing "
8333 			    "microcode revision (%u) from CPU 0 (%u)",
8334 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
8335 		}
8336 		CPUSET_ADD(cpuset, i);
8337 	}
8338 
8339 	/*
8340 	 * We do the cross calls in two passes. The first pass is only for the
8341 	 * boot CPU. The second pass is for all of the other CPUs. This allows
8342 	 * the boot CPU to go through and change behavior related to patching or
8343 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
8344 	 * other CPUs to follow suit.
8345 	 */
8346 	kpreempt_disable();
8347 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
8348 	    cpuid_post_ucodeadm_xc);
8349 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
8350 	    cpuid_post_ucodeadm_xc);
8351 	kpreempt_enable();
8352 
8353 	/*
8354 	 * OK, now look at each CPU and see if their feature sets are equal.
8355 	 */
8356 	f0 = argdata;
8357 	for (i = 1; i < max_ncpus; i++) {
8358 		uchar_t *fset;
8359 		if (!CPU_IN_SET(cpuset, i))
8360 			continue;
8361 
8362 		fset = (uchar_t *)((uintptr_t)argdata +
8363 		    sizeof (x86_featureset) * i);
8364 
8365 		if (!compare_x86_featureset(f0, fset)) {
8366 			panic("Post microcode update CPU %d has "
8367 			    "differing security feature (%p) set from CPU 0 "
8368 			    "(%p), not appending to feature set", i,
8369 			    (void *)fset, (void *)f0);
8370 		}
8371 	}
8372 
8373 	mutex_exit(&cpu_lock);
8374 
8375 	for (i = 0; i < NUM_X86_FEATURES; i++) {
8376 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8377 		    x86_feature_names[i]);
8378 		if (is_x86_feature(f0, i)) {
8379 			add_x86_feature(x86_featureset, i);
8380 		}
8381 	}
8382 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8383 }
8384 
8385 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8386 
8387 typedef struct cpuid_pass_def {
8388 	cpuid_pass_t cpd_pass;
8389 	cpuid_pass_f cpd_func;
8390 } cpuid_pass_def_t;
8391 
8392 /*
8393  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8394  * normal sense and should not appear here.
8395  */
8396 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8397 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8398 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
8399 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8400 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8401 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8402 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8403 };
8404 
8405 void
cpuid_execpass(cpu_t * cp,cpuid_pass_t pass,void * arg)8406 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8407 {
8408 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8409 
8410 	if (cp == NULL)
8411 		cp = CPU;
8412 
8413 	/*
8414 	 * Space statically allocated for BSP, ensure pointer is set
8415 	 */
8416 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8417 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8418 
8419 	ASSERT(cpuid_checkpass(cp, pass - 1));
8420 
8421 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8422 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8423 			cpuid_pass_defs[i].cpd_func(cp, arg);
8424 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8425 			return;
8426 		}
8427 	}
8428 
8429 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8430 	    pass, cp->cpu_id);
8431 }
8432 
8433 /*
8434  * Extract the processor family from a chiprev.  Processor families are not the
8435  * same as cpuid families; see comments above and in x86_archext.h.
8436  */
8437 x86_processor_family_t
chiprev_family(const x86_chiprev_t cr)8438 chiprev_family(const x86_chiprev_t cr)
8439 {
8440 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8441 }
8442 
8443 /*
8444  * A chiprev matches its template if the vendor and family are identical and the
8445  * revision of the chiprev matches one of the bits set in the template.  Callers
8446  * may bitwise-OR together chiprevs of the same vendor and family to form the
8447  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8448  * multiple vendors or processor families with a single call.  Note that this
8449  * function operates on processor families, not cpuid families.
8450  */
8451 boolean_t
chiprev_matches(const x86_chiprev_t cr,const x86_chiprev_t template)8452 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8453 {
8454 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8455 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8456 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8457 }
8458 
8459 /*
8460  * A chiprev is at least min if the vendor and family are identical and the
8461  * revision of the chiprev is at least as recent as that of min.  Processor
8462  * families are considered unordered and cannot be compared using this function.
8463  * Note that this function operates on processor families, not cpuid families.
8464  * Use of the _ANY chiprev variant with this function is not useful; it will
8465  * always return B_FALSE if the _ANY variant is supplied as the minimum
8466  * revision.  To determine only whether a chiprev is of a given processor
8467  * family, test the return value of chiprev_family() instead.
8468  */
8469 boolean_t
chiprev_at_least(const x86_chiprev_t cr,const x86_chiprev_t min)8470 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8471 {
8472 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8473 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8474 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8475 }
8476 
8477 /*
8478  * The uarch functions operate in a manner similar to the chiprev functions
8479  * above.  While it is tempting to allow these to operate on microarchitectures
8480  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8481  * than ZEN2), we elect not to do so because a manufacturer may supply
8482  * processors of multiple different microarchitecture families each of which may
8483  * be internally ordered but unordered with respect to those of other families.
8484  */
8485 x86_uarch_t
uarchrev_uarch(const x86_uarchrev_t ur)8486 uarchrev_uarch(const x86_uarchrev_t ur)
8487 {
8488 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8489 }
8490 
8491 boolean_t
uarchrev_matches(const x86_uarchrev_t ur,const x86_uarchrev_t template)8492 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8493 {
8494 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8495 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8496 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8497 }
8498 
8499 boolean_t
uarchrev_at_least(const x86_uarchrev_t ur,const x86_uarchrev_t min)8500 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8501 {
8502 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8503 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8504 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8505 }
8506 
8507 /*
8508  * Topology cache related information. This is yet another cache interface that
8509  * we're exposing out intended to be used when we have either Intel Leaf 4 or
8510  * AMD Leaf 8x1D (introduced with Zen 1).
8511  */
8512 static boolean_t
cpuid_cache_topo_sup(const struct cpuid_info * cpi)8513 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8514 {
8515 	switch (cpi->cpi_vendor) {
8516 	case X86_VENDOR_Intel:
8517 		if (cpi->cpi_maxeax >= 4) {
8518 			return (B_TRUE);
8519 		}
8520 		break;
8521 	case X86_VENDOR_AMD:
8522 	case X86_VENDOR_HYGON:
8523 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8524 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8525 			return (B_TRUE);
8526 		}
8527 		break;
8528 	default:
8529 		break;
8530 	}
8531 
8532 	return (B_FALSE);
8533 }
8534 
8535 int
cpuid_getncaches(struct cpu * cpu,uint32_t * ncache)8536 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8537 {
8538 	const struct cpuid_info *cpi;
8539 
8540 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8541 	cpi = cpu->cpu_m.mcpu_cpi;
8542 
8543 	if (!cpuid_cache_topo_sup(cpi)) {
8544 		return (ENOTSUP);
8545 	}
8546 
8547 	*ncache = cpi->cpi_cache_leaf_size;
8548 	return (0);
8549 }
8550 
8551 int
cpuid_getcache(struct cpu * cpu,uint32_t cno,x86_cache_t * cache)8552 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8553 {
8554 	const struct cpuid_info *cpi;
8555 	const struct cpuid_regs *cp;
8556 
8557 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8558 	cpi = cpu->cpu_m.mcpu_cpi;
8559 
8560 	if (!cpuid_cache_topo_sup(cpi)) {
8561 		return (ENOTSUP);
8562 	}
8563 
8564 	if (cno >= cpi->cpi_cache_leaf_size) {
8565 		return (EINVAL);
8566 	}
8567 
8568 	bzero(cache, sizeof (x86_cache_t));
8569 	cp = cpi->cpi_cache_leaves[cno];
8570 	switch (CPI_CACHE_TYPE(cp)) {
8571 	case CPI_CACHE_TYPE_DATA:
8572 		cache->xc_type = X86_CACHE_TYPE_DATA;
8573 		break;
8574 	case CPI_CACHE_TYPE_INSTR:
8575 		cache->xc_type = X86_CACHE_TYPE_INST;
8576 		break;
8577 	case CPI_CACHE_TYPE_UNIFIED:
8578 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8579 		break;
8580 	case CPI_CACHE_TYPE_DONE:
8581 	default:
8582 		return (EINVAL);
8583 	}
8584 	cache->xc_level = CPI_CACHE_LVL(cp);
8585 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8586 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8587 	}
8588 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8589 	/*
8590 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8591 	 * associative, where as it is considered valid on Intel.
8592 	 */
8593 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8594 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8595 		cache->xc_nsets = 1;
8596 	} else {
8597 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8598 	}
8599 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8600 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8601 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8602 	    cache->xc_line_size;
8603 	/*
8604 	 * We're looking for the number of bits to cover the number of CPUs that
8605 	 * are being shared. Normally this would be the value - 1, but the CPUID
8606 	 * value is encoded as the actual value minus one, so we don't modify
8607 	 * this at all.
8608 	 */
8609 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8610 
8611 	/*
8612 	 * To construct a unique ID we construct a uint64_t that looks as
8613 	 * follows:
8614 	 *
8615 	 * [47:40] cache level
8616 	 * [39:32] CPUID cache type
8617 	 * [31:00] shifted APIC ID
8618 	 *
8619 	 * The shifted APIC ID gives us a guarantee that a given cache entry is
8620 	 * unique within its peers. The other two numbers give us something that
8621 	 * ensures that something is unique within the CPU. If we just had the
8622 	 * APIC ID shifted over by the indicated number of bits we'd end up with
8623 	 * an ID of zero for the L1I, L1D, L2, and L3.
8624 	 *
8625 	 * The format of this ID is private to the system and can change across
8626 	 * a reboot for the time being.
8627 	 */
8628 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8629 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8630 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8631 
8632 	return (0);
8633 }
8634