xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision a57aa66e6e494b92c2711a1581224e69b7b40419)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2025 Oxide Computer Company
28  * Copyright 2024 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports symmetric
54  *    multi-processing (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to tell in a
62  * programmatic fashion information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that way the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processor since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it
161  * the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the values in either
179  * are 0xf, then one is to consult the extended model and extended family, which
180  * take previously reserved bits and allow for a larger number of models and add
181  * 0xf to them.
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up is when we have to workaround errata in
198  * the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, they are not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-depedent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU support and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determine the
292  *			appropriate set of workarounds.
293  *
294  *			Pass 1 on the boot CPU occurs before KMDB is started.
295  *
296  *	EXTENDED	The second pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		The third pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The fourth and final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementation are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
326  *
327  * ---------
328  * Microcode
329  * ---------
330  *
331  * Microcode updates may be applied by the firmware (BIOS/UEFI) and/or by the
332  * operating system and may result in architecturally visible changes (e.g.,
333  * changed MSR or CPUID bits). As such, we want to apply any updates as early
334  * as possible during the boot process -- right after the IDENT pass.
335  *
336  * Microcode may also be updated at runtime via ucodeadm(8), after which we do
337  * a selective rescan of the cpuid leaves to determine what features have
338  * changed. Microcode updates can provide more details about security related
339  * features to deal with issues like Spectre and L1TF. On occasion, vendors have
340  * violated their contract and removed bits. However, we don't try to detect
341  * that because that puts us in a situation that we really can't deal with. As
342  * such, the only thing we rescan are security related features today. See
343  * cpuid_pass_ucode(). This is not a pass in the same sense as the others and
344  * is run on demand, via cpuid_post_ucodeadm().
345  *
346  *
347  * All of the passes are run on all CPUs. However, for the most part we only
348  * care about what the boot CPU says about this information and use the other
349  * CPUs as a rough guide to sanity check that we have the same feature set.
350  *
351  * We do not support running multiple logical CPUs with disjoint, let alone
352  * different, feature sets.
353  *
354  * ------------------
355  * Processor Topology
356  * ------------------
357  *
358  * One of the important things that we need to do is to understand the topology
359  * of the underlying processor. When we say topology in this case, we're trying
360  * to understand the relationship between the logical CPUs that the operating
361  * system sees and the underlying physical layout. Different logical CPUs may
362  * share different resources which can have important consequences for the
363  * performance of the system. For example, they may share caches, execution
364  * units, and more.
365  *
366  * The topology of the processor changes from generation to generation and
367  * vendor to vendor.  Along with that, different vendors use different
368  * terminology, and the operating system itself uses occasionally overlapping
369  * terminology. It's important to understand what this topology looks like so
370  * one can understand the different things that we try to calculate and
371  * determine.
372  *
373  * To get started, let's talk about a little bit of terminology that we've used
374  * so far, is used throughout this file, and is fairly generic across multiple
375  * vendors:
376  *
377  * CPU
378  *	A central processing unit (CPU) refers to a logical and/or virtual
379  *	entity that the operating system can execute instructions on. The
380  *	underlying resources for this CPU may be shared between multiple
381  *	entities; however, to the operating system it is a discrete unit.
382  *
383  * PROCESSOR and PACKAGE
384  *
385  *	Generally, when we use the term 'processor' on its own, we are referring
386  *	to the physical entity that one buys and plugs into a board. However,
387  *	because processor has been overloaded and one might see it used to mean
388  *	multiple different levels, we will instead use the term 'package' for
389  *	the rest of this file. The term package comes from the electrical
390  *	engineering side and refers to the physical entity that encloses the
391  *	electronics inside. Strictly speaking the package can contain more than
392  *	just the CPU, for example, on many processors it may also have what's
393  *	called an 'integrated graphical processing unit (GPU)'. Because the
394  *	package can encapsulate multiple units, it is the largest physical unit
395  *	that we refer to.
396  *
397  * SOCKET
398  *
399  *	A socket refers to unit on a system board (generally the motherboard)
400  *	that can receive a package. A single package, or processor, is plugged
401  *	into a single socket. A system may have multiple sockets. Often times,
402  *	the term socket is used interchangeably with package and refers to the
403  *	electrical component that has plugged in, and not the receptacle itself.
404  *
405  * CORE
406  *
407  *	A core refers to the physical instantiation of a CPU, generally, with a
408  *	full set of hardware resources available to it. A package may contain
409  *	multiple cores inside of it or it may just have a single one. A
410  *	processor with more than one core is often referred to as 'multi-core'.
411  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
412  *	that has 'multi-core' processors.
413  *
414  *	A core may expose a single logical CPU to the operating system, or it
415  *	may expose multiple CPUs, which we call threads, defined below.
416  *
417  *	Some resources may still be shared by cores in the same package. For
418  *	example, many processors will share the level 3 cache between cores.
419  *	Some AMD generations share hardware resources between cores. For more
420  *	information on that see the section 'AMD Topology'.
421  *
422  * THREAD and STRAND
423  *
424  *	In this file, generally a thread refers to a hardware resources and not
425  *	the operating system's logical abstraction. A thread is always exposed
426  *	as an independent logical CPU to the operating system. A thread belongs
427  *	to a specific core. A core may have more than one thread. When that is
428  *	the case, the threads that are part of the same core are often referred
429  *	to as 'siblings'.
430  *
431  *	When multiple threads exist, this is generally referred to as
432  *	simultaneous multi-threading (SMT). When Intel introduced this in their
433  *	processors they called it hyper-threading (HT). When multiple threads
434  *	are active in a core, they split the resources of the core. For example,
435  *	two threads may share the same set of hardware execution units.
436  *
437  *	The operating system often uses the term 'strand' to refer to a thread.
438  *	This helps disambiguate it from the software concept.
439  *
440  * CHIP
441  *
442  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
443  *	base meaning, it is used to refer to a single integrated circuit, which
444  *	may or may not be the only thing in the package. In illumos, when you
445  *	see the term 'chip' it is almost always referring to the same thing as
446  *	the 'package'. However, many vendors may use chip to refer to one of
447  *	many integrated circuits that have been placed in the package. As an
448  *	example, see the subsequent definition.
449  *
450  *	To try and keep things consistent, we will only use chip when referring
451  *	to the entire integrated circuit package, with the exception of the
452  *	definition of multi-chip module (because it is in the name) and use the
453  *	term 'die' when we want the more general, potential sub-component
454  *	definition.
455  *
456  * DIE
457  *
458  *	A die refers to an integrated circuit. Inside of the package there may
459  *	be a single die or multiple dies. This is sometimes called a 'chip' in
460  *	vendor's parlance, but in this file, we use the term die to refer to a
461  *	subcomponent.
462  *
463  * MULTI-CHIP MODULE
464  *
465  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
466  *	are connected together in the same package. When a multi-chip design is
467  *	used, generally each chip is manufactured independently and then joined
468  *	together in the package. For example, on AMD's Zen microarchitecture
469  *	(family 0x17), the package contains several dies (the second meaning of
470  *	chip from above) that are connected together.
471  *
472  * CACHE
473  *
474  *	A cache is a part of the processor that maintains copies of recently
475  *	accessed memory. Caches are split into levels and then into types.
476  *	Commonly there are one to three levels, called level one, two, and
477  *	three. The lower the level, the smaller it is, the closer it is to the
478  *	execution units of the CPU, and the faster it is to access. The layout
479  *	and design of the cache come in many different flavors, consult other
480  *	resources for a discussion of those.
481  *
482  *	Caches are generally split into two types, the instruction and data
483  *	cache. The caches contain what their names suggest, the instruction
484  *	cache has executable program text, while the data cache has all other
485  *	memory that the processor accesses. As of this writing, data is kept
486  *	coherent between all of the caches on x86, so if one modifies program
487  *	text before it is executed, that will be in the data cache, and the
488  *	instruction cache will be synchronized with that change when the
489  *	processor actually executes those instructions. This coherency also
490  *	covers the fact that data could show up in multiple caches.
491  *
492  *	Generally, the lowest level caches are specific to a core. However, the
493  *	last layer cache is shared between some number of cores. The number of
494  *	CPUs sharing this last level cache is important. This has implications
495  *	for the choices that the scheduler makes, as accessing memory that might
496  *	be in a remote cache after thread migration can be quite expensive.
497  *
498  *	Sometimes, the word cache is abbreviated with a '$', because in US
499  *	English the word cache is pronounced the same as cash. So L1D$ refers to
500  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
501  *	in the rest of this theory statement for clarity.
502  *
503  * MEMORY CONTROLLER
504  *
505  *	The memory controller is a component that provides access to DRAM. Each
506  *	memory controller can access a set number of DRAM channels. Each channel
507  *	can have a number of DIMMs (sticks of memory) associated with it. A
508  *	given package may have more than one memory controller. The association
509  *	of the memory controller to a group of cores is important as it is
510  *	cheaper to access memory on the controller that you are associated with.
511  *
512  * NUMA
513  *
514  *	NUMA or non-uniform memory access, describes a way that systems are
515  *	built. On x86, any processor core can address all of the memory in the
516  *	system. However, When using multiple sockets or possibly within a
517  *	multi-chip module, some of that memory is physically closer and some of
518  *	it is further. Memory that is further away is more expensive to access.
519  *	Consider the following image of multiple sockets with memory:
520  *
521  *	+--------+                                                +--------+
522  *	| DIMM A |         +----------+      +----------+         | DIMM D |
523  *	+--------+-+       |          |      |          |       +-+------+-+
524  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
525  *	  +--------+-+     |          |      |          |     +-+------+-+
526  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
527  *	    +--------+                                        +--------+
528  *
529  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
530  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
531  *	access DIMMs A-C and more expensive to access D-F as it has to go
532  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
533  *	D-F are cheaper than A-C. While the socket form is the most common, when
534  *	using multi-chip modules, this can also sometimes occur. For another
535  *	example of this that's more involved, see the AMD topology section.
536  *
537  *
538  * Intel Topology
539  * --------------
540  *
541  * Most Intel processors since Nehalem, (as of this writing the current gen
542  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
543  * the package is a single monolithic die. MCMs currently aren't used. Most
544  * parts have three levels of caches, with the L3 cache being shared between
545  * all of the cores on the package. The L1/L2 cache is generally specific to
546  * an individual core. The following image shows at a simplified level what
547  * this looks like. The memory controller is commonly part of something called
548  * the 'Uncore', that used to be separate physical chips that were not a part of
549  * the package, but are now part of the same chip.
550  *
551  *  +-----------------------------------------------------------------------+
552  *  | Package                                                               |
553  *  |  +-------------------+  +-------------------+  +-------------------+  |
554  *  |  | Core              |  | Core              |  | Core              |  |
555  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
556  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
557  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
558  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
559  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
560  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
561  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
562  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
563  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
564  *  |  +-------------------+  +-------------------+  +-------------------+  |
565  *  | +-------------------------------------------------------------------+ |
566  *  | |                         Shared L3 Cache                           | |
567  *  | +-------------------------------------------------------------------+ |
568  *  | +-------------------------------------------------------------------+ |
569  *  | |                        Memory Controller                          | |
570  *  | +-------------------------------------------------------------------+ |
571  *  +-----------------------------------------------------------------------+
572  *
573  * A side effect of this current architecture is that what we care about from a
574  * scheduling and topology perspective, is simplified. In general we care about
575  * understanding which logical CPUs are part of the same core and socket.
576  *
577  * To determine the relationship between threads and cores, Intel initially used
578  * the identifier in the advanced programmable interrupt controller (APIC). They
579  * also added cpuid leaf 4 to give additional information about the number of
580  * threads and CPUs in the processor. With the addition of x2apic (which
581  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
582  * additional cpuid topology leaf 0xB was added.
583  *
584  * AMD Topology
585  * ------------
586  *
587  * When discussing AMD topology, we want to break this into three distinct
588  * generations of topology. There's the basic topology that has been used in
589  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
590  * with family 0x15 (Bulldozer), and there's the topology that was introduced
591  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
592  * 0x17), and tweaked slightly in Zen 3 (family 19h). AMD also has some
593  * additional terminology that's worth talking about.
594  *
595  * Until the introduction of family 0x17 (Zen), AMD did not implement something
596  * that they considered SMT. Whether or not the AMD processors have SMT
597  * influences many things including scheduling and reliability, availability,
598  * and serviceability (RAS) features.
599  *
600  * NODE
601  *
602  *	AMD uses the term node to refer to a die that contains a number of cores
603  *	and I/O resources. Depending on the processor family and model, more
604  *	than one node can be present in the package. When there is more than one
605  *	node this indicates a multi-chip module. Usually each node has its own
606  *	access to memory and I/O devices. This is important and generally
607  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
608  *	result, we track this relationship in the operating system.
609  *
610  *	In processors with an L3 cache, the L3 cache is generally shared across
611  *	the entire node, though the way this is carved up varies from generation
612  *	to generation.
613  *
614  * BULLDOZER
615  *
616  *	Starting with the Bulldozer family (0x15) and continuing until the
617  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
618  *	compute unit. In a compute unit, two traditional cores share a number of
619  *	hardware resources. Critically, they share the FPU, L1 instruction
620  *	cache, and the L2 cache. Several compute units were then combined inside
621  *	of a single node.  Because the integer execution units, L1 data cache,
622  *	and some other resources were not shared between the cores, AMD never
623  *	considered this to be SMT.
624  *
625  * ZEN
626  *
627  *	The Zen family (0x17) uses a multi-chip module (MCM) design, the module
628  *	is called Zeppelin. These modules are similar to the idea of nodes used
629  *	previously. Each of these nodes has two DRAM channels which all of the
630  *	cores in the node can access uniformly. These nodes are linked together
631  *	in the package, creating a NUMA environment.
632  *
633  *	The Zeppelin die itself contains two different 'core complexes'. Each
634  *	core complex consists of four cores which each have two threads, for a
635  *	total of 8 logical CPUs per complex. Unlike other generations,
636  *	where all the logical CPUs in a given node share the L3 cache, here each
637  *	core complex has its own shared L3 cache.
638  *
639  *	A further thing that we need to consider is that in some configurations,
640  *	particularly with the Threadripper line of processors, not every die
641  *	actually has its memory controllers wired up to actual memory channels.
642  *	This means that some cores have memory attached to them and others
643  *	don't.
644  *
645  *	To put Zen in perspective, consider the following images:
646  *
647  *      +--------------------------------------------------------+
648  *      | Core Complex                                           |
649  *      | +-------------------+    +-------------------+  +---+  |
650  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
651  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
652  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
653  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
654  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
655  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
656  *      | +-------------------+    +-------------------+  | C |  |
657  *      | +-------------------+    +-------------------+  | a |  |
658  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
659  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
660  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
661  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
662  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
663  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
664  *      | +-------------------+    +-------------------+  +---+  |
665  *      |                                                        |
666  *	+--------------------------------------------------------+
667  *
668  *  This first image represents a single Zen core complex that consists of four
669  *  cores.
670  *
671  *
672  *	+--------------------------------------------------------+
673  *	| Zeppelin Die                                           |
674  *	|  +--------------------------------------------------+  |
675  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
676  *	|  +--------------------------------------------------+  |
677  *      |                           HH                           |
678  *	|          +-----------+    HH    +-----------+          |
679  *	|          |           |    HH    |           |          |
680  *	|          |    Core   |==========|    Core   |          |
681  *	|          |  Complex  |==========|  Complex  |          |
682  *	|          |           |    HH    |           |          |
683  *	|          +-----------+    HH    +-----------+          |
684  *      |                           HH                           |
685  *	|  +--------------------------------------------------+  |
686  *	|  |                Memory Controller                 |  |
687  *	|  +--------------------------------------------------+  |
688  *      |                                                        |
689  *	+--------------------------------------------------------+
690  *
691  *  This image represents a single Zeppelin Die. Note how both cores are
692  *  connected to the same memory controller and I/O units. While each core
693  *  complex has its own L3 cache as seen in the first image, they both have
694  *  uniform access to memory.
695  *
696  *
697  *                      PP                     PP
698  *                      PP                     PP
699  *           +----------PP---------------------PP---------+
700  *           |          PP                     PP         |
701  *           |    +-----------+          +-----------+    |
702  *           |    |           |          |           |    |
703  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
704  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
705  *           |    |           |          |           |    |
706  *           |    +-----------+ooo    ...+-----------+    |
707  *           |          HH      ooo  ...       HH         |
708  *           |          HH        oo..         HH         |
709  *           |          HH        ..oo         HH         |
710  *           |          HH      ...  ooo       HH         |
711  *           |    +-----------+...    ooo+-----------+    |
712  *           |    |           |          |           |    |
713  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
714  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
715  *           |    |           |          |           |    |
716  *           |    +-----------+          +-----------+    |
717  *           |          PP                     PP         |
718  *           +----------PP---------------------PP---------+
719  *                      PP                     PP
720  *                      PP                     PP
721  *
722  *  This image represents a single Zen package. In this example, it has four
723  *  Zeppelin dies, though some configurations only have a single one. In this
724  *  example, each die is directly connected to the next. Also, each die is
725  *  represented as being connected to memory by the 'M' character and connected
726  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
727  *  die is made up of two core complexes, we have multiple different NUMA
728  *  domains that we care about for these systems.
729  *
730  * ZEN 2
731  *
732  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
733  *	each Zeppelin Die had its own I/O die, that has been moved out of the
734  *	core complex in Zen 2. The actual core complex looks pretty similar, but
735  *	now the die actually looks much simpler:
736  *
737  *      +--------------------------------------------------------+
738  *      | Zen 2 Core Complex Die    HH                           |
739  *      |                           HH                           |
740  *      |          +-----------+    HH    +-----------+          |
741  *      |          |           |    HH    |           |          |
742  *      |          |    Core   |==========|    Core   |          |
743  *      |          |  Complex  |==========|  Complex  |          |
744  *      |          |           |    HH    |           |          |
745  *      |          +-----------+    HH    +-----------+          |
746  *      |                           HH                           |
747  *      |                           HH                           |
748  *      +--------------------------------------------------------+
749  *
750  *	From here, when we add the central I/O die, this changes things a bit.
751  *	Each die is connected to the I/O die, rather than trying to interconnect
752  *	them directly. The following image takes the same Zen 1 image that we
753  *	had earlier and shows what it looks like with the I/O die instead:
754  *
755  *                                 PP    PP
756  *                                 PP    PP
757  *           +---------------------PP----PP---------------------+
758  *           |                     PP    PP                     |
759  *           |  +-----------+      PP    PP      +-----------+  |
760  *           |  |           |      PP    PP      |           |  |
761  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
762  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
763  *           |  |         |o|oooo|          |oooo|o|         |  |
764  *           |  +-----------+    |          |    +-----------+  |
765  *           |                   |   I/O    |                   |
766  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
767  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
768  *           |                   |          |                   |
769  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
770  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
771  *           |                   |          |                   |
772  *           |  +-----------+    |          |    +-----------+  |
773  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
774  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
775  *           |  |    Die    |      PP    PP      |    Die    |  |
776  *           |  |           |      PP    PP      |           |  |
777  *           |  +-----------+      PP    PP      +-----------+  |
778  *           |                     PP    PP                     |
779  *           +---------------------PP----PP---------------------+
780  *                                 PP    PP
781  *                                 PP    PP
782  *
783  *	The above has four core complex dies installed, though the Zen 2 EPYC
784  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
785  *	generally only have one to two. The more notable difference here is how
786  *	everything communicates. Note that memory and PCIe come out of the
787  *	central die. This changes the way that one die accesses a resource. It
788  *	basically always has to go to the I/O die, where as in Zen 1 it may have
789  *	satisfied it locally. In general, this ends up being a better strategy
790  *	for most things, though it is possible to still treat everything in four
791  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
792  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
793  *	now there is only one 'node' present.
794  *
795  * ZEN 3
796  *
797  *	From an architectural perspective, Zen 3 is a much smaller change from
798  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
799  *	its microarchitectural changes. The biggest thing for us is how the die
800  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
801  *	cache. However, in Zen 3, the L3 is now shared between the entire core
802  *	complex die and is no longer partitioned between each core complex. This
803  *	means that all cores on the die can share the same L3 cache. Otherwise,
804  *	the general layout of the overall package with various core complexes
805  *	and an I/O die stays the same. Here's what the Core Complex Die looks
806  *	like in a bit more detail:
807  *
808  *               +-------------------------------------------------+
809  *               | Zen 3 Core Complex Die                          |
810  *               | +-------------------+    +-------------------+  |
811  *               | | Core       +----+ |    | Core       +----+ |  |
812  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
813  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
814  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
815  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
816  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
817  *               | +-------------------+    +-------------------+  |
818  *               | +-------------------+    +-------------------+  |
819  *               | | Core       +----+ |    | Core       +----+ |  |
820  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
821  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
822  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
823  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
824  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
825  *               | +-------------------+    +-------------------+  |
826  *               |                                                 |
827  *               | +--------------------------------------------+  |
828  *               | |                 L3 Cache                   |  |
829  *               | +--------------------------------------------+  |
830  *               |                                                 |
831  *               | +-------------------+    +-------------------+  |
832  *               | | Core       +----+ |    | Core       +----+ |  |
833  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
834  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
835  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
836  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
837  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
838  *               | +-------------------+    +-------------------+  |
839  *               | +-------------------+    +-------------------+  |
840  *               | | Core       +----+ |    | Core       +----+ |  |
841  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
842  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
843  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
844  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
845  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
846  *               | +-------------------+    +-------------------+  |
847  *               +-------------------------------------------------+
848  *
849  *	While it is not pictured, there are connections from the die to the
850  *	broader data fabric and additional functional blocks to support that
851  *	communication and coherency.
852  *
853  * CPUID LEAVES
854  *
855  * There are a few different CPUID leaves that we can use to try and understand
856  * the actual state of the world. As part of the introduction of family 0xf, AMD
857  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
858  * processors that are in the system. Because families before Zen didn't have
859  * SMT, this was always the number of cores that were in the system. However, it
860  * should always be thought of as the number of logical threads to be consistent
861  * between generations. In addition we also get the size of the APIC ID that is
862  * used to represent the number of logical processors. This is important for
863  * deriving topology information.
864  *
865  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
866  * bit between Bulldozer and later families, but it is quite useful in
867  * determining the topology information. Because this information has changed
868  * across family generations, it's worth calling out what these mean
869  * explicitly. The registers have the following meanings:
870  *
871  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
872  *		APIC ID, even though on systems without x2apic support, it will
873  *		be limited to 8 bits.
874  *
875  *	%ebx	On Bulldozer-era systems this contains information about the
876  *		number of cores that are in a compute unit (cores that share
877  *		resources). It also contains a per-package compute unit ID that
878  *		identifies which compute unit the logical CPU is a part of.
879  *
880  *		On Zen-era systems this instead contains the number of threads
881  *		per core and the ID of the core that the logical CPU is a part
882  *		of. Note, this ID is unique only to the package, it is not
883  *		globally unique across the entire system.
884  *
885  *	%ecx	This contains the number of nodes that exist in the package. It
886  *		also contains an ID that identifies which node the logical CPU
887  *		is a part of.
888  *
889  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
890  * cache layout to determine which logical CPUs are sharing which caches.
891  *
892  * illumos Topology
893  * ----------------
894  *
895  * Based on the above we synthesize the information into several different
896  * variables that we store in the 'struct cpuid_info'. We'll go into the details
897  * of what each member is supposed to represent and their uniqueness. In
898  * general, there are two levels of uniqueness that we care about. We care about
899  * an ID that is globally unique. That means that it will be unique across all
900  * entities in the system. For example, the default logical CPU ID is globally
901  * unique. On the other hand, there is some information that we only care about
902  * being unique within the context of a single package / socket. Here are the
903  * variables that we keep track of and their meaning.
904  *
905  * Several of the values that are asking for an identifier, with the exception
906  * of cpi_apicid, are allowed to be synthetic.
907  *
908  *
909  * cpi_apicid
910  *
911  *	This is the value of the CPU's APIC id. This should be the full 32-bit
912  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
913  *	APIC ID. This value is globally unique between all logical CPUs across
914  *	all packages. This is usually required by the APIC.
915  *
916  * cpi_chipid
917  *
918  *	This value indicates the ID of the package that the logical CPU is a
919  *	part of. This value is allowed to be synthetic. It is usually derived by
920  *	taking the CPU's APIC ID and determining how many bits are used to
921  *	represent CPU cores in the package. All logical CPUs that are part of
922  *	the same package must have the same value.
923  *
924  * cpi_coreid
925  *
926  *	This represents the ID of a CPU core. Two logical CPUs should only have
927  *	the same cpi_coreid value if they are part of the same core. These
928  *	values may be synthetic. On systems that support SMT, this value is
929  *	usually derived from the APIC ID, otherwise it is often synthetic and
930  *	just set to the value of the cpu_id in the cpu_t.
931  *
932  * cpi_pkgcoreid
933  *
934  *	This is similar to the cpi_coreid in that logical CPUs that are part of
935  *	the same core should have the same ID. The main difference is that these
936  *	values are only required to be unique to a given socket.
937  *
938  * cpi_clogid
939  *
940  *	This represents the logical ID of a logical CPU. This value should be
941  *	unique within a given socket for each logical CPU. This is allowed to be
942  *	synthetic, though it is usually based off of the CPU's apic ID. The
943  *	broader system expects that logical CPUs that have are part of the same
944  *	core have contiguous numbers. For example, if there were two threads per
945  *	core, then the core IDs divided by two should be the same and the first
946  *	modulus two should be zero and the second one. For example, IDs 4 and 5
947  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
948  *	6 represent two logical CPUs that are part of different cores.
949  *
950  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
951  *	from the same source, strictly speaking, they don't have to be and the
952  *	two values should be considered logically independent. One should not
953  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
954  *	some kind of relationship. While this is tempting, we've seen cases on
955  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
956  *
957  * cpi_ncpu_per_chip
958  *
959  *	This value indicates the total number of logical CPUs that exist in the
960  *	physical package. Critically, this is not the number of logical CPUs
961  *	that exist for just the single core.
962  *
963  *	This value should be the same for all logical CPUs in the same package.
964  *
965  * cpi_ncore_per_chip
966  *
967  *	This value indicates the total number of physical CPU cores that exist
968  *	in the package. The system compares this value with cpi_ncpu_per_chip to
969  *	determine if simultaneous multi-threading (SMT) is enabled. When
970  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
971  *	the X86FSET_HTT feature is not set. If this value is greater than one,
972  *	than we consider the processor to have the feature X86FSET_CMP, to
973  *	indicate that there is support for more than one core.
974  *
975  *	This value should be the same for all logical CPUs in the same package.
976  *
977  * cpi_procnodes_per_pkg
978  *
979  *	This value indicates the number of 'nodes' that exist in the package.
980  *	When processors are actually a multi-chip module, this represents the
981  *	number of such modules that exist in the package. Currently, on Intel
982  *	based systems this member is always set to 1.
983  *
984  *	This value should be the same for all logical CPUs in the same package.
985  *
986  * cpi_procnodeid
987  *
988  *	This value indicates the ID of the node that the logical CPU is a part
989  *	of. All logical CPUs that are in the same node must have the same value
990  *	here. This value must be unique across all of the packages in the
991  *	system.  On Intel based systems, this is currently set to the value in
992  *	cpi_chipid because there is only one node.
993  *
994  * cpi_cores_per_compunit
995  *
996  *	This value indicates the number of cores that are part of a compute
997  *	unit. See the AMD topology section for this. This member only has real
998  *	meaning currently for AMD Bulldozer family processors. For all other
999  *	processors, this should currently be set to 1.
1000  *
1001  * cpi_compunitid
1002  *
1003  *	This indicates the compute unit that the logical CPU belongs to. For
1004  *	processors without AMD Bulldozer-style compute units this should be set
1005  *	to the value of cpi_coreid.
1006  *
1007  * cpi_ncpu_shr_last_cache
1008  *
1009  *	This indicates the number of logical CPUs that are sharing the same last
1010  *	level cache. This value should be the same for all CPUs that are sharing
1011  *	that cache. The last cache refers to the cache that is closest to memory
1012  *	and furthest away from the CPU.
1013  *
1014  * cpi_last_lvl_cacheid
1015  *
1016  *	This indicates the ID of the last cache that the logical CPU uses. This
1017  *	cache is often shared between multiple logical CPUs and is the cache
1018  *	that is closest to memory and furthest away from the CPU. This value
1019  *	should be the same for a group of logical CPUs only if they actually
1020  *	share the same last level cache. IDs should not overlap between
1021  *	packages.
1022  *
1023  * cpi_ncore_bits
1024  *
1025  *	This indicates the number of bits that are required to represent all of
1026  *	the cores in the system. As cores are derived based on their APIC IDs,
1027  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1028  *	this value to be larger than the actual number of IDs that are present
1029  *	in the system. This is used to size tables by the CMI framework. It is
1030  *	only filled in for Intel and AMD CPUs.
1031  *
1032  * cpi_nthread_bits
1033  *
1034  *	This indicates the number of bits required to represent all of the IDs
1035  *	that cover the logical CPUs that exist on a given core. It's OK for this
1036  *	value to be larger than the actual number of IDs that are present in the
1037  *	system.  This is used to size tables by the CMI framework. It is
1038  *	only filled in for Intel and AMD CPUs.
1039  *
1040  * -----------
1041  * Hypervisors
1042  * -----------
1043  *
1044  * If trying to manage the differences between vendors wasn't bad enough, it can
1045  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1046  * the ability to interpose on all cpuid instructions and change them to suit
1047  * their purposes. In general, this is necessary as the hypervisor wants to be
1048  * able to present a more uniform set of features or not necessarily give the
1049  * guest operating system kernel knowledge of all features so it can be
1050  * more easily migrated between systems.
1051  *
1052  * When it comes to trying to determine topology information, this can be a
1053  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1054  * leaf, it'll often return all zeros. Because of that, you'll often see various
1055  * checks scattered about fields being non-zero before we assume we can use
1056  * them.
1057  *
1058  * When it comes to topology information, the hypervisor is often incentivized
1059  * to lie to you about topology. This is because it doesn't always actually
1060  * guarantee that topology at all. The topology path we take in the system
1061  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1062  * or AMD CPU, then we basically do our normal path. However, when they don't
1063  * use an actual vendor, then that usually turns into multiple one-core CPUs
1064  * that we enumerate that are often on different sockets. The actual behavior
1065  * depends greatly on what the hypervisor actually exposes to us.
1066  *
1067  * --------------------
1068  * Exposing Information
1069  * --------------------
1070  *
1071  * We expose CPUID information in three different forms in the system.
1072  *
1073  * The first is through the x86_featureset variable. This is used in conjunction
1074  * with the is_x86_feature() function. This is queried by x86-specific functions
1075  * to determine which features are or aren't present in the system and to make
1076  * decisions based upon them. For example, users of this include everything from
1077  * parts of the system dedicated to reliability, availability, and
1078  * serviceability (RAS), to making decisions about how to handle security
1079  * mitigations, to various x86-specific drivers. General purpose or
1080  * architecture independent drivers should never be calling this function.
1081  *
1082  * The second means is through the auxiliary vector. The auxiliary vector is a
1083  * series of tagged data that the kernel passes down to a user program when it
1084  * begins executing. This information is used to indicate to programs what
1085  * instruction set extensions are present. For example, information about the
1086  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1087  * since user programs cannot make use of it. However, things like the AVX
1088  * instruction sets are. Programs use this information to make run-time
1089  * decisions about what features they should use. As an example, the run-time
1090  * link-editor (rtld) can relocate different functions depending on the hardware
1091  * support available.
1092  *
1093  * The final form is through a series of accessor functions that all have the
1094  * form cpuid_get*. This is used by a number of different subsystems in the
1095  * kernel to determine more detailed information about what we're running on,
1096  * topology information, etc. Some of these subsystems include processor groups
1097  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1098  * microcode, and performance monitoring. These functions all ASSERT that the
1099  * CPU they're being called on has reached a certain cpuid pass. If the passes
1100  * are rearranged, then this needs to be adjusted.
1101  *
1102  * -----------------------------------------------
1103  * Speculative Execution CPU Side Channel Security
1104  * -----------------------------------------------
1105  *
1106  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1107  * execution in the CPU to create side channels there have been a number of
1108  * different attacks and corresponding issues that the operating system needs to
1109  * mitigate against. The following list is some of the common, but not
1110  * exhaustive, set of issues that we know about and have done some or need to do
1111  * more work in the system to mitigate against:
1112  *
1113  *   - Spectre v1
1114  *   - swapgs (Spectre v1 variant)
1115  *   - Spectre v2
1116  *     - Branch History Injection (BHI).
1117  *   - Meltdown (Spectre v3)
1118  *   - Rogue Register Read (Spectre v3a)
1119  *   - Speculative Store Bypass (Spectre v4)
1120  *   - ret2spec, SpectreRSB
1121  *   - L1 Terminal Fault (L1TF)
1122  *   - Microarchitectural Data Sampling (MDS)
1123  *   - Register File Data Sampling (RFDS)
1124  *
1125  * Each of these requires different sets of mitigations and has different attack
1126  * surfaces. For the most part, this discussion is about protecting the kernel
1127  * from non-kernel executing environments such as user processes and hardware
1128  * virtual machines. Unfortunately, there are a number of user vs. user
1129  * scenarios that exist with these. The rest of this section will describe the
1130  * overall approach that the system has taken to address these as well as their
1131  * shortcomings. Unfortunately, not all of the above have been handled today.
1132  *
1133  * SPECTRE v2, ret2spec, SpectreRSB
1134  *
1135  * The second variant of the spectre attack focuses on performing branch target
1136  * injection. This generally impacts indirect call instructions in the system.
1137  * There are four different ways to mitigate this issue that are commonly
1138  * described today:
1139  *
1140  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1141  *  2. Using Retpolines and RSB Stuffing
1142  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1143  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1144  *
1145  * IBRS uses a feature added to microcode to restrict speculation, among other
1146  * things. This form of mitigation has not been used as it has been generally
1147  * seen as too expensive and requires reactivation upon various transitions in
1148  * the system.
1149  *
1150  * As a less impactful alternative to IBRS, retpolines were developed by
1151  * Google. These basically require one to replace indirect calls with a specific
1152  * trampoline that will cause speculation to fail and break the attack.
1153  * Retpolines require compiler support. We always build with retpolines in the
1154  * external thunk mode. This means that a traditional indirect call is replaced
1155  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1156  * of this is that all indirect function calls are performed through a register.
1157  *
1158  * We have to use a common external location of the thunk and not inline it into
1159  * the callsite so that way we can have a single place to patch these functions.
1160  * As it turns out, we currently have two different forms of retpolines that
1161  * exist in the system:
1162  *
1163  *  1. A full retpoline
1164  *  2. A no-op version
1165  *
1166  * The first one is used in the general case. Historically, there was an
1167  * AMD-specific optimized retopoline variant that was based around using a
1168  * serializing lfence instruction; however, in March 2022 it was announced that
1169  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1170  * use it and it is no longer available in the system.
1171  *
1172  * The third form described above is the most curious. It turns out that the way
1173  * that retpolines are implemented is that they rely on how speculation is
1174  * performed on a 'ret' instruction. Intel has continued to optimize this
1175  * process (which is partly why we need to have return stack buffer stuffing,
1176  * but more on that in a bit) and in processors starting with Cascade Lake
1177  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1178  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1179  *
1180  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1181  * physical core. However, if this is the case, we don't want to use retpolines
1182  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1183  * function (called a thunk) into a jmp instruction. This means that we're still
1184  * paying the cost of an extra jump to the external thunk, but it gives us
1185  * flexibility and the ability to have a single kernel image that works across a
1186  * wide variety of systems and hardware features.
1187  *
1188  * Unfortunately, this alone is insufficient. First, Skylake systems have
1189  * additional speculation for the Return Stack Buffer (RSB) which is used to
1190  * return from call instructions which retpolines take advantage of. However,
1191  * this problem is not just limited to Skylake and is actually more pernicious.
1192  * The SpectreRSB paper introduces several more problems that can arise with
1193  * dealing with this. The RSB can be poisoned just like the indirect branch
1194  * predictor. This means that one needs to clear the RSB when transitioning
1195  * between two different privilege domains. Some examples include:
1196  *
1197  *  - Switching between two different user processes
1198  *  - Going between user land and the kernel
1199  *  - Returning to the kernel from a hardware virtual machine
1200  *
1201  * Mitigating this involves combining a couple of different things. The first is
1202  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1203  * Bridge. When an RSB entry refers to a user address and we're executing in the
1204  * kernel, speculation through it will be stopped when SMEP is enabled. This
1205  * protects against a number of the different cases that we would normally be
1206  * worried about such as when we enter the kernel from user land.
1207  *
1208  * To prevent against additional manipulation of the RSB from other contexts
1209  * such as a non-root VMX context attacking the kernel we first look to
1210  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1211  * nothing else that we need to do to protect the kernel at this time.
1212  *
1213  * Unfortunately, not all eIBRS implementations are sufficient to guard
1214  * against RSB manipulations, so we still need to manually overwrite the
1215  * contents of the return stack buffer unless the hardware specifies we are
1216  * covered. We do this through the x86_rsb_stuff() function.  Currently this
1217  * is employed on context switch and vmx_exit. The x86_rsb_stuff() function is
1218  * disabled only when mitigations in general are, or if we have hardware
1219  * indicating no need for post-barrier RSB protections, either in one place
1220  * (old hardware), or on both (newer hardware).
1221  *
1222  * If SMEP is not present, then we would have to stuff the RSB every time we
1223  * transitioned from user mode to the kernel, which isn't very practical right
1224  * now.
1225  *
1226  * To fully protect user to user and vmx to vmx attacks from these classes of
1227  * issues, we would also need to allow them to opt into performing an Indirect
1228  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1229  *
1230  * The fourth form of mitigation here is specific to AMD and is called Automated
1231  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1232  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1233  * (extended feature enable register) MSR. This bit basically says that IBRS
1234  * acts as though it is always active when executing at CPL0 and when executing
1235  * in the 'host' context when SEV-SNP is enabled.
1236  *
1237  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1238  * therefore it is unnecessary. While this handles RSB stuffing attacks from SVM
1239  * to the kernel, we must still consider the remaining cases that exist, just
1240  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1241  * traditional technique to work, this is not true on all CPUs. While a write to
1242  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1243  * not otherwise), AMD states that as long as at leat a single 4 KiB unmapped
1244  * guard page is present between user and kernel address spaces and SMEP is
1245  * enabled, then there is no need to clear the RSB at all.
1246  *
1247  * By default, the system will enable RSB stuffing and the required variant of
1248  * retpolines and store that information in the x86_spectrev2_mitigation value.
1249  * This will be evaluated after a microcode update as well, though it is
1250  * expected that microcode updates will not take away features. This may mean
1251  * that a late loaded microcode may not end up in the optimal configuration
1252  * (though this should be rare).
1253  *
1254  * Currently we do not build kmdb with retpolines or perform any additional side
1255  * channel security mitigations for it. One complication with kmdb is that it
1256  * requires its own retpoline thunks and it would need to adjust itself based on
1257  * what the kernel does. The threat model of kmdb is more limited and therefore
1258  * it may make more sense to investigate using prediction barriers as the whole
1259  * system is only executing a single instruction at a time while in kmdb.
1260  *
1261  * Branch History Injection (BHI)
1262  *
1263  * BHI is a specific form of SPECTREv2 where an attacker may manipulate branch
1264  * history before transitioning from user to supervisor mode (or from VMX
1265  * non-root/guest to root mode). The attacker can then exploit certain
1266  * compiler-generated code-sequences ("gadgets") to disclose information from
1267  * other contexts or domains.  Recent (late-2023/early-2024) research in
1268  * object code analysis discovered many more potential gadgets than what was
1269  * initially reported (which previously was confined to Linux use of
1270  * unprivileged eBPF).
1271  *
1272  * The BHI threat doesn't exist in processsors that predate eIBRS, or in AMD
1273  * ones. Some eIBRS processors have the ability to disable branch history in
1274  * certain (but not all) cases using an MSR write. eIBRS processors that don't
1275  * have the ability to disable must use a software sequence to scrub the
1276  * branch history buffer.
1277  *
1278  * BHI_DIS_S (the aforementioned MSR) prevents ring 0 from ring 3 (VMX guest
1279  * or VMX root). It does not protect different user processes from each other,
1280  * or ring 3 VMX guest from ring 3 VMX root or vice versa.
1281  *
1282  * The BHI clearing sequence prevents user exploiting kernel gadgets, and user
1283  * A's use of user B's gadgets.
1284  *
1285  * SMEP and eIBRS are a continuing defense-in-depth measure protecting the
1286  * kernel.
1287  *
1288  * SPECTRE v1, v4
1289  *
1290  * The v1 and v4 variants of spectre are not currently mitigated in the
1291  * system and require other classes of changes to occur in the code.
1292  *
1293  * SPECTRE v1 (SWAPGS VARIANT)
1294  *
1295  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1296  * can generally affect any branch-dependent code. The swapgs issue is one
1297  * variant of this. If we are coming in from userspace, we can have code like
1298  * this:
1299  *
1300  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1301  *	je	1f
1302  *	movq	$0, REGOFF_SAVFP(%rsp)
1303  *	swapgs
1304  *	1:
1305  *	movq	%gs:CPU_THREAD, %rax
1306  *
1307  * If an attacker can cause a mis-speculation of the branch here, we could skip
1308  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1309  * load. If subsequent code can act as the usual Spectre cache gadget, this
1310  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1311  * any use of the %gs override.
1312  *
1313  * The other case is also an issue: if we're coming into a trap from kernel
1314  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1315  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1316  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1317  * case, and the fix is the same in both cases (an lfence at the branch target
1318  * 1: in this example), we'll just do it unconditionally.
1319  *
1320  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1321  * harder for user-space to actually set a useful %gsbase value: although it's
1322  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1323  * mitigate anyway.
1324  *
1325  * MELTDOWN
1326  *
1327  * Meltdown, or spectre v3, allowed a user process to read any data in their
1328  * address space regardless of whether or not the page tables in question
1329  * allowed the user to have the ability to read them. The solution to meltdown
1330  * is kernel page table isolation. In this world, there are two page tables that
1331  * are used for a process, one in user land and one in the kernel. To implement
1332  * this we use per-CPU page tables and switch between the user and kernel
1333  * variants when entering and exiting the kernel.  For more information about
1334  * this process and how the trampolines work, please see the big theory
1335  * statements and additional comments in:
1336  *
1337  *  - uts/i86pc/ml/kpti_trampolines.s
1338  *  - uts/i86pc/vm/hat_i86.c
1339  *
1340  * While Meltdown only impacted Intel systems and there are also Intel systems
1341  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1342  * kernel page table isolation enabled. While this may at first seem weird, an
1343  * important thing to remember is that you can't speculatively read an address
1344  * if it's never in your page table at all. Having user processes without kernel
1345  * pages present provides us with an important layer of defense in the kernel
1346  * against any other side channel attacks that exist and have yet to be
1347  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1348  * default, no matter the x86 system.
1349  *
1350  * L1 TERMINAL FAULT
1351  *
1352  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1353  * execution uses page table entries. Effectively, it is two different problems.
1354  * The first is that it ignores the not present bit in the page table entries
1355  * when performing speculative execution. This means that something can
1356  * speculatively read the listed physical address if it's present in the L1
1357  * cache under certain conditions (see Intel's documentation for the full set of
1358  * conditions). Secondly, this can be used to bypass hardware virtualization
1359  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1360  * instructions.
1361  *
1362  * For the non-hardware virtualized case, this is relatively easy to deal with.
1363  * We must make sure that all unmapped pages have an address of zero. This means
1364  * that they could read the first 4k of physical memory; however, we never use
1365  * that first page in the operating system and always skip putting it in our
1366  * memory map, even if firmware tells us we can use it in our memory map. While
1367  * other systems try to put extra metadata in the address and reserved bits,
1368  * which led to this being problematic in those cases, we do not.
1369  *
1370  * For hardware virtual machines things are more complicated. Because they can
1371  * construct their own page tables, it isn't hard for them to perform this
1372  * attack against any physical address. The one wrinkle is that this physical
1373  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1374  * to flush the L1 data cache. We wrap this up in the function
1375  * spec_uarch_flush(). This function is also used in the mitigation of
1376  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1377  * hypervisors such as KVM or bhyve are responsible for performing this before
1378  * entering the guest.
1379  *
1380  * Because this attack takes place in the L1 cache, there's another wrinkle
1381  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1382  * designs. This means that when a thread enters a hardware virtualized context
1383  * and flushes the L1 data cache, the other thread on the processor may then go
1384  * ahead and put new data in it that can be potentially attacked. While one
1385  * solution is to disable SMT on the system, another option that is available is
1386  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1387  * goes through and makes sure that if a HVM is being scheduled on one thread,
1388  * then the thing on the other thread is from the same hardware virtual machine.
1389  * If an interrupt comes in or the guest exits to the broader system, then the
1390  * other SMT thread will be kicked out.
1391  *
1392  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1393  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1394  * perform L1TF related mitigations.
1395  *
1396  * MICROARCHITECTURAL DATA SAMPLING
1397  *
1398  * Microarchitectural data sampling (MDS) is a combination of four discrete
1399  * vulnerabilities that are similar issues affecting various parts of the CPU's
1400  * microarchitectural implementation around load, store, and fill buffers.
1401  * Specifically it is made up of the following subcomponents:
1402  *
1403  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1404  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1405  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1406  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1407  *
1408  * To begin addressing these, Intel has introduced another feature in microcode
1409  * called MD_CLEAR. This changes the verw instruction to operate in a different
1410  * way. This allows us to execute the verw instruction in a particular way to
1411  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1412  * updated when this microcode is present to flush this state.
1413  *
1414  * Primarily we need to flush this state whenever we transition from the kernel
1415  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1416  * little bit different. Here the structures are statically sized when a logical
1417  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1418  * flush the microarchitectural state before the CPU goes idles by calling hlt,
1419  * mwait, or another ACPI method. To perform these flushes, we call
1420  * x86_md_clear() at all of these transition points.
1421  *
1422  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1423  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1424  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1425  * a no-op.
1426  *
1427  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1428  * particular, everything we've discussed above is only valid for a single
1429  * thread executing on a core. In the case where you have hyper-threading
1430  * present, this attack can be performed between threads. The theoretical fix
1431  * for this is to ensure that both threads are always in the same security
1432  * domain. This means that they are executing in the same ring and mutually
1433  * trust each other. Practically speaking, this would mean that a system call
1434  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1435  * Rather than implement this, we recommend that one disables hyper-threading
1436  * through the use of psradm -aS.
1437  *
1438  * TSX ASYNCHRONOUS ABORT
1439  *
1440  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1441  * behaves like MDS, but leverages Intel's transactional instructions as another
1442  * vector. Effectively, when a transaction hits one of these cases (unmapped
1443  * page, various cache snoop activity, etc.) then the same data can be exposed
1444  * as in the case of MDS. This means that you can attack your twin.
1445  *
1446  * Intel has described that there are two different ways that we can mitigate
1447  * this problem on affected processors:
1448  *
1449  *   1) We can use the same techniques used to deal with MDS. Flushing the
1450  *      microarchitectural buffers and disabling hyperthreading will mitigate
1451  *      this in the same way.
1452  *
1453  *   2) Using microcode to disable TSX.
1454  *
1455  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1456  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1457  * That's OK as we're already doing all such mitigations. On the other hand,
1458  * processors with MDS_NO are all supposed to receive microcode updates that
1459  * enumerate support for disabling TSX. In general, we'd rather use this method
1460  * when available as it doesn't require disabling hyperthreading to be
1461  * effective. Currently we basically are relying on microcode for processors
1462  * that enumerate MDS_NO.
1463  *
1464  * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1465  * Sampling: RFDS. This allows an attacker to sample values that were in any
1466  * of integer, floating point, or vector registers. This was discovered by
1467  * Intel during internal validation work.  The existence of the RFDS_NO
1468  * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1469  * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1470  * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1471  * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1472  * MSR that L1D uses.
1473  *
1474  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1475  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1476  * different powers. The first allows us to cause all transactions to
1477  * immediately abort. The second gives us a means of disabling TSX completely,
1478  * which includes removing it from cpuid. If we have support for this in
1479  * microcode during the first cpuid pass, then we'll disable TSX completely such
1480  * that user land never has a chance to observe the bit. However, if we are late
1481  * loading the microcode, then we must use the functionality to cause
1482  * transactions to automatically abort. This is necessary for user land's sake.
1483  * Once a program sees a cpuid bit, it must not be taken away.
1484  *
1485  * We track whether or not we should do this based on what cpuid pass we're in.
1486  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1487  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1488  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1489  * second time after we do the initial microcode update.  As a result we need to
1490  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1491  * suitable microcode on the current CPU (which happens prior to
1492  * cpuid_pass_ucode()).
1493  *
1494  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1495  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1496  * unfortunate feature in a number of ways, and taking the opportunity to
1497  * finally be able to turn it off is likely to be of benefit in the future.
1498  *
1499  * SUMMARY
1500  *
1501  * The following table attempts to summarize the mitigations for various issues
1502  * and what's done in various places:
1503  *
1504  *  - Spectre v1: Not currently mitigated
1505  *  - swapgs: lfences after swapgs paths
1506  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1507  *  - Meltdown: Kernel Page Table Isolation
1508  *  - Spectre v3a: Updated CPU microcode
1509  *  - Spectre v4: Not currently mitigated
1510  *  - SpectreRSB: SMEP and RSB Stuffing
1511  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1512  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1513  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1514  *  - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1515  *  - BHI: software sequence, and use of BHI_DIS_S if microcode has it.
1516  *
1517  * The following table indicates the x86 feature set bits that indicate that a
1518  * given problem has been solved or a notable feature is present:
1519  *
1520  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1521  *  - MDS_NO: All forms of MDS
1522  *  - TAA_NO: TAA
1523  *  - RFDS_NO: RFDS
1524  *  - BHI_NO: BHI
1525  */
1526 
1527 #include <sys/types.h>
1528 #include <sys/archsystm.h>
1529 #include <sys/x86_archext.h>
1530 #include <sys/kmem.h>
1531 #include <sys/systm.h>
1532 #include <sys/cmn_err.h>
1533 #include <sys/sunddi.h>
1534 #include <sys/sunndi.h>
1535 #include <sys/cpuvar.h>
1536 #include <sys/processor.h>
1537 #include <sys/sysmacros.h>
1538 #include <sys/pg.h>
1539 #include <sys/fp.h>
1540 #include <sys/controlregs.h>
1541 #include <sys/bitmap.h>
1542 #include <sys/auxv_386.h>
1543 #include <sys/memnode.h>
1544 #include <sys/pci_cfgspace.h>
1545 #include <sys/comm_page.h>
1546 #include <sys/mach_mmu.h>
1547 #include <sys/ucode.h>
1548 #include <sys/tsc.h>
1549 #include <sys/kobj.h>
1550 #include <sys/asm_misc.h>
1551 #include <sys/bitmap.h>
1552 
1553 #ifdef __xpv
1554 #include <sys/hypervisor.h>
1555 #else
1556 #include <sys/ontrap.h>
1557 #endif
1558 
1559 uint_t x86_vendor = X86_VENDOR_IntelClone;
1560 uint_t x86_type = X86_TYPE_OTHER;
1561 uint_t x86_clflush_size = 0;
1562 
1563 #if defined(__xpv)
1564 int x86_use_pcid = 0;
1565 int x86_use_invpcid = 0;
1566 #else
1567 int x86_use_pcid = -1;
1568 int x86_use_invpcid = -1;
1569 #endif
1570 
1571 typedef enum {
1572 	X86_SPECTREV2_RETPOLINE,
1573 	X86_SPECTREV2_ENHANCED_IBRS,
1574 	X86_SPECTREV2_AUTO_IBRS,
1575 	X86_SPECTREV2_DISABLED
1576 } x86_spectrev2_mitigation_t;
1577 
1578 uint_t x86_disable_spectrev2 = 0;
1579 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1580     X86_SPECTREV2_RETPOLINE;
1581 
1582 /*
1583  * The mitigation status for TAA:
1584  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1585  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1586  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1587  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1588  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1589  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1590  */
1591 typedef enum {
1592 	X86_TAA_NOTHING,
1593 	X86_TAA_DISABLED,
1594 	X86_TAA_MD_CLEAR,
1595 	X86_TAA_TSX_FORCE_ABORT,
1596 	X86_TAA_TSX_DISABLE,
1597 	X86_TAA_HW_MITIGATED
1598 } x86_taa_mitigation_t;
1599 
1600 uint_t x86_disable_taa = 0;
1601 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1602 
1603 uint_t pentiumpro_bug4046376;
1604 
1605 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1606 
1607 static char *x86_feature_names[NUM_X86_FEATURES] = {
1608 	"lgpg",
1609 	"tsc",
1610 	"msr",
1611 	"mtrr",
1612 	"pge",
1613 	"de",
1614 	"cmov",
1615 	"mmx",
1616 	"mca",
1617 	"pae",
1618 	"cv8",
1619 	"pat",
1620 	"sep",
1621 	"sse",
1622 	"sse2",
1623 	"htt",
1624 	"asysc",
1625 	"nx",
1626 	"sse3",
1627 	"cx16",
1628 	"cmp",
1629 	"tscp",
1630 	"mwait",
1631 	"sse4a",
1632 	"cpuid",
1633 	"ssse3",
1634 	"sse4_1",
1635 	"sse4_2",
1636 	"1gpg",
1637 	"clfsh",
1638 	"64",
1639 	"aes",
1640 	"pclmulqdq",
1641 	"xsave",
1642 	"avx",
1643 	"vmx",
1644 	"svm",
1645 	"topoext",
1646 	"f16c",
1647 	"rdrand",
1648 	"x2apic",
1649 	"avx2",
1650 	"bmi1",
1651 	"bmi2",
1652 	"fma",
1653 	"smep",
1654 	"smap",
1655 	"adx",
1656 	"rdseed",
1657 	"mpx",
1658 	"avx512f",
1659 	"avx512dq",
1660 	"avx512pf",
1661 	"avx512er",
1662 	"avx512cd",
1663 	"avx512bw",
1664 	"avx512vl",
1665 	"avx512fma",
1666 	"avx512vbmi",
1667 	"avx512_vpopcntdq",
1668 	"avx512_4vnniw",
1669 	"avx512_4fmaps",
1670 	"xsaveopt",
1671 	"xsavec",
1672 	"xsaves",
1673 	"sha",
1674 	"umip",
1675 	"pku",
1676 	"ospke",
1677 	"pcid",
1678 	"invpcid",
1679 	"ibrs",
1680 	"ibpb",
1681 	"stibp",
1682 	"ssbd",
1683 	"ssbd_virt",
1684 	"rdcl_no",
1685 	"ibrs_all",
1686 	"rsba",
1687 	"ssb_no",
1688 	"stibp_all",
1689 	"flush_cmd",
1690 	"l1d_vmentry_no",
1691 	"fsgsbase",
1692 	"clflushopt",
1693 	"clwb",
1694 	"monitorx",
1695 	"clzero",
1696 	"xop",
1697 	"fma4",
1698 	"tbm",
1699 	"avx512_vnni",
1700 	"amd_pcec",
1701 	"md_clear",
1702 	"mds_no",
1703 	"core_thermal",
1704 	"pkg_thermal",
1705 	"tsx_ctrl",
1706 	"taa_no",
1707 	"ppin",
1708 	"vaes",
1709 	"vpclmulqdq",
1710 	"lfence_serializing",
1711 	"gfni",
1712 	"avx512_vp2intersect",
1713 	"avx512_bitalg",
1714 	"avx512_vbmi2",
1715 	"avx512_bf16",
1716 	"auto_ibrs",
1717 	"rfds_no",
1718 	"rfds_clear",
1719 	"pbrsb_no",
1720 	"bhi_no",
1721 	"bhi_clear"
1722 };
1723 
1724 boolean_t
is_x86_feature(void * featureset,uint_t feature)1725 is_x86_feature(void *featureset, uint_t feature)
1726 {
1727 	ASSERT(feature < NUM_X86_FEATURES);
1728 	return (BT_TEST((ulong_t *)featureset, feature));
1729 }
1730 
1731 void
add_x86_feature(void * featureset,uint_t feature)1732 add_x86_feature(void *featureset, uint_t feature)
1733 {
1734 	ASSERT(feature < NUM_X86_FEATURES);
1735 	BT_SET((ulong_t *)featureset, feature);
1736 }
1737 
1738 void
remove_x86_feature(void * featureset,uint_t feature)1739 remove_x86_feature(void *featureset, uint_t feature)
1740 {
1741 	ASSERT(feature < NUM_X86_FEATURES);
1742 	BT_CLEAR((ulong_t *)featureset, feature);
1743 }
1744 
1745 boolean_t
compare_x86_featureset(void * setA,void * setB)1746 compare_x86_featureset(void *setA, void *setB)
1747 {
1748 	/*
1749 	 * We assume that the unused bits of the bitmap are always zero.
1750 	 */
1751 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1752 		return (B_TRUE);
1753 	} else {
1754 		return (B_FALSE);
1755 	}
1756 }
1757 
1758 void
print_x86_featureset(void * featureset)1759 print_x86_featureset(void *featureset)
1760 {
1761 	uint_t i;
1762 
1763 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1764 		if (is_x86_feature(featureset, i)) {
1765 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1766 			    x86_feature_names[i]);
1767 		}
1768 	}
1769 }
1770 
1771 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1772 static size_t xsave_state_size = 0;
1773 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1774 boolean_t xsave_force_disable = B_FALSE;
1775 extern int disable_smap;
1776 
1777 /*
1778  * This is set to platform type we are running on.
1779  */
1780 static int platform_type = -1;
1781 
1782 #if !defined(__xpv)
1783 /*
1784  * Variable to patch if hypervisor platform detection needs to be
1785  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1786  */
1787 int enable_platform_detection = 1;
1788 #endif
1789 
1790 /*
1791  * monitor/mwait info.
1792  *
1793  * size_actual and buf_actual are the real address and size allocated to get
1794  * proper mwait_buf alignement.  buf_actual and size_actual should be passed
1795  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1796  * processor cache-line alignment, but this is not guarantied in the furture.
1797  */
1798 struct mwait_info {
1799 	size_t		mon_min;	/* min size to avoid missed wakeups */
1800 	size_t		mon_max;	/* size to avoid false wakeups */
1801 	size_t		size_actual;	/* size actually allocated */
1802 	void		*buf_actual;	/* memory actually allocated */
1803 	uint32_t	support;	/* processor support of monitor/mwait */
1804 };
1805 
1806 /*
1807  * xsave/xrestor info.
1808  *
1809  * This structure contains HW feature bits and the size of the xsave save area.
1810  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1811  * (xsave_state) to describe the xsave layout. However, at runtime the
1812  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1813  * xsave_state structure simply represents the legacy layout of the beginning
1814  * of the xsave area.
1815  */
1816 struct xsave_info {
1817 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1818 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1819 	size_t		xsav_max_size;  /* max size save area for HW features */
1820 	size_t		ymm_size;	/* AVX: size of ymm save area */
1821 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1822 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1823 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1824 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1825 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1826 	size_t		opmask_size;	/* AVX512: size of opmask save */
1827 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1828 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1829 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1830 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1831 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1832 };
1833 
1834 
1835 /*
1836  * These constants determine how many of the elements of the
1837  * cpuid we cache in the cpuid_info data structure; the
1838  * remaining elements are accessible via the cpuid instruction.
1839  */
1840 
1841 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1842 #define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
1843 #define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1844 
1845 /*
1846  * See the big theory statement for a more detailed explanation of what some of
1847  * these members mean.
1848  */
1849 struct cpuid_info {
1850 	uint_t cpi_pass;		/* last pass completed */
1851 	/*
1852 	 * standard function information
1853 	 */
1854 	uint_t cpi_maxeax;		/* fn 0: %eax */
1855 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1856 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1857 
1858 	uint_t cpi_family;		/* fn 1: extended family */
1859 	uint_t cpi_model;		/* fn 1: extended model */
1860 	uint_t cpi_step;		/* fn 1: stepping */
1861 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1862 					/*		AMD: package/socket # */
1863 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1864 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1865 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1866 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1867 	uint_t cpi_ncache;		/* fn 2: number of elements */
1868 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1869 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1870 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1871 					/* Intel fn: 4, AMD fn: 8000001d */
1872 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1873 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1874 	struct cpuid_regs cpi_sub7[2];	/* Leaf 7, sub-leaves 1-2 */
1875 	/*
1876 	 * extended function information
1877 	 */
1878 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1879 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1880 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1881 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1882 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1883 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1884 
1885 	id_t cpi_coreid;		/* same coreid => strands share core */
1886 	int cpi_pkgcoreid;		/* core number within single package */
1887 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1888 					/* Intel: fn 4: %eax[31-26] */
1889 
1890 	/*
1891 	 * These values represent the number of bits that are required to store
1892 	 * information about the number of cores and threads.
1893 	 */
1894 	uint_t cpi_ncore_bits;
1895 	uint_t cpi_nthread_bits;
1896 	/*
1897 	 * supported feature information
1898 	 */
1899 	uint32_t cpi_support[6];
1900 #define	STD_EDX_FEATURES	0
1901 #define	AMD_EDX_FEATURES	1
1902 #define	TM_EDX_FEATURES		2
1903 #define	STD_ECX_FEATURES	3
1904 #define	AMD_ECX_FEATURES	4
1905 #define	STD_EBX_FEATURES	5
1906 	/*
1907 	 * Synthesized information, where known.
1908 	 */
1909 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1910 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1911 	uint32_t cpi_socket;		/* Chip package/socket type */
1912 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1913 
1914 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1915 	uint32_t cpi_apicid;
1916 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1917 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1918 					/* Intel: 1 */
1919 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1920 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1921 
1922 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1923 
1924 	/*
1925 	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1926 	 * eventually leaf 0x1F (Intel).
1927 	 */
1928 	uint_t cpi_topo_nleaves;
1929 	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1930 };
1931 
1932 
1933 static struct cpuid_info cpuid_info0;
1934 
1935 /*
1936  * These bit fields are defined by the Intel Application Note AP-485
1937  * "Intel Processor Identification and the CPUID Instruction"
1938  */
1939 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1940 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1941 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1942 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1943 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1944 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1945 
1946 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1947 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1948 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1949 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1950 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1951 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1952 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1953 #define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1954 #define	CPI_FEATURES_7_2_EDX(cpi)	((cpi)->cpi_sub7[1].cp_edx)
1955 
1956 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1957 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1958 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1959 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1960 
1961 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1962 #define	CPI_XMAXEAX_MAX		0x80000100
1963 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1964 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1965 
1966 /*
1967  * Function 4 (Deterministic Cache Parameters) macros
1968  * Defined by Intel Application Note AP-485
1969  */
1970 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1971 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1972 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1973 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1974 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1975 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1976 #define	CPI_CACHE_TYPE_DONE	0
1977 #define	CPI_CACHE_TYPE_DATA	1
1978 #define	CPI_CACHE_TYPE_INSTR	2
1979 #define	CPI_CACHE_TYPE_UNIFIED	3
1980 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1981 
1982 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1983 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1984 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1985 
1986 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1987 
1988 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1989 
1990 
1991 /*
1992  * A couple of shorthand macros to identify "later" P6-family chips
1993  * like the Pentium M and Core.  First, the "older" P6-based stuff
1994  * (loosely defined as "pre-Pentium-4"):
1995  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1996  */
1997 #define	IS_LEGACY_P6(cpi) (			\
1998 	cpi->cpi_family == 6 &&			\
1999 		(cpi->cpi_model == 1 ||		\
2000 		cpi->cpi_model == 3 ||		\
2001 		cpi->cpi_model == 5 ||		\
2002 		cpi->cpi_model == 6 ||		\
2003 		cpi->cpi_model == 7 ||		\
2004 		cpi->cpi_model == 8 ||		\
2005 		cpi->cpi_model == 0xA ||	\
2006 		cpi->cpi_model == 0xB)		\
2007 )
2008 
2009 /* A "new F6" is everything with family 6 that's not the above */
2010 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
2011 
2012 /* Extended family/model support */
2013 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
2014 	cpi->cpi_family >= 0xf)
2015 
2016 /*
2017  * Info for monitor/mwait idle loop.
2018  *
2019  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
2020  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
2021  * 2006.
2022  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
2023  * Documentation Updates" #33633, Rev 2.05, December 2006.
2024  */
2025 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
2026 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extenstion supported */
2027 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
2028 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
2029 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
2030 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
2031 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
2032 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
2033 /*
2034  * Number of sub-cstates for a given c-state.
2035  */
2036 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
2037 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
2038 
2039 /*
2040  * XSAVE leaf 0xD enumeration
2041  */
2042 #define	CPUID_LEAFD_2_YMM_OFFSET	576
2043 #define	CPUID_LEAFD_2_YMM_SIZE		256
2044 
2045 /*
2046  * Common extended leaf names to cut down on typos.
2047  */
2048 #define	CPUID_LEAF_EXT_0		0x80000000
2049 #define	CPUID_LEAF_EXT_8		0x80000008
2050 #define	CPUID_LEAF_EXT_1d		0x8000001d
2051 #define	CPUID_LEAF_EXT_1e		0x8000001e
2052 #define	CPUID_LEAF_EXT_21		0x80000021
2053 #define	CPUID_LEAF_EXT_26		0x80000026
2054 
2055 /*
2056  * Functions we consume from cpuid_subr.c;  don't publish these in a header
2057  * file to try and keep people using the expected cpuid_* interfaces.
2058  */
2059 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2060 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2061 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2062 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2063 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2064 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2065 
2066 /*
2067  * Apply up various platform-dependent restrictions where the
2068  * underlying platform restrictions mean the CPU can be marked
2069  * as less capable than its cpuid instruction would imply.
2070  */
2071 #if defined(__xpv)
2072 static void
platform_cpuid_mangle(uint_t vendor,uint32_t eax,struct cpuid_regs * cp)2073 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2074 {
2075 	switch (eax) {
2076 	case 1: {
2077 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2078 		    0 : CPUID_INTC_EDX_MCA;
2079 		cp->cp_edx &=
2080 		    ~(mcamask |
2081 		    CPUID_INTC_EDX_PSE |
2082 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2083 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2084 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2085 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2086 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2087 		break;
2088 	}
2089 
2090 	case 0x80000001:
2091 		cp->cp_edx &=
2092 		    ~(CPUID_AMD_EDX_PSE |
2093 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2094 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2095 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2096 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2097 		    CPUID_AMD_EDX_TSCP);
2098 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2099 		break;
2100 	default:
2101 		break;
2102 	}
2103 
2104 	switch (vendor) {
2105 	case X86_VENDOR_Intel:
2106 		switch (eax) {
2107 		case 4:
2108 			/*
2109 			 * Zero out the (ncores-per-chip - 1) field
2110 			 */
2111 			cp->cp_eax &= 0x03fffffff;
2112 			break;
2113 		default:
2114 			break;
2115 		}
2116 		break;
2117 	case X86_VENDOR_AMD:
2118 	case X86_VENDOR_HYGON:
2119 		switch (eax) {
2120 
2121 		case 0x80000001:
2122 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2123 			break;
2124 
2125 		case CPUID_LEAF_EXT_8:
2126 			/*
2127 			 * Zero out the (ncores-per-chip - 1) field
2128 			 */
2129 			cp->cp_ecx &= 0xffffff00;
2130 			break;
2131 		default:
2132 			break;
2133 		}
2134 		break;
2135 	default:
2136 		break;
2137 	}
2138 }
2139 #else
2140 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2141 #endif
2142 
2143 /*
2144  *  Some undocumented ways of patching the results of the cpuid
2145  *  instruction to permit running Solaris 10 on future cpus that
2146  *  we don't currently support.  Could be set to non-zero values
2147  *  via settings in eeprom.
2148  */
2149 
2150 uint32_t cpuid_feature_ecx_include;
2151 uint32_t cpuid_feature_ecx_exclude;
2152 uint32_t cpuid_feature_edx_include;
2153 uint32_t cpuid_feature_edx_exclude;
2154 
2155 /*
2156  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2157  */
2158 void
cpuid_alloc_space(cpu_t * cpu)2159 cpuid_alloc_space(cpu_t *cpu)
2160 {
2161 	/*
2162 	 * By convention, cpu0 is the boot cpu, which is set up
2163 	 * before memory allocation is available.  All other cpus get
2164 	 * their cpuid_info struct allocated here.
2165 	 */
2166 	ASSERT(cpu->cpu_id != 0);
2167 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2168 	cpu->cpu_m.mcpu_cpi =
2169 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2170 }
2171 
2172 void
cpuid_free_space(cpu_t * cpu)2173 cpuid_free_space(cpu_t *cpu)
2174 {
2175 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2176 	int i;
2177 
2178 	ASSERT(cpi != NULL);
2179 	ASSERT(cpi != &cpuid_info0);
2180 
2181 	/*
2182 	 * Free up any cache leaf related dynamic storage. The first entry was
2183 	 * cached from the standard cpuid storage, so we should not free it.
2184 	 */
2185 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2186 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2187 	if (cpi->cpi_cache_leaf_size > 0)
2188 		kmem_free(cpi->cpi_cache_leaves,
2189 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2190 
2191 	kmem_free(cpi, sizeof (*cpi));
2192 	cpu->cpu_m.mcpu_cpi = NULL;
2193 }
2194 
2195 #if !defined(__xpv)
2196 /*
2197  * Determine the type of the underlying platform. This is used to customize
2198  * initialization of various subsystems (e.g. TSC). determine_platform() must
2199  * only ever be called once to prevent two processors from seeing different
2200  * values of platform_type. Must be called before cpuid_pass_ident(), the
2201  * earliest consumer to execute; the identification pass will call
2202  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2203  */
2204 void
determine_platform(void)2205 determine_platform(void)
2206 {
2207 	struct cpuid_regs cp;
2208 	uint32_t base;
2209 	uint32_t regs[4];
2210 	char *hvstr = (char *)regs;
2211 
2212 	ASSERT(platform_type == -1);
2213 
2214 	platform_type = HW_NATIVE;
2215 
2216 	if (!enable_platform_detection)
2217 		return;
2218 
2219 	/*
2220 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2221 	 * vendor signature, and set platform type accordingly.
2222 	 *
2223 	 * References:
2224 	 * http://lkml.org/lkml/2008/10/1/246
2225 	 * http://kb.vmware.com/kb/1009458
2226 	 */
2227 	cp.cp_eax = 0x1;
2228 	(void) __cpuid_insn(&cp);
2229 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2230 		cp.cp_eax = 0x40000000;
2231 		(void) __cpuid_insn(&cp);
2232 		regs[0] = cp.cp_ebx;
2233 		regs[1] = cp.cp_ecx;
2234 		regs[2] = cp.cp_edx;
2235 		regs[3] = 0;
2236 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2237 			platform_type = HW_XEN_HVM;
2238 			return;
2239 		}
2240 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2241 			platform_type = HW_VMWARE;
2242 			return;
2243 		}
2244 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2245 			platform_type = HW_KVM;
2246 			return;
2247 		}
2248 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2249 			platform_type = HW_BHYVE;
2250 			return;
2251 		}
2252 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2253 			platform_type = HW_MICROSOFT;
2254 			return;
2255 		}
2256 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2257 			platform_type = HW_QEMU_TCG;
2258 			return;
2259 		}
2260 		if (strcmp(hvstr, HVSIG_VIRTUALBOX) == 0) {
2261 			platform_type = HW_VIRTUALBOX;
2262 			return;
2263 		}
2264 		if (strcmp(hvstr, HVSIG_ACRN) == 0) {
2265 			platform_type = HW_ACRN;
2266 			return;
2267 		}
2268 	} else {
2269 		/*
2270 		 * Check older VMware hardware versions. VMware hypervisor is
2271 		 * detected by performing an IN operation to VMware hypervisor
2272 		 * port and checking that value returned in %ebx is VMware
2273 		 * hypervisor magic value.
2274 		 *
2275 		 * References: http://kb.vmware.com/kb/1009458
2276 		 */
2277 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2278 		if (regs[1] == VMWARE_HVMAGIC) {
2279 			platform_type = HW_VMWARE;
2280 			return;
2281 		}
2282 	}
2283 
2284 	/*
2285 	 * Check Xen hypervisor. In a fully virtualized domain,
2286 	 * Xen's pseudo-cpuid function returns a string representing the
2287 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2288 	 * supported cpuid function. We need at least a (base + 2) leaf value
2289 	 * to do what we want to do. Try different base values, since the
2290 	 * hypervisor might use a different one depending on whether Hyper-V
2291 	 * emulation is switched on by default or not.
2292 	 */
2293 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2294 		cp.cp_eax = base;
2295 		(void) __cpuid_insn(&cp);
2296 		regs[0] = cp.cp_ebx;
2297 		regs[1] = cp.cp_ecx;
2298 		regs[2] = cp.cp_edx;
2299 		regs[3] = 0;
2300 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2301 		    cp.cp_eax >= (base + 2)) {
2302 			platform_type &= ~HW_NATIVE;
2303 			platform_type |= HW_XEN_HVM;
2304 			return;
2305 		}
2306 	}
2307 }
2308 
2309 int
get_hwenv(void)2310 get_hwenv(void)
2311 {
2312 	ASSERT(platform_type != -1);
2313 	return (platform_type);
2314 }
2315 
2316 int
is_controldom(void)2317 is_controldom(void)
2318 {
2319 	return (0);
2320 }
2321 
2322 #else
2323 
2324 int
get_hwenv(void)2325 get_hwenv(void)
2326 {
2327 	return (HW_XEN_PV);
2328 }
2329 
2330 int
is_controldom(void)2331 is_controldom(void)
2332 {
2333 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2334 }
2335 
2336 #endif	/* __xpv */
2337 
2338 /*
2339  * Gather the extended topology information. This should be the same for both
2340  * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2341  */
2342 static void
cpuid_gather_ext_topo_leaf(struct cpuid_info * cpi,uint32_t leaf)2343 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2344 {
2345 	uint_t i;
2346 
2347 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2348 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2349 
2350 		bzero(regs, sizeof (struct cpuid_regs));
2351 		regs->cp_eax = leaf;
2352 		regs->cp_ecx = i;
2353 
2354 		(void) __cpuid_insn(regs);
2355 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2356 		    CPUID_AMD_8X26_TYPE_DONE) {
2357 			break;
2358 		}
2359 	}
2360 
2361 	cpi->cpi_topo_nleaves = i;
2362 }
2363 
2364 /*
2365  * Make sure that we have gathered all of the CPUID leaves that we might need to
2366  * determine topology. We assume that the standard leaf 1 has already been done
2367  * and that xmaxeax has already been calculated.
2368  */
2369 static void
cpuid_gather_amd_topology_leaves(cpu_t * cpu)2370 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2371 {
2372 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2373 
2374 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2375 		struct cpuid_regs *cp;
2376 
2377 		cp = &cpi->cpi_extd[8];
2378 		cp->cp_eax = CPUID_LEAF_EXT_8;
2379 		(void) __cpuid_insn(cp);
2380 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2381 	}
2382 
2383 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2384 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2385 		struct cpuid_regs *cp;
2386 
2387 		cp = &cpi->cpi_extd[0x1e];
2388 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2389 		(void) __cpuid_insn(cp);
2390 	}
2391 
2392 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2393 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2394 	}
2395 }
2396 
2397 /*
2398  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2399  * it to everything else. If not, and we're on an AMD system where 8000001e is
2400  * valid, then we use that. Othewrise, we fall back to the default value for the
2401  * APIC ID in leaf 1.
2402  */
2403 static uint32_t
cpuid_gather_apicid(struct cpuid_info * cpi)2404 cpuid_gather_apicid(struct cpuid_info *cpi)
2405 {
2406 	/*
2407 	 * Leaf B changes based on the arguments to it. Because we don't cache
2408 	 * it, we need to gather it again.
2409 	 */
2410 	if (cpi->cpi_maxeax >= 0xB) {
2411 		struct cpuid_regs regs;
2412 		struct cpuid_regs *cp;
2413 
2414 		cp = &regs;
2415 		cp->cp_eax = 0xB;
2416 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2417 		(void) __cpuid_insn(cp);
2418 
2419 		if (cp->cp_ebx != 0) {
2420 			return (cp->cp_edx);
2421 		}
2422 	}
2423 
2424 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2425 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2426 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2427 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2428 		return (cpi->cpi_extd[0x1e].cp_eax);
2429 	}
2430 
2431 	return (CPI_APIC_ID(cpi));
2432 }
2433 
2434 /*
2435  * For AMD processors, attempt to calculate the number of chips and cores that
2436  * exist. The way that we do this varies based on the generation, because the
2437  * generations themselves have changed dramatically.
2438  *
2439  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2440  * However, with the advent of family 17h (Zen) it actually tells us the number
2441  * of threads, so we need to look at leaf 0x8000001e if available to determine
2442  * its value. Otherwise, for all prior families, the number of enabled cores is
2443  * the same as threads.
2444  *
2445  * If we do not have leaf 0x80000008, then we assume that this processor does
2446  * not have anything. AMD's older CPUID specification says there's no reason to
2447  * fall back to leaf 1.
2448  *
2449  * In some virtualization cases we will not have leaf 8000001e or it will be
2450  * zero. When that happens we assume the number of threads is one.
2451  */
2452 static void
cpuid_amd_ncores(struct cpuid_info * cpi,uint_t * ncpus,uint_t * ncores)2453 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2454 {
2455 	uint_t nthreads, nthread_per_core;
2456 
2457 	nthreads = nthread_per_core = 1;
2458 
2459 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2460 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2461 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2462 		nthreads = CPI_CPU_COUNT(cpi);
2463 	}
2464 
2465 	/*
2466 	 * For us to have threads, and know about it, we have to be at least at
2467 	 * family 17h and have the cpuid bit that says we have extended
2468 	 * topology.
2469 	 */
2470 	if (cpi->cpi_family >= 0x17 &&
2471 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2472 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2473 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2474 	}
2475 
2476 	*ncpus = nthreads;
2477 	*ncores = nthreads / nthread_per_core;
2478 }
2479 
2480 /*
2481  * Seed the initial values for the cores and threads for an Intel based
2482  * processor. These values will be overwritten if we detect that the processor
2483  * supports CPUID leaf 0xb.
2484  */
2485 static void
cpuid_intel_ncores(struct cpuid_info * cpi,uint_t * ncpus,uint_t * ncores)2486 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2487 {
2488 	/*
2489 	 * Only seed the number of physical cores from the first level leaf 4
2490 	 * information. The number of threads there indicate how many share the
2491 	 * L1 cache, which may or may not have anything to do with the number of
2492 	 * logical CPUs per core.
2493 	 */
2494 	if (cpi->cpi_maxeax >= 4) {
2495 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2496 	} else {
2497 		*ncores = 1;
2498 	}
2499 
2500 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2501 		*ncpus = CPI_CPU_COUNT(cpi);
2502 	} else {
2503 		*ncpus = *ncores;
2504 	}
2505 }
2506 
2507 static boolean_t
cpuid_leafB_getids(cpu_t * cpu)2508 cpuid_leafB_getids(cpu_t *cpu)
2509 {
2510 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2511 	struct cpuid_regs regs;
2512 	struct cpuid_regs *cp;
2513 
2514 	if (cpi->cpi_maxeax < 0xB)
2515 		return (B_FALSE);
2516 
2517 	cp = &regs;
2518 	cp->cp_eax = 0xB;
2519 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2520 
2521 	(void) __cpuid_insn(cp);
2522 
2523 	/*
2524 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2525 	 * indicates that the extended topology enumeration leaf is
2526 	 * available.
2527 	 */
2528 	if (cp->cp_ebx != 0) {
2529 		uint32_t x2apic_id = 0;
2530 		uint_t coreid_shift = 0;
2531 		uint_t ncpu_per_core = 1;
2532 		uint_t chipid_shift = 0;
2533 		uint_t ncpu_per_chip = 1;
2534 		uint_t i;
2535 		uint_t level;
2536 
2537 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2538 			cp->cp_eax = 0xB;
2539 			cp->cp_ecx = i;
2540 
2541 			(void) __cpuid_insn(cp);
2542 			level = CPI_CPU_LEVEL_TYPE(cp);
2543 
2544 			if (level == 1) {
2545 				x2apic_id = cp->cp_edx;
2546 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2547 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2548 			} else if (level == 2) {
2549 				x2apic_id = cp->cp_edx;
2550 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2551 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2552 			}
2553 		}
2554 
2555 		/*
2556 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2557 		 */
2558 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2559 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2560 		    ncpu_per_core;
2561 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2562 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2563 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2564 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2565 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2566 		cpi->cpi_compunitid = cpi->cpi_coreid;
2567 
2568 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2569 			cpi->cpi_nthread_bits = coreid_shift;
2570 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2571 		}
2572 
2573 		return (B_TRUE);
2574 	} else {
2575 		return (B_FALSE);
2576 	}
2577 }
2578 
2579 static void
cpuid_intel_getids(cpu_t * cpu,void * feature)2580 cpuid_intel_getids(cpu_t *cpu, void *feature)
2581 {
2582 	uint_t i;
2583 	uint_t chipid_shift = 0;
2584 	uint_t coreid_shift = 0;
2585 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2586 
2587 	/*
2588 	 * There are no compute units or processor nodes currently on Intel.
2589 	 * Always set these to one.
2590 	 */
2591 	cpi->cpi_procnodes_per_pkg = 1;
2592 	cpi->cpi_cores_per_compunit = 1;
2593 
2594 	/*
2595 	 * If cpuid Leaf B is present, use that to try and get this information.
2596 	 * It will be the most accurate for Intel CPUs.
2597 	 */
2598 	if (cpuid_leafB_getids(cpu))
2599 		return;
2600 
2601 	/*
2602 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2603 	 * and ncore_per_chip. These represent the largest power of two values
2604 	 * that we need to cover all of the IDs in the system. Therefore, we use
2605 	 * those values to seed the number of bits needed to cover information
2606 	 * in the case when leaf B is not available. These values will probably
2607 	 * be larger than required, but that's OK.
2608 	 */
2609 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2610 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2611 
2612 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2613 		chipid_shift++;
2614 
2615 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2616 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2617 
2618 	if (is_x86_feature(feature, X86FSET_CMP)) {
2619 		/*
2620 		 * Multi-core (and possibly multi-threaded)
2621 		 * processors.
2622 		 */
2623 		uint_t ncpu_per_core = 0;
2624 
2625 		if (cpi->cpi_ncore_per_chip == 1)
2626 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2627 		else if (cpi->cpi_ncore_per_chip > 1)
2628 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2629 			    cpi->cpi_ncore_per_chip;
2630 		/*
2631 		 * 8bit APIC IDs on dual core Pentiums
2632 		 * look like this:
2633 		 *
2634 		 * +-----------------------+------+------+
2635 		 * | Physical Package ID   |  MC  |  HT  |
2636 		 * +-----------------------+------+------+
2637 		 * <------- chipid -------->
2638 		 * <------- coreid --------------->
2639 		 *			   <--- clogid -->
2640 		 *			   <------>
2641 		 *			   pkgcoreid
2642 		 *
2643 		 * Where the number of bits necessary to
2644 		 * represent MC and HT fields together equals
2645 		 * to the minimum number of bits necessary to
2646 		 * store the value of cpi->cpi_ncpu_per_chip.
2647 		 * Of those bits, the MC part uses the number
2648 		 * of bits necessary to store the value of
2649 		 * cpi->cpi_ncore_per_chip.
2650 		 */
2651 		for (i = 1; i < ncpu_per_core; i <<= 1)
2652 			coreid_shift++;
2653 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2654 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2655 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2656 		/*
2657 		 * Single-core multi-threaded processors.
2658 		 */
2659 		cpi->cpi_coreid = cpi->cpi_chipid;
2660 		cpi->cpi_pkgcoreid = 0;
2661 	} else {
2662 		/*
2663 		 * Single-core single-thread processors.
2664 		 */
2665 		cpi->cpi_coreid = cpu->cpu_id;
2666 		cpi->cpi_pkgcoreid = 0;
2667 	}
2668 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2669 	cpi->cpi_compunitid = cpi->cpi_coreid;
2670 }
2671 
2672 /*
2673  * Historically, AMD has had CMP chips with only a single thread per core.
2674  * However, starting in family 17h (Zen), this has changed and they now have
2675  * multiple threads. Our internal core id needs to be a unique value.
2676  *
2677  * To determine the core id of an AMD system, if we're from a family before 17h,
2678  * then we just use the cpu id, as that gives us a good value that will be
2679  * unique for each core. If instead, we're on family 17h or later, then we need
2680  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2681  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2682  * We can't use the normal core id in that leaf as it's only unique within the
2683  * socket, which is perfect for cpi_pkgcoreid, but not us.
2684  */
2685 static id_t
cpuid_amd_get_coreid(cpu_t * cpu)2686 cpuid_amd_get_coreid(cpu_t *cpu)
2687 {
2688 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2689 
2690 	if (cpi->cpi_family >= 0x17 &&
2691 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2692 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2693 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2694 		if (nthreads > 1) {
2695 			VERIFY3U(nthreads, ==, 2);
2696 			return (cpi->cpi_apicid >> 1);
2697 		}
2698 	}
2699 
2700 	return (cpu->cpu_id);
2701 }
2702 
2703 /*
2704  * IDs on AMD is a more challenging task. This is notable because of the
2705  * following two facts:
2706  *
2707  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2708  *     also no way to get an actual unique core id from the system. As such, we
2709  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2710  *     however, guarantee that sibling cores of a chip will have sequential
2711  *     coreids starting at a multiple of the number of cores per chip - that is
2712  *     usually the case, but if the APIC IDs have been set up in a different
2713  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2714  *
2715  *  2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups
2716  *     called compute units. These compute units share the L1I cache, L2 cache,
2717  *     and the FPU. To deal with this, a new topology leaf was added in
2718  *     0x8000001e. However, parts of this leaf have different meanings
2719  *     once we get to family 0x17.
2720  */
2721 
2722 static void
cpuid_amd_getids(cpu_t * cpu,uchar_t * features)2723 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2724 {
2725 	int i, first_half, coreidsz;
2726 	uint32_t nb_caps_reg;
2727 	uint_t node2_1;
2728 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2729 	struct cpuid_regs *cp;
2730 
2731 	/*
2732 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2733 	 * hasn't been stripped by virtualization). We always set the compute
2734 	 * unit id to the same value. Also, initialize the default number of
2735 	 * cores per compute unit and nodes per package. This will be
2736 	 * overwritten when we know information about a particular family.
2737 	 */
2738 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2739 	cpi->cpi_compunitid = cpi->cpi_coreid;
2740 	cpi->cpi_cores_per_compunit = 1;
2741 	cpi->cpi_procnodes_per_pkg = 1;
2742 
2743 	/*
2744 	 * To construct the logical ID, we need to determine how many APIC IDs
2745 	 * are dedicated to the cores and threads. This is provided for us in
2746 	 * 0x80000008. However, if it's not present (say due to virtualization),
2747 	 * then we assume it's one. This should be present on all 64-bit AMD
2748 	 * processors.  It was added in family 0xf (Hammer).
2749 	 */
2750 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2751 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2752 
2753 		/*
2754 		 * In AMD parlance chip is really a node while illumos
2755 		 * uses chip as equivalent to socket/package.
2756 		 */
2757 		if (coreidsz == 0) {
2758 			/* Use legacy method */
2759 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2760 				coreidsz++;
2761 			if (coreidsz == 0)
2762 				coreidsz = 1;
2763 		}
2764 	} else {
2765 		/* Assume single-core part */
2766 		coreidsz = 1;
2767 	}
2768 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2769 
2770 	/*
2771 	 * The package core ID varies depending on the family. While it may be
2772 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2773 	 * this value is the core id in the given node. For non-virtualized
2774 	 * family 17h, we need to take the logical core id and shift off the
2775 	 * threads like we do when getting the core id.  Otherwise, we can use
2776 	 * the clogid as is. When family 17h is virtualized, the clogid should
2777 	 * be sufficient as if we don't have valid data in the leaf, then we
2778 	 * won't think we have SMT, in which case the cpi_clogid should be
2779 	 * sufficient.
2780 	 */
2781 	if (cpi->cpi_family >= 0x17 &&
2782 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2783 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2784 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2785 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2786 		if (nthreads > 1) {
2787 			VERIFY3U(nthreads, ==, 2);
2788 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2789 		} else {
2790 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2791 		}
2792 	} else {
2793 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2794 	}
2795 
2796 	/*
2797 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2798 	 * (bulldozer) or newer, then we can derive all of this from leaf
2799 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2800 	 */
2801 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2802 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2803 		cp = &cpi->cpi_extd[0x1e];
2804 
2805 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2806 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2807 
2808 		/*
2809 		 * For Bulldozer-era CPUs, recalculate the compute unit
2810 		 * information.
2811 		 */
2812 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2813 			cpi->cpi_cores_per_compunit =
2814 			    BITX(cp->cp_ebx, 15, 8) + 1;
2815 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2816 			    (cpi->cpi_ncore_per_chip /
2817 			    cpi->cpi_cores_per_compunit) *
2818 			    (cpi->cpi_procnodeid /
2819 			    cpi->cpi_procnodes_per_pkg);
2820 		}
2821 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2822 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2823 	} else if (cpi->cpi_family == 0x10) {
2824 		/*
2825 		 * See if we are a multi-node processor.
2826 		 * All processors in the system have the same number of nodes
2827 		 */
2828 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2829 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2830 			/* Single-node */
2831 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2832 			    coreidsz);
2833 		} else {
2834 
2835 			/*
2836 			 * Multi-node revision D (2 nodes per package
2837 			 * are supported)
2838 			 */
2839 			cpi->cpi_procnodes_per_pkg = 2;
2840 
2841 			first_half = (cpi->cpi_pkgcoreid <=
2842 			    (cpi->cpi_ncore_per_chip/2 - 1));
2843 
2844 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2845 				/* We are BSP */
2846 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2847 			} else {
2848 
2849 				/* We are AP */
2850 				/* NodeId[2:1] bits to use for reading F3xe8 */
2851 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2852 
2853 				nb_caps_reg =
2854 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2855 
2856 				/*
2857 				 * Check IntNodeNum bit (31:30, but bit 31 is
2858 				 * always 0 on dual-node processors)
2859 				 */
2860 				if (BITX(nb_caps_reg, 30, 30) == 0)
2861 					cpi->cpi_procnodeid = node2_1 +
2862 					    !first_half;
2863 				else
2864 					cpi->cpi_procnodeid = node2_1 +
2865 					    first_half;
2866 			}
2867 		}
2868 	} else {
2869 		cpi->cpi_procnodeid = 0;
2870 	}
2871 
2872 	cpi->cpi_chipid =
2873 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2874 
2875 	cpi->cpi_ncore_bits = coreidsz;
2876 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2877 	    cpi->cpi_ncore_per_chip);
2878 }
2879 
2880 static void
spec_uarch_flush_noop(void)2881 spec_uarch_flush_noop(void)
2882 {
2883 }
2884 
2885 /*
2886  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2887  * MDS-related micro-architectural state that would normally happen by calling
2888  * x86_md_clear().
2889  */
2890 static void
spec_uarch_flush_msr(void)2891 spec_uarch_flush_msr(void)
2892 {
2893 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2894 }
2895 
2896 /*
2897  * This function points to a function that will flush certain
2898  * micro-architectural state on the processor. This flush is used to mitigate
2899  * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2900  * This function can point to one of three functions:
2901  *
2902  * - A noop which is done because we either are vulnerable, but do not have
2903  *   microcode available to help deal with a fix, or because we aren't
2904  *   vulnerable.
2905  *
2906  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2907  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2908  *   however, it only flushes the MDS related micro-architectural state on the
2909  *   current hyperthread, it does not do anything for the twin.
2910  *
2911  * - x86_md_clear which will flush the MDS related state. This is done when we
2912  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2913  *   (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2914  *   can clear it (RFDS_CLEAR is set).
2915  */
2916 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2917 
2918 static void
cpuid_update_md_clear(cpu_t * cpu,uchar_t * featureset)2919 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2920 {
2921 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2922 
2923 	/* Non-Intel doesn't concern us here. */
2924 	if (cpi->cpi_vendor != X86_VENDOR_Intel)
2925 		return;
2926 
2927 	/*
2928 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2929 	 * has been fixed in hardware, it doesn't cover everything related to
2930 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2931 	 * need to mitigate this.
2932 	 *
2933 	 * We must ALSO check the case of RFDS_NO and if RFDS_CLEAR is set,
2934 	 * because of the small cases of RFDS.
2935 	 */
2936 
2937 	if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2938 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2939 	    (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2940 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2941 		const uint8_t nop = NOP_INSTR;
2942 		uint8_t *md = (uint8_t *)x86_md_clear;
2943 
2944 		*md = nop;
2945 	}
2946 
2947 	membar_producer();
2948 }
2949 
2950 static void
cpuid_update_l1d_flush(cpu_t * cpu,uchar_t * featureset)2951 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2952 {
2953 	boolean_t need_l1d, need_mds, need_rfds;
2954 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2955 
2956 	/*
2957 	 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2958 	 * in hardware, then there's nothing left for us to do for enabling
2959 	 * the flush. We can also go ahead and say that SMT exclusion is
2960 	 * unnecessary.
2961 	 */
2962 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2963 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2964 	    is_x86_feature(featureset, X86FSET_MDS_NO) &&
2965 	    is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2966 		extern int smt_exclusion;
2967 		smt_exclusion = 0;
2968 		spec_uarch_flush = spec_uarch_flush_noop;
2969 		membar_producer();
2970 		return;
2971 	}
2972 
2973 	/*
2974 	 * The locations where we need to perform an L1D flush are required both
2975 	 * for mitigating L1TF and MDS. When verw support is present in
2976 	 * microcode, then the L1D flush will take care of doing that as well.
2977 	 * However, if we have a system where RDCL_NO is present, but we don't
2978 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2979 	 * L1D flush.
2980 	 */
2981 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2982 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2983 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2984 		need_l1d = B_TRUE;
2985 	} else {
2986 		need_l1d = B_FALSE;
2987 	}
2988 
2989 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2990 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2991 		need_mds = B_TRUE;
2992 	} else {
2993 		need_mds = B_FALSE;
2994 	}
2995 
2996 	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2997 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2998 		need_rfds = B_TRUE;
2999 	} else {
3000 		need_rfds = B_FALSE;
3001 	}
3002 
3003 	if (need_l1d) {
3004 		/*
3005 		 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
3006 		 * together. If the following VERIFY trips, we need to add
3007 		 * further fixes here.
3008 		 */
3009 		VERIFY(!need_rfds);
3010 		spec_uarch_flush = spec_uarch_flush_msr;
3011 	} else if (need_mds || need_rfds) {
3012 		spec_uarch_flush = x86_md_clear;
3013 	} else {
3014 		/*
3015 		 * We have no hardware mitigations available to us.
3016 		 */
3017 		spec_uarch_flush = spec_uarch_flush_noop;
3018 	}
3019 	membar_producer();
3020 }
3021 
3022 /*
3023  * Branch History Injection (BHI) mitigations.
3024  *
3025  * Intel has provided a software sequence that will scrub the BHB. Like RSB
3026  * (below) we can scribble a return at the beginning to avoid if if the CPU
3027  * is modern enough. We can also scribble a return if the CPU is old enough
3028  * to not have an RSB (pre-eIBRS).
3029  */
3030 typedef enum {
3031 	X86_BHI_TOO_OLD_OR_DISABLED,	/* Pre-eIBRS or disabled */
3032 	X86_BHI_NEW_ENOUGH,		/* AMD, or Intel with BHI_NO set */
3033 	X86_BHI_DIS_S,			/* BHI_NO == 0, but BHI_DIS_S avail. */
3034 	/* NOTE: BHI_DIS_S above will still need the software sequence. */
3035 	X86_BHI_SOFTWARE_SEQUENCE,	/* Use software sequence */
3036 } x86_native_bhi_mitigation_t;
3037 
3038 x86_native_bhi_mitigation_t x86_bhi_mitigation = X86_BHI_SOFTWARE_SEQUENCE;
3039 
3040 static void
cpuid_enable_bhi_dis_s(void)3041 cpuid_enable_bhi_dis_s(void)
3042 {
3043 	uint64_t val;
3044 
3045 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3046 	val |= IA32_SPEC_CTRL_BHI_DIS_S;
3047 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3048 }
3049 
3050 /*
3051  * This function scribbles RET into the first instruction of x86_bhb_clear()
3052  * if SPECTREV2 mitigations are disabled, the CPU is too old, the CPU is new
3053  * enough to fix (which includes non-Intel CPUs), or the CPU has an explicit
3054  * disable-Branch-History control.
3055  */
3056 static x86_native_bhi_mitigation_t
cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit,cpu_t * cpu,uchar_t * featureset)3057 cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit, cpu_t *cpu,
3058     uchar_t *featureset)
3059 {
3060 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3061 	const uint8_t ret = RET_INSTR;
3062 	uint8_t *bhb_clear = (uint8_t *)x86_bhb_clear;
3063 
3064 	ASSERT0(cpu->cpu_id);
3065 
3066 	/* First check for explicitly disabled... */
3067 	if (v2mit == X86_SPECTREV2_DISABLED) {
3068 		*bhb_clear = ret;
3069 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3070 	}
3071 
3072 	/*
3073 	 * Then check for BHI_NO, which means the CPU doesn't have this bug,
3074 	 * or if it's non-Intel, in which case this mitigation mechanism
3075 	 * doesn't apply.
3076 	 */
3077 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
3078 	    is_x86_feature(featureset, X86FSET_BHI_NO)) {
3079 		*bhb_clear = ret;
3080 		return (X86_BHI_NEW_ENOUGH);
3081 	}
3082 
3083 	/*
3084 	 * Now check for the BHI_CTRL MSR, and then set it if available.
3085 	 * We will still need to use the software sequence, however.
3086 	 */
3087 	if (is_x86_feature(featureset, X86FSET_BHI_CTRL)) {
3088 		cpuid_enable_bhi_dis_s();
3089 		return (X86_BHI_DIS_S);
3090 	}
3091 
3092 	/*
3093 	 * Finally, check if we are too old to bother with RSB:
3094 	 */
3095 	if (v2mit == X86_SPECTREV2_RETPOLINE) {
3096 		*bhb_clear = ret;
3097 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3098 	}
3099 
3100 	ASSERT(*bhb_clear != ret);
3101 	return (X86_BHI_SOFTWARE_SEQUENCE);
3102 }
3103 
3104 /*
3105  * We default to enabling Return Stack Buffer (RSB) mitigations.
3106  *
3107  * We used to skip RSB mitigations with Intel eIBRS, but developments around
3108  * post-barrier RSB (PBRSB) guessing suggests we should enable Intel RSB
3109  * mitigations always unless explicitly bypassed, or unless hardware indicates
3110  * the bug has been fixed.
3111  *
3112  * The current decisions for using, or ignoring, a RSB software stuffing
3113  * sequence are expressed by the following table:
3114  *
3115  * +-------+------------+-----------------+--------+
3116  * | eIBRS |  PBRSB_NO  |  context switch | vmexit |
3117  * +-------+------------+-----------------+--------+
3118  * |   Yes |     No     |  stuff          | stuff  |
3119  * |   Yes |     Yes    |  ignore         | ignore |
3120  * |   No  |     No     |  stuff          | ignore |
3121  * +-------+------------+-----------------+--------+
3122  *
3123  * Note that if an Intel CPU has no eIBRS, it will never enumerate PBRSB_NO,
3124  * because machines with no eIBRS do not have a problem with PBRSB overflow.
3125  * See the Intel document cited below for details.
3126  *
3127  * Also note that AMD AUTO_IBRS has no PBRSB problem, so it is not included in
3128  * the table above, and that there is no situation where vmexit stuffing is
3129  * needed, but context-switch stuffing isn't.
3130  */
3131 
3132 /* BEGIN CSTYLED */
3133 /*
3134  * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/post-barrier-return-stack-buffer-predictions.html
3135  */
3136 /* END CSTYLED */
3137 
3138 /*
3139  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
3140  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
3141  * also states that as long as SMEP and we maintain at least one page between
3142  * the kernel and user space (we have much more of a red zone), then we do not
3143  * need to clear the RSB. We constrain this to only when Automatic IRBS is
3144  * present.
3145  */
3146 static void
cpuid_patch_rsb(x86_spectrev2_mitigation_t mit,bool intel_pbrsb_no)3147 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
3148 {
3149 	const uint8_t ret = RET_INSTR;
3150 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3151 	uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
3152 
3153 	switch (mit) {
3154 	case X86_SPECTREV2_AUTO_IBRS:
3155 	case X86_SPECTREV2_DISABLED:
3156 		/* Don't bother with any RSB stuffing! */
3157 		*stuff = ret;
3158 		*vmx_stuff = ret;
3159 		break;
3160 	case X86_SPECTREV2_RETPOLINE:
3161 		/*
3162 		 * The Intel document on Post-Barrier RSB says that processors
3163 		 * without eIBRS do not have PBRSB problems upon VMEXIT.
3164 		 */
3165 		VERIFY(!intel_pbrsb_no);
3166 		VERIFY3U(*stuff, !=, ret);
3167 		*vmx_stuff = ret;
3168 		break;
3169 	default:
3170 		/*
3171 		 * eIBRS is all that's left.  If CPU claims PBRSB is fixed,
3172 		 * don't use the RSB mitigation in either case.  Otherwise
3173 		 * both vmexit and context-switching require the software
3174 		 * mitigation.
3175 		 */
3176 		if (intel_pbrsb_no) {
3177 			/* CPU claims PBRSB problems are fixed. */
3178 			*stuff = ret;
3179 			*vmx_stuff = ret;
3180 		}
3181 		VERIFY3U(*stuff, ==, *vmx_stuff);
3182 		break;
3183 	}
3184 }
3185 
3186 static void
cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)3187 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3188 {
3189 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3190 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3191 	    "_r14", "_r15" };
3192 	const uint_t nthunks = ARRAY_SIZE(thunks);
3193 	const char *type;
3194 	uint_t i;
3195 
3196 	if (mit == x86_spectrev2_mitigation)
3197 		return;
3198 
3199 	switch (mit) {
3200 	case X86_SPECTREV2_RETPOLINE:
3201 		type = "gen";
3202 		break;
3203 	case X86_SPECTREV2_AUTO_IBRS:
3204 	case X86_SPECTREV2_ENHANCED_IBRS:
3205 	case X86_SPECTREV2_DISABLED:
3206 		type = "jmp";
3207 		break;
3208 	default:
3209 		panic("asked to update retpoline state with unknown state!");
3210 	}
3211 
3212 	for (i = 0; i < nthunks; i++) {
3213 		uintptr_t source, dest;
3214 		int ssize, dsize;
3215 		char sourcebuf[64], destbuf[64];
3216 
3217 		(void) snprintf(destbuf, sizeof (destbuf),
3218 		    "__x86_indirect_thunk%s", thunks[i]);
3219 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
3220 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
3221 
3222 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3223 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3224 		VERIFY3U(source, !=, 0);
3225 		VERIFY3U(dest, !=, 0);
3226 		VERIFY3S(dsize, >=, ssize);
3227 		bcopy((void *)source, (void *)dest, ssize);
3228 	}
3229 }
3230 
3231 static void
cpuid_enable_enhanced_ibrs(void)3232 cpuid_enable_enhanced_ibrs(void)
3233 {
3234 	uint64_t val;
3235 
3236 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3237 	val |= IA32_SPEC_CTRL_IBRS;
3238 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3239 }
3240 
3241 static void
cpuid_enable_auto_ibrs(void)3242 cpuid_enable_auto_ibrs(void)
3243 {
3244 	uint64_t val;
3245 
3246 	val = rdmsr(MSR_AMD_EFER);
3247 	val |= AMD_EFER_AIBRSE;
3248 	wrmsr(MSR_AMD_EFER, val);
3249 }
3250 
3251 /*
3252  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3253  * we can disable TSX, we do so.
3254  *
3255  * This determination is done only on the boot CPU, potentially after loading
3256  * updated microcode.
3257  */
3258 static void
cpuid_update_tsx(cpu_t * cpu,uchar_t * featureset)3259 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3260 {
3261 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3262 
3263 	VERIFY(cpu->cpu_id == 0);
3264 
3265 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3266 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3267 		return;
3268 	}
3269 
3270 	if (x86_disable_taa) {
3271 		x86_taa_mitigation = X86_TAA_DISABLED;
3272 		return;
3273 	}
3274 
3275 	/*
3276 	 * If we do not have the ability to disable TSX, then our only
3277 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3278 	 * MDS mitigation as described above.  The latter relies upon us having
3279 	 * configured MDS mitigations correctly! This includes disabling SMT if
3280 	 * we want to cross-CPU-thread protection.
3281 	 */
3282 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3283 		/*
3284 		 * It's not clear whether any parts will enumerate TAA_NO
3285 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3286 		 */
3287 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3288 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3289 			return;
3290 		}
3291 
3292 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3293 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3294 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3295 		} else {
3296 			x86_taa_mitigation = X86_TAA_NOTHING;
3297 		}
3298 		return;
3299 	}
3300 
3301 	/*
3302 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3303 	 * enough in boot.
3304 	 *
3305 	 * Otherwise, we'll fall back to causing transactions to abort as our
3306 	 * mitigation. TSX-using code will always take the fallback path.
3307 	 */
3308 	if (cpi->cpi_pass < 4) {
3309 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3310 	} else {
3311 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3312 	}
3313 }
3314 
3315 /*
3316  * As mentioned, we should only touch the MSR when we've got a suitable
3317  * microcode loaded on this CPU.
3318  */
3319 static void
cpuid_apply_tsx(x86_taa_mitigation_t taa,uchar_t * featureset)3320 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3321 {
3322 	uint64_t val;
3323 
3324 	switch (taa) {
3325 	case X86_TAA_TSX_DISABLE:
3326 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3327 			return;
3328 		val = rdmsr(MSR_IA32_TSX_CTRL);
3329 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3330 		wrmsr(MSR_IA32_TSX_CTRL, val);
3331 		break;
3332 	case X86_TAA_TSX_FORCE_ABORT:
3333 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3334 			return;
3335 		val = rdmsr(MSR_IA32_TSX_CTRL);
3336 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3337 		wrmsr(MSR_IA32_TSX_CTRL, val);
3338 		break;
3339 	case X86_TAA_HW_MITIGATED:
3340 	case X86_TAA_MD_CLEAR:
3341 	case X86_TAA_DISABLED:
3342 	case X86_TAA_NOTHING:
3343 		break;
3344 	}
3345 }
3346 
3347 static void
cpuid_scan_security(cpu_t * cpu,uchar_t * featureset)3348 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3349 {
3350 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3351 	x86_spectrev2_mitigation_t v2mit;
3352 
3353 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3354 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3355 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3356 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3357 			add_x86_feature(featureset, X86FSET_IBPB);
3358 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3359 			add_x86_feature(featureset, X86FSET_IBRS);
3360 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3361 			add_x86_feature(featureset, X86FSET_STIBP);
3362 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3363 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3364 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3365 			add_x86_feature(featureset, X86FSET_SSBD);
3366 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3367 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3368 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3369 			add_x86_feature(featureset, X86FSET_SSB_NO);
3370 
3371 		/*
3372 		 * Rather than Enhanced IBRS, AMD has a different feature that
3373 		 * is a bit in EFER that can be enabled and will basically do
3374 		 * the right thing while executing in the kernel.
3375 		 */
3376 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3377 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3378 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3379 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3380 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3381 		}
3382 
3383 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3384 	    cpi->cpi_maxeax >= 7) {
3385 		struct cpuid_regs *ecp;
3386 		ecp = &cpi->cpi_std[7];
3387 
3388 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3389 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3390 		}
3391 
3392 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3393 			add_x86_feature(featureset, X86FSET_IBRS);
3394 			add_x86_feature(featureset, X86FSET_IBPB);
3395 		}
3396 
3397 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3398 			add_x86_feature(featureset, X86FSET_STIBP);
3399 		}
3400 
3401 		/*
3402 		 * Some prediction controls are enumerated by subleaf 2 of
3403 		 * leaf 7.
3404 		 */
3405 		if (CPI_FEATURES_7_2_EDX(cpi) & CPUID_INTC_EDX_7_2_BHI_CTRL) {
3406 			add_x86_feature(featureset, X86FSET_BHI_CTRL);
3407 		}
3408 
3409 		/*
3410 		 * Don't read the arch caps MSR on xpv where we lack the
3411 		 * on_trap().
3412 		 */
3413 #ifndef __xpv
3414 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3415 			on_trap_data_t otd;
3416 
3417 			/*
3418 			 * Be paranoid and assume we'll get a #GP.
3419 			 */
3420 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3421 				uint64_t reg;
3422 
3423 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3424 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3425 					add_x86_feature(featureset,
3426 					    X86FSET_RDCL_NO);
3427 				}
3428 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3429 					add_x86_feature(featureset,
3430 					    X86FSET_IBRS_ALL);
3431 				}
3432 				if (reg & IA32_ARCH_CAP_RSBA) {
3433 					add_x86_feature(featureset,
3434 					    X86FSET_RSBA);
3435 				}
3436 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3437 					add_x86_feature(featureset,
3438 					    X86FSET_L1D_VM_NO);
3439 				}
3440 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3441 					add_x86_feature(featureset,
3442 					    X86FSET_SSB_NO);
3443 				}
3444 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3445 					add_x86_feature(featureset,
3446 					    X86FSET_MDS_NO);
3447 				}
3448 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3449 					add_x86_feature(featureset,
3450 					    X86FSET_TSX_CTRL);
3451 				}
3452 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3453 					add_x86_feature(featureset,
3454 					    X86FSET_TAA_NO);
3455 				}
3456 				if (reg & IA32_ARCH_CAP_RFDS_NO) {
3457 					add_x86_feature(featureset,
3458 					    X86FSET_RFDS_NO);
3459 				}
3460 				if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3461 					add_x86_feature(featureset,
3462 					    X86FSET_RFDS_CLEAR);
3463 				}
3464 				if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3465 					add_x86_feature(featureset,
3466 					    X86FSET_PBRSB_NO);
3467 				}
3468 				if (reg & IA32_ARCH_CAP_BHI_NO) {
3469 					add_x86_feature(featureset,
3470 					    X86FSET_BHI_NO);
3471 				}
3472 			}
3473 			no_trap();
3474 		}
3475 #endif	/* !__xpv */
3476 
3477 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3478 			add_x86_feature(featureset, X86FSET_SSBD);
3479 
3480 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3481 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3482 	}
3483 
3484 	/*
3485 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3486 	 * will have already run this function and determined what we need to
3487 	 * do. This gives us a hook for per-HW thread mitigations such as
3488 	 * enhanced IBRS, or disabling TSX.
3489 	 */
3490 	if (cpu->cpu_id != 0) {
3491 		switch (x86_spectrev2_mitigation) {
3492 		case X86_SPECTREV2_ENHANCED_IBRS:
3493 			cpuid_enable_enhanced_ibrs();
3494 			break;
3495 		case X86_SPECTREV2_AUTO_IBRS:
3496 			cpuid_enable_auto_ibrs();
3497 			break;
3498 		default:
3499 			break;
3500 		}
3501 
3502 		/* If we're committed to BHI_DIS_S, set it for this core. */
3503 		if (x86_bhi_mitigation == X86_BHI_DIS_S)
3504 			cpuid_enable_bhi_dis_s();
3505 
3506 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3507 		return;
3508 	}
3509 
3510 	/*
3511 	 * Go through and initialize various security mechanisms that we should
3512 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3513 	 * TAA.
3514 	 */
3515 
3516 	/*
3517 	 * By default we've come in with retpolines enabled. Check whether we
3518 	 * should disable them or enable enhanced or automatic IBRS.
3519 	 *
3520 	 * Note, we do not allow the use of AMD optimized retpolines as it was
3521 	 * disclosed by AMD in March 2022 that they were still
3522 	 * vulnerable. Prior to that point, we used them.
3523 	 */
3524 	if (x86_disable_spectrev2 != 0) {
3525 		v2mit = X86_SPECTREV2_DISABLED;
3526 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3527 		cpuid_enable_auto_ibrs();
3528 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3529 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3530 		cpuid_enable_enhanced_ibrs();
3531 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3532 	} else {
3533 		v2mit = X86_SPECTREV2_RETPOLINE;
3534 	}
3535 
3536 	cpuid_patch_retpolines(v2mit);
3537 	cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3538 	x86_bhi_mitigation = cpuid_learn_and_patch_bhi(v2mit, cpu, featureset);
3539 	x86_spectrev2_mitigation = v2mit;
3540 	membar_producer();
3541 
3542 	/*
3543 	 * We need to determine what changes are required for mitigating L1TF
3544 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3545 	 * is required.
3546 	 *
3547 	 * If any of these are present, then we need to flush u-arch state at
3548 	 * various points. For MDS, we need to do so whenever we change to a
3549 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3550 	 * flush the L1D cache at VM entry. When we have microcode that handles
3551 	 * MDS, the L1D flush also clears the other u-arch state that the
3552 	 * md_clear does.
3553 	 */
3554 
3555 	/*
3556 	 * Update whether or not we need to be taking explicit action against
3557 	 * MDS or RFDS.
3558 	 */
3559 	cpuid_update_md_clear(cpu, featureset);
3560 
3561 	/*
3562 	 * Determine whether SMT exclusion is required and whether or not we
3563 	 * need to perform an l1d flush.
3564 	 */
3565 	cpuid_update_l1d_flush(cpu, featureset);
3566 
3567 	/*
3568 	 * Determine what our mitigation strategy should be for TAA and then
3569 	 * also apply TAA mitigations.
3570 	 */
3571 	cpuid_update_tsx(cpu, featureset);
3572 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3573 }
3574 
3575 /*
3576  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3577  */
3578 void
setup_xfem(void)3579 setup_xfem(void)
3580 {
3581 	uint64_t flags = XFEATURE_LEGACY_FP;
3582 
3583 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3584 
3585 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3586 		flags |= XFEATURE_SSE;
3587 
3588 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3589 		flags |= XFEATURE_AVX;
3590 
3591 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3592 		flags |= XFEATURE_AVX512;
3593 
3594 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3595 
3596 	xsave_bv_all = flags;
3597 }
3598 
3599 static void
cpuid_basic_topology(cpu_t * cpu,uchar_t * featureset)3600 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3601 {
3602 	struct cpuid_info *cpi;
3603 
3604 	cpi = cpu->cpu_m.mcpu_cpi;
3605 
3606 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3607 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3608 		cpuid_gather_amd_topology_leaves(cpu);
3609 	}
3610 
3611 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3612 
3613 	/*
3614 	 * Before we can calculate the IDs that we should assign to this
3615 	 * processor, we need to understand how many cores and threads it has.
3616 	 */
3617 	switch (cpi->cpi_vendor) {
3618 	case X86_VENDOR_Intel:
3619 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3620 		    &cpi->cpi_ncore_per_chip);
3621 		break;
3622 	case X86_VENDOR_AMD:
3623 	case X86_VENDOR_HYGON:
3624 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3625 		    &cpi->cpi_ncore_per_chip);
3626 		break;
3627 	default:
3628 		/*
3629 		 * If we have some other x86 compatible chip, it's not clear how
3630 		 * they would behave. The most common case is virtualization
3631 		 * today, though there are also 64-bit VIA chips. Assume that
3632 		 * all we can get is the basic Leaf 1 HTT information.
3633 		 */
3634 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3635 			cpi->cpi_ncore_per_chip = 1;
3636 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3637 		}
3638 		break;
3639 	}
3640 
3641 	/*
3642 	 * Based on the calculated number of threads and cores, potentially
3643 	 * assign the HTT and CMT features.
3644 	 */
3645 	if (cpi->cpi_ncore_per_chip > 1) {
3646 		add_x86_feature(featureset, X86FSET_CMP);
3647 	}
3648 
3649 	if (cpi->cpi_ncpu_per_chip > 1 &&
3650 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3651 		add_x86_feature(featureset, X86FSET_HTT);
3652 	}
3653 
3654 	/*
3655 	 * Now that has been set up, we need to go through and calculate all of
3656 	 * the rest of the parameters that exist. If we think the CPU doesn't
3657 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3658 	 * up information in some way. The most likely case for this is
3659 	 * virtualization where we have a lot of partial topology information.
3660 	 */
3661 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3662 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3663 		/*
3664 		 * This is a single core, single-threaded processor.
3665 		 */
3666 		cpi->cpi_procnodes_per_pkg = 1;
3667 		cpi->cpi_cores_per_compunit = 1;
3668 		cpi->cpi_compunitid = 0;
3669 		cpi->cpi_chipid = -1;
3670 		cpi->cpi_clogid = 0;
3671 		cpi->cpi_coreid = cpu->cpu_id;
3672 		cpi->cpi_pkgcoreid = 0;
3673 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3674 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3675 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3676 		} else {
3677 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3678 		}
3679 	} else {
3680 		switch (cpi->cpi_vendor) {
3681 		case X86_VENDOR_Intel:
3682 			cpuid_intel_getids(cpu, featureset);
3683 			break;
3684 		case X86_VENDOR_AMD:
3685 		case X86_VENDOR_HYGON:
3686 			cpuid_amd_getids(cpu, featureset);
3687 			break;
3688 		default:
3689 			/*
3690 			 * In this case, it's hard to say what we should do.
3691 			 * We're going to model them to the OS as single core
3692 			 * threads. We don't have a good identifier for them, so
3693 			 * we're just going to use the cpu id all on a single
3694 			 * chip.
3695 			 *
3696 			 * This case has historically been different from the
3697 			 * case above where we don't have HTT or CMP. While they
3698 			 * could be combined, we've opted to keep it separate to
3699 			 * minimize the risk of topology changes in weird cases.
3700 			 */
3701 			cpi->cpi_procnodes_per_pkg = 1;
3702 			cpi->cpi_cores_per_compunit = 1;
3703 			cpi->cpi_chipid = 0;
3704 			cpi->cpi_coreid = cpu->cpu_id;
3705 			cpi->cpi_clogid = cpu->cpu_id;
3706 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3707 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3708 			cpi->cpi_compunitid = cpi->cpi_coreid;
3709 			break;
3710 		}
3711 	}
3712 }
3713 
3714 /*
3715  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3716  * always gather leaf 6 if it's supported; however, we only look for features on
3717  * Intel systems as AMD does not currently define any of the features we look
3718  * for below.
3719  */
3720 static void
cpuid_basic_thermal(cpu_t * cpu,uchar_t * featureset)3721 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3722 {
3723 	struct cpuid_regs *cp;
3724 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3725 
3726 	if (cpi->cpi_maxeax < 6) {
3727 		return;
3728 	}
3729 
3730 	cp = &cpi->cpi_std[6];
3731 	cp->cp_eax = 6;
3732 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3733 	(void) __cpuid_insn(cp);
3734 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3735 
3736 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3737 		return;
3738 	}
3739 
3740 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3741 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3742 	}
3743 
3744 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3745 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3746 	}
3747 }
3748 
3749 /*
3750  * This is used when we discover that we have AVX support in cpuid. This
3751  * proceeds to scan for the rest of the AVX derived features.
3752  */
3753 static void
cpuid_basic_avx(cpu_t * cpu,uchar_t * featureset)3754 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3755 {
3756 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3757 
3758 	/*
3759 	 * If we don't have AVX, don't bother with most of this.
3760 	 */
3761 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3762 		return;
3763 
3764 	add_x86_feature(featureset, X86FSET_AVX);
3765 
3766 	/*
3767 	 * Intel says we can't check these without also
3768 	 * checking AVX.
3769 	 */
3770 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3771 		add_x86_feature(featureset, X86FSET_F16C);
3772 
3773 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3774 		add_x86_feature(featureset, X86FSET_FMA);
3775 
3776 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3777 		add_x86_feature(featureset, X86FSET_BMI1);
3778 
3779 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3780 		add_x86_feature(featureset, X86FSET_BMI2);
3781 
3782 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3783 		add_x86_feature(featureset, X86FSET_AVX2);
3784 
3785 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3786 		add_x86_feature(featureset, X86FSET_VAES);
3787 
3788 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3789 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3790 
3791 	/*
3792 	 * The rest of the AVX features require AVX512. Do not check them unless
3793 	 * it is present.
3794 	 */
3795 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3796 		return;
3797 	add_x86_feature(featureset, X86FSET_AVX512F);
3798 
3799 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3800 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3801 
3802 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3803 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3804 
3805 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3806 		add_x86_feature(featureset, X86FSET_AVX512PF);
3807 
3808 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3809 		add_x86_feature(featureset, X86FSET_AVX512ER);
3810 
3811 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3812 		add_x86_feature(featureset, X86FSET_AVX512CD);
3813 
3814 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3815 		add_x86_feature(featureset, X86FSET_AVX512BW);
3816 
3817 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3818 		add_x86_feature(featureset, X86FSET_AVX512VL);
3819 
3820 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3821 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3822 
3823 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3824 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3825 
3826 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3827 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3828 
3829 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3830 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3831 
3832 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3833 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3834 
3835 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3836 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3837 
3838 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3839 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3840 
3841 	/*
3842 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3843 	 * we don't need to.
3844 	 */
3845 	if (cpi->cpi_std[7].cp_eax < 1)
3846 		return;
3847 
3848 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3849 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3850 }
3851 
3852 /*
3853  * PPIN is the protected processor inventory number. On AMD this is an actual
3854  * feature bit. However, on Intel systems we need to read the platform
3855  * information MSR if we're on a specific model.
3856  */
3857 #if !defined(__xpv)
3858 static void
cpuid_basic_ppin(cpu_t * cpu,uchar_t * featureset)3859 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3860 {
3861 	on_trap_data_t otd;
3862 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3863 
3864 	switch (cpi->cpi_vendor) {
3865 	case X86_VENDOR_AMD:
3866 		/*
3867 		 * This leaf will have already been gathered in the topology
3868 		 * functions.
3869 		 */
3870 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3871 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3872 				add_x86_feature(featureset, X86FSET_PPIN);
3873 			}
3874 		}
3875 		break;
3876 	case X86_VENDOR_Intel:
3877 		if (cpi->cpi_family != 6)
3878 			break;
3879 		switch (cpi->cpi_model) {
3880 		case INTC_MODEL_IVYBRIDGE_XEON:
3881 		case INTC_MODEL_HASWELL_XEON:
3882 		case INTC_MODEL_BROADWELL_XEON:
3883 		case INTC_MODEL_BROADWELL_XEON_D:
3884 		case INTC_MODEL_SKYLAKE_XEON:
3885 		case INTC_MODEL_ICELAKE_XEON:
3886 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3887 				uint64_t value;
3888 
3889 				value = rdmsr(MSR_PLATFORM_INFO);
3890 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3891 					add_x86_feature(featureset,
3892 					    X86FSET_PPIN);
3893 				}
3894 			}
3895 			no_trap();
3896 			break;
3897 		default:
3898 			break;
3899 		}
3900 		break;
3901 	default:
3902 		break;
3903 	}
3904 }
3905 #endif	/* ! __xpv */
3906 
3907 static void
cpuid_pass_prelude(cpu_t * cpu,void * arg)3908 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3909 {
3910 	uchar_t *featureset = (uchar_t *)arg;
3911 
3912 	/*
3913 	 * We don't run on any processor that doesn't have cpuid, and could not
3914 	 * possibly have arrived here.
3915 	 */
3916 	add_x86_feature(featureset, X86FSET_CPUID);
3917 }
3918 
3919 static void
cpuid_pass_ident(cpu_t * cpu,void * arg __unused)3920 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3921 {
3922 	struct cpuid_info *cpi;
3923 	struct cpuid_regs *cp;
3924 
3925 	/*
3926 	 * We require that virtual/native detection be complete and that PCI
3927 	 * config space access has been set up; at present there is no reliable
3928 	 * way to determine the latter.
3929 	 */
3930 #if !defined(__xpv)
3931 	ASSERT3S(platform_type, !=, -1);
3932 #endif	/* !__xpv */
3933 
3934 	cpi = cpu->cpu_m.mcpu_cpi;
3935 	ASSERT(cpi != NULL);
3936 
3937 	cp = &cpi->cpi_std[0];
3938 	cp->cp_eax = 0;
3939 	cpi->cpi_maxeax = __cpuid_insn(cp);
3940 	{
3941 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3942 		*iptr++ = cp->cp_ebx;
3943 		*iptr++ = cp->cp_edx;
3944 		*iptr++ = cp->cp_ecx;
3945 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3946 	}
3947 
3948 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3949 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3950 
3951 	/*
3952 	 * Limit the range in case of weird hardware
3953 	 */
3954 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3955 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3956 	if (cpi->cpi_maxeax < 1)
3957 		return;
3958 
3959 	cp = &cpi->cpi_std[1];
3960 	cp->cp_eax = 1;
3961 	(void) __cpuid_insn(cp);
3962 
3963 	/*
3964 	 * Extract identifying constants for easy access.
3965 	 */
3966 	cpi->cpi_model = CPI_MODEL(cpi);
3967 	cpi->cpi_family = CPI_FAMILY(cpi);
3968 
3969 	if (cpi->cpi_family == 0xf)
3970 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3971 
3972 	/*
3973 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3974 	 * Intel, and presumably everyone else, uses model == 0xf, as
3975 	 * one would expect (max value means possible overflow).  Sigh.
3976 	 */
3977 
3978 	switch (cpi->cpi_vendor) {
3979 	case X86_VENDOR_Intel:
3980 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3981 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3982 		break;
3983 	case X86_VENDOR_AMD:
3984 		if (CPI_FAMILY(cpi) == 0xf)
3985 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3986 		break;
3987 	case X86_VENDOR_HYGON:
3988 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3989 		break;
3990 	default:
3991 		if (cpi->cpi_model == 0xf)
3992 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3993 		break;
3994 	}
3995 
3996 	cpi->cpi_step = CPI_STEP(cpi);
3997 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3998 
3999 	/*
4000 	 * Synthesize chip "revision" and socket type
4001 	 */
4002 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4003 	    cpi->cpi_model, cpi->cpi_step);
4004 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4005 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4006 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4007 	    cpi->cpi_model, cpi->cpi_step);
4008 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
4009 	    cpi->cpi_model, cpi->cpi_step);
4010 }
4011 
4012 static void
cpuid_pass_basic(cpu_t * cpu,void * arg)4013 cpuid_pass_basic(cpu_t *cpu, void *arg)
4014 {
4015 	uchar_t *featureset = (uchar_t *)arg;
4016 	uint32_t mask_ecx, mask_edx;
4017 	struct cpuid_info *cpi;
4018 	struct cpuid_regs *cp;
4019 	int xcpuid;
4020 #if !defined(__xpv)
4021 	extern int idle_cpu_prefer_mwait;
4022 #endif
4023 
4024 	cpi = cpu->cpu_m.mcpu_cpi;
4025 	ASSERT(cpi != NULL);
4026 
4027 	if (cpi->cpi_maxeax < 1)
4028 		return;
4029 
4030 	/*
4031 	 * This was filled during the identification pass.
4032 	 */
4033 	cp = &cpi->cpi_std[1];
4034 
4035 	/*
4036 	 * *default* assumptions:
4037 	 * - believe %edx feature word
4038 	 * - ignore %ecx feature word
4039 	 * - 32-bit virtual and physical addressing
4040 	 */
4041 	mask_edx = 0xffffffff;
4042 	mask_ecx = 0;
4043 
4044 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
4045 
4046 	switch (cpi->cpi_vendor) {
4047 	case X86_VENDOR_Intel:
4048 		if (cpi->cpi_family == 5)
4049 			x86_type = X86_TYPE_P5;
4050 		else if (IS_LEGACY_P6(cpi)) {
4051 			x86_type = X86_TYPE_P6;
4052 			pentiumpro_bug4046376 = 1;
4053 			/*
4054 			 * Clear the SEP bit when it was set erroneously
4055 			 */
4056 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
4057 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
4058 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
4059 			x86_type = X86_TYPE_P4;
4060 			/*
4061 			 * We don't currently depend on any of the %ecx
4062 			 * features until Prescott, so we'll only check
4063 			 * this from P4 onwards.  We might want to revisit
4064 			 * that idea later.
4065 			 */
4066 			mask_ecx = 0xffffffff;
4067 		} else if (cpi->cpi_family > 0xf)
4068 			mask_ecx = 0xffffffff;
4069 		/*
4070 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4071 		 * to obtain the monitor linesize.
4072 		 */
4073 		if (cpi->cpi_maxeax < 5)
4074 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4075 		break;
4076 	case X86_VENDOR_IntelClone:
4077 	default:
4078 		break;
4079 	case X86_VENDOR_AMD:
4080 #if defined(OPTERON_ERRATUM_108)
4081 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
4082 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
4083 			cpi->cpi_model = 0xc;
4084 		} else
4085 #endif
4086 		if (cpi->cpi_family == 5) {
4087 			/*
4088 			 * AMD K5 and K6
4089 			 *
4090 			 * These CPUs have an incomplete implementation
4091 			 * of MCA/MCE which we mask away.
4092 			 */
4093 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
4094 
4095 			/*
4096 			 * Model 0 uses the wrong (APIC) bit
4097 			 * to indicate PGE.  Fix it here.
4098 			 */
4099 			if (cpi->cpi_model == 0) {
4100 				if (cp->cp_edx & 0x200) {
4101 					cp->cp_edx &= ~0x200;
4102 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
4103 				}
4104 			}
4105 
4106 			/*
4107 			 * Early models had problems w/ MMX; disable.
4108 			 */
4109 			if (cpi->cpi_model < 6)
4110 				mask_edx &= ~CPUID_INTC_EDX_MMX;
4111 		}
4112 
4113 		/*
4114 		 * For newer families, SSE3 and CX16, at least, are valid;
4115 		 * enable all
4116 		 */
4117 		if (cpi->cpi_family >= 0xf)
4118 			mask_ecx = 0xffffffff;
4119 		/*
4120 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4121 		 * to obtain the monitor linesize.
4122 		 */
4123 		if (cpi->cpi_maxeax < 5)
4124 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4125 
4126 #if !defined(__xpv)
4127 		/*
4128 		 * AMD has not historically used MWAIT in the CPU's idle loop.
4129 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
4130 		 * know for certain that in at least family 17h, per AMD, mwait
4131 		 * is preferred. Families in-between are less certain.
4132 		 */
4133 		if (cpi->cpi_family < 0x17) {
4134 			idle_cpu_prefer_mwait = 0;
4135 		}
4136 #endif
4137 
4138 		break;
4139 	case X86_VENDOR_HYGON:
4140 		/* Enable all for Hygon Dhyana CPU */
4141 		mask_ecx = 0xffffffff;
4142 		break;
4143 	case X86_VENDOR_TM:
4144 		/*
4145 		 * workaround the NT workaround in CMS 4.1
4146 		 */
4147 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
4148 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
4149 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4150 		break;
4151 	case X86_VENDOR_Centaur:
4152 		/*
4153 		 * workaround the NT workarounds again
4154 		 */
4155 		if (cpi->cpi_family == 6)
4156 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4157 		break;
4158 	case X86_VENDOR_Cyrix:
4159 		/*
4160 		 * We rely heavily on the probing in locore
4161 		 * to actually figure out what parts, if any,
4162 		 * of the Cyrix cpuid instruction to believe.
4163 		 */
4164 		switch (x86_type) {
4165 		case X86_TYPE_CYRIX_486:
4166 			mask_edx = 0;
4167 			break;
4168 		case X86_TYPE_CYRIX_6x86:
4169 			mask_edx = 0;
4170 			break;
4171 		case X86_TYPE_CYRIX_6x86L:
4172 			mask_edx =
4173 			    CPUID_INTC_EDX_DE |
4174 			    CPUID_INTC_EDX_CX8;
4175 			break;
4176 		case X86_TYPE_CYRIX_6x86MX:
4177 			mask_edx =
4178 			    CPUID_INTC_EDX_DE |
4179 			    CPUID_INTC_EDX_MSR |
4180 			    CPUID_INTC_EDX_CX8 |
4181 			    CPUID_INTC_EDX_PGE |
4182 			    CPUID_INTC_EDX_CMOV |
4183 			    CPUID_INTC_EDX_MMX;
4184 			break;
4185 		case X86_TYPE_CYRIX_GXm:
4186 			mask_edx =
4187 			    CPUID_INTC_EDX_MSR |
4188 			    CPUID_INTC_EDX_CX8 |
4189 			    CPUID_INTC_EDX_CMOV |
4190 			    CPUID_INTC_EDX_MMX;
4191 			break;
4192 		case X86_TYPE_CYRIX_MediaGX:
4193 			break;
4194 		case X86_TYPE_CYRIX_MII:
4195 		case X86_TYPE_VIA_CYRIX_III:
4196 			mask_edx =
4197 			    CPUID_INTC_EDX_DE |
4198 			    CPUID_INTC_EDX_TSC |
4199 			    CPUID_INTC_EDX_MSR |
4200 			    CPUID_INTC_EDX_CX8 |
4201 			    CPUID_INTC_EDX_PGE |
4202 			    CPUID_INTC_EDX_CMOV |
4203 			    CPUID_INTC_EDX_MMX;
4204 			break;
4205 		default:
4206 			break;
4207 		}
4208 		break;
4209 	}
4210 
4211 #if defined(__xpv)
4212 	/*
4213 	 * Do not support MONITOR/MWAIT under a hypervisor
4214 	 */
4215 	mask_ecx &= ~CPUID_INTC_ECX_MON;
4216 	/*
4217 	 * Do not support XSAVE under a hypervisor for now
4218 	 */
4219 	xsave_force_disable = B_TRUE;
4220 
4221 #endif	/* __xpv */
4222 
4223 	if (xsave_force_disable) {
4224 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4225 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
4226 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
4227 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
4228 	}
4229 
4230 	/*
4231 	 * Now we've figured out the masks that determine
4232 	 * which bits we choose to believe, apply the masks
4233 	 * to the feature words, then map the kernel's view
4234 	 * of these feature words into its feature word.
4235 	 */
4236 	cp->cp_edx &= mask_edx;
4237 	cp->cp_ecx &= mask_ecx;
4238 
4239 	/*
4240 	 * apply any platform restrictions (we don't call this
4241 	 * immediately after __cpuid_insn here, because we need the
4242 	 * workarounds applied above first)
4243 	 */
4244 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4245 
4246 	/*
4247 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4248 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4249 	 * 7 has sub-leaves determined by ecx.
4250 	 */
4251 	if (cpi->cpi_maxeax >= 7) {
4252 		struct cpuid_regs *ecp;
4253 		ecp = &cpi->cpi_std[7];
4254 		ecp->cp_eax = 7;
4255 		ecp->cp_ecx = 0;
4256 		(void) __cpuid_insn(ecp);
4257 
4258 		/*
4259 		 * If XSAVE has been disabled, just ignore all of the
4260 		 * extended-save-area dependent flags here. By removing most of
4261 		 * the leaf 7, sub-leaf 0 flags, that will ensure tha we don't
4262 		 * end up looking at additional xsave dependent leaves right
4263 		 * now.
4264 		 */
4265 		if (xsave_force_disable) {
4266 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4267 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4268 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4269 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4270 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4271 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4272 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4273 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4274 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4275 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4276 		}
4277 
4278 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4279 			add_x86_feature(featureset, X86FSET_SMEP);
4280 
4281 		/*
4282 		 * We check disable_smap here in addition to in startup_smap()
4283 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4284 		 * include it in the feature set and thus generate a mismatched
4285 		 * x86 feature set across CPUs.
4286 		 */
4287 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4288 		    disable_smap == 0)
4289 			add_x86_feature(featureset, X86FSET_SMAP);
4290 
4291 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4292 			add_x86_feature(featureset, X86FSET_RDSEED);
4293 
4294 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4295 			add_x86_feature(featureset, X86FSET_ADX);
4296 
4297 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4298 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4299 
4300 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4301 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4302 
4303 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4304 			add_x86_feature(featureset, X86FSET_INVPCID);
4305 
4306 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4307 			add_x86_feature(featureset, X86FSET_UMIP);
4308 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4309 			add_x86_feature(featureset, X86FSET_PKU);
4310 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4311 			add_x86_feature(featureset, X86FSET_OSPKE);
4312 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4313 			add_x86_feature(featureset, X86FSET_GFNI);
4314 
4315 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4316 			add_x86_feature(featureset, X86FSET_CLWB);
4317 
4318 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4319 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4320 				add_x86_feature(featureset, X86FSET_MPX);
4321 		}
4322 
4323 		/*
4324 		 * If we have subleaf 1 or 2 available, grab and store
4325 		 * that. This is used for more AVX and related features.
4326 		 */
4327 		if (ecp->cp_eax >= 1) {
4328 			struct cpuid_regs *c71;
4329 			c71 = &cpi->cpi_sub7[0];
4330 			c71->cp_eax = 7;
4331 			c71->cp_ecx = 1;
4332 			(void) __cpuid_insn(c71);
4333 		}
4334 
4335 		/* Subleaf 2 has certain security indicators in it. */
4336 		if (ecp->cp_eax >= 2) {
4337 			struct cpuid_regs *c72;
4338 			c72 = &cpi->cpi_sub7[1];
4339 			c72->cp_eax = 7;
4340 			c72->cp_ecx = 2;
4341 			(void) __cpuid_insn(c72);
4342 		}
4343 	}
4344 
4345 	/*
4346 	 * fold in overrides from the "eeprom" mechanism
4347 	 */
4348 	cp->cp_edx |= cpuid_feature_edx_include;
4349 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4350 
4351 	cp->cp_ecx |= cpuid_feature_ecx_include;
4352 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4353 
4354 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4355 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4356 	}
4357 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4358 		add_x86_feature(featureset, X86FSET_TSC);
4359 	}
4360 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4361 		add_x86_feature(featureset, X86FSET_MSR);
4362 	}
4363 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4364 		add_x86_feature(featureset, X86FSET_MTRR);
4365 	}
4366 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4367 		add_x86_feature(featureset, X86FSET_PGE);
4368 	}
4369 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4370 		add_x86_feature(featureset, X86FSET_CMOV);
4371 	}
4372 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4373 		add_x86_feature(featureset, X86FSET_MMX);
4374 	}
4375 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4376 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4377 		add_x86_feature(featureset, X86FSET_MCA);
4378 	}
4379 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4380 		add_x86_feature(featureset, X86FSET_PAE);
4381 	}
4382 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4383 		add_x86_feature(featureset, X86FSET_CX8);
4384 	}
4385 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4386 		add_x86_feature(featureset, X86FSET_CX16);
4387 	}
4388 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4389 		add_x86_feature(featureset, X86FSET_PAT);
4390 	}
4391 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4392 		add_x86_feature(featureset, X86FSET_SEP);
4393 	}
4394 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4395 		/*
4396 		 * In our implementation, fxsave/fxrstor
4397 		 * are prerequisites before we'll even
4398 		 * try and do SSE things.
4399 		 */
4400 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4401 			add_x86_feature(featureset, X86FSET_SSE);
4402 		}
4403 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4404 			add_x86_feature(featureset, X86FSET_SSE2);
4405 		}
4406 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4407 			add_x86_feature(featureset, X86FSET_SSE3);
4408 		}
4409 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4410 			add_x86_feature(featureset, X86FSET_SSSE3);
4411 		}
4412 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4413 			add_x86_feature(featureset, X86FSET_SSE4_1);
4414 		}
4415 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4416 			add_x86_feature(featureset, X86FSET_SSE4_2);
4417 		}
4418 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4419 			add_x86_feature(featureset, X86FSET_AES);
4420 		}
4421 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4422 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4423 		}
4424 
4425 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4426 			add_x86_feature(featureset, X86FSET_SHA);
4427 
4428 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4429 			add_x86_feature(featureset, X86FSET_XSAVE);
4430 
4431 			/* We only test AVX & AVX512 when there is XSAVE */
4432 			cpuid_basic_avx(cpu, featureset);
4433 		}
4434 	}
4435 
4436 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4437 		add_x86_feature(featureset, X86FSET_PCID);
4438 	}
4439 
4440 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4441 		add_x86_feature(featureset, X86FSET_X2APIC);
4442 	}
4443 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4444 		add_x86_feature(featureset, X86FSET_DE);
4445 	}
4446 #if !defined(__xpv)
4447 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4448 
4449 		/*
4450 		 * We require the CLFLUSH instruction for erratum workaround
4451 		 * to use MONITOR/MWAIT.
4452 		 */
4453 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4454 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4455 			add_x86_feature(featureset, X86FSET_MWAIT);
4456 		} else {
4457 			extern int idle_cpu_assert_cflush_monitor;
4458 
4459 			/*
4460 			 * All processors we are aware of which have
4461 			 * MONITOR/MWAIT also have CLFLUSH.
4462 			 */
4463 			if (idle_cpu_assert_cflush_monitor) {
4464 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4465 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4466 			}
4467 		}
4468 	}
4469 #endif	/* __xpv */
4470 
4471 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4472 		add_x86_feature(featureset, X86FSET_VMX);
4473 	}
4474 
4475 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4476 		add_x86_feature(featureset, X86FSET_RDRAND);
4477 
4478 	/*
4479 	 * Only need it first time, rest of the cpus would follow suit.
4480 	 * we only capture this for the bootcpu.
4481 	 */
4482 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4483 		add_x86_feature(featureset, X86FSET_CLFSH);
4484 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4485 	}
4486 	if (is_x86_feature(featureset, X86FSET_PAE))
4487 		cpi->cpi_pabits = 36;
4488 
4489 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4490 		struct cpuid_regs r, *ecp;
4491 
4492 		ecp = &r;
4493 		ecp->cp_eax = 0xD;
4494 		ecp->cp_ecx = 1;
4495 		ecp->cp_edx = ecp->cp_ebx = 0;
4496 		(void) __cpuid_insn(ecp);
4497 
4498 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4499 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4500 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4501 			add_x86_feature(featureset, X86FSET_XSAVEC);
4502 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4503 			add_x86_feature(featureset, X86FSET_XSAVES);
4504 
4505 		/*
4506 		 * Zen 2 family processors suffer from erratum 1386 that causes
4507 		 * xsaves to not function correctly in some circumstances. There
4508 		 * are no supervisor states in Zen 2 and earlier. Practically
4509 		 * speaking this has no impact for us as we currently do not
4510 		 * leverage compressed xsave formats. To safeguard against
4511 		 * issues in the future where we may opt to using it, we remove
4512 		 * it from the feature set now. While Matisse has a microcode
4513 		 * update available with a fix, not all Zen 2 CPUs do so it's
4514 		 * simpler for the moment to unconditionally remove it.
4515 		 */
4516 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4517 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4518 			remove_x86_feature(featureset, X86FSET_XSAVES);
4519 		}
4520 	}
4521 
4522 	/*
4523 	 * Work on the "extended" feature information, doing
4524 	 * some basic initialization to be used in the extended pass.
4525 	 */
4526 	xcpuid = 0;
4527 	switch (cpi->cpi_vendor) {
4528 	case X86_VENDOR_Intel:
4529 		/*
4530 		 * On KVM we know we will have proper support for extended
4531 		 * cpuid.
4532 		 */
4533 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4534 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4535 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4536 			xcpuid++;
4537 		break;
4538 	case X86_VENDOR_AMD:
4539 		if (cpi->cpi_family > 5 ||
4540 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4541 			xcpuid++;
4542 		break;
4543 	case X86_VENDOR_Cyrix:
4544 		/*
4545 		 * Only these Cyrix CPUs are -known- to support
4546 		 * extended cpuid operations.
4547 		 */
4548 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4549 		    x86_type == X86_TYPE_CYRIX_GXm)
4550 			xcpuid++;
4551 		break;
4552 	case X86_VENDOR_HYGON:
4553 	case X86_VENDOR_Centaur:
4554 	case X86_VENDOR_TM:
4555 	default:
4556 		xcpuid++;
4557 		break;
4558 	}
4559 
4560 	if (xcpuid) {
4561 		cp = &cpi->cpi_extd[0];
4562 		cp->cp_eax = CPUID_LEAF_EXT_0;
4563 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4564 	}
4565 
4566 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4567 
4568 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4569 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4570 
4571 		switch (cpi->cpi_vendor) {
4572 		case X86_VENDOR_Intel:
4573 		case X86_VENDOR_AMD:
4574 		case X86_VENDOR_HYGON:
4575 			if (cpi->cpi_xmaxeax < 0x80000001)
4576 				break;
4577 			cp = &cpi->cpi_extd[1];
4578 			cp->cp_eax = 0x80000001;
4579 			(void) __cpuid_insn(cp);
4580 
4581 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4582 			    cpi->cpi_family == 5 &&
4583 			    cpi->cpi_model == 6 &&
4584 			    cpi->cpi_step == 6) {
4585 				/*
4586 				 * K6 model 6 uses bit 10 to indicate SYSC
4587 				 * Later models use bit 11. Fix it here.
4588 				 */
4589 				if (cp->cp_edx & 0x400) {
4590 					cp->cp_edx &= ~0x400;
4591 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4592 				}
4593 			}
4594 
4595 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4596 
4597 			/*
4598 			 * Compute the additions to the kernel's feature word.
4599 			 */
4600 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4601 				add_x86_feature(featureset, X86FSET_NX);
4602 			}
4603 
4604 			/*
4605 			 * Regardless whether or not we boot 64-bit,
4606 			 * we should have a way to identify whether
4607 			 * the CPU is capable of running 64-bit.
4608 			 */
4609 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4610 				add_x86_feature(featureset, X86FSET_64);
4611 			}
4612 
4613 			/* 1 GB large page - enable only for 64 bit kernel */
4614 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4615 				add_x86_feature(featureset, X86FSET_1GPG);
4616 			}
4617 
4618 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4619 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4620 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4621 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4622 				add_x86_feature(featureset, X86FSET_SSE4A);
4623 			}
4624 
4625 			/*
4626 			 * It's really tricky to support syscall/sysret in
4627 			 * the i386 kernel; we rely on sysenter/sysexit
4628 			 * instead.  In the amd64 kernel, things are -way-
4629 			 * better.
4630 			 */
4631 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4632 				add_x86_feature(featureset, X86FSET_ASYSC);
4633 			}
4634 
4635 			/*
4636 			 * While we're thinking about system calls, note
4637 			 * that AMD processors don't support sysenter
4638 			 * in long mode at all, so don't try to program them.
4639 			 */
4640 			if (x86_vendor == X86_VENDOR_AMD ||
4641 			    x86_vendor == X86_VENDOR_HYGON) {
4642 				remove_x86_feature(featureset, X86FSET_SEP);
4643 			}
4644 
4645 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4646 				add_x86_feature(featureset, X86FSET_TSCP);
4647 			}
4648 
4649 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4650 				add_x86_feature(featureset, X86FSET_SVM);
4651 			}
4652 
4653 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4654 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4655 			}
4656 
4657 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4658 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4659 			}
4660 
4661 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4662 				add_x86_feature(featureset, X86FSET_XOP);
4663 			}
4664 
4665 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4666 				add_x86_feature(featureset, X86FSET_FMA4);
4667 			}
4668 
4669 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4670 				add_x86_feature(featureset, X86FSET_TBM);
4671 			}
4672 
4673 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4674 				add_x86_feature(featureset, X86FSET_MONITORX);
4675 			}
4676 			break;
4677 		default:
4678 			break;
4679 		}
4680 
4681 		/*
4682 		 * Get CPUID data about processor cores and hyperthreads.
4683 		 */
4684 		switch (cpi->cpi_vendor) {
4685 		case X86_VENDOR_Intel:
4686 			if (cpi->cpi_maxeax >= 4) {
4687 				cp = &cpi->cpi_std[4];
4688 				cp->cp_eax = 4;
4689 				cp->cp_ecx = 0;
4690 				(void) __cpuid_insn(cp);
4691 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4692 			}
4693 			/*FALLTHROUGH*/
4694 		case X86_VENDOR_AMD:
4695 		case X86_VENDOR_HYGON:
4696 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4697 				break;
4698 			cp = &cpi->cpi_extd[8];
4699 			cp->cp_eax = CPUID_LEAF_EXT_8;
4700 			(void) __cpuid_insn(cp);
4701 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4702 			    cp);
4703 
4704 			/*
4705 			 * AMD uses ebx for some extended functions.
4706 			 */
4707 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4708 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4709 				/*
4710 				 * While we're here, check for the AMD "Error
4711 				 * Pointer Zero/Restore" feature. This can be
4712 				 * used to setup the FP save handlers
4713 				 * appropriately.
4714 				 */
4715 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4716 					cpi->cpi_fp_amd_save = 0;
4717 				} else {
4718 					cpi->cpi_fp_amd_save = 1;
4719 				}
4720 
4721 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4722 					add_x86_feature(featureset,
4723 					    X86FSET_CLZERO);
4724 				}
4725 			}
4726 
4727 			/*
4728 			 * Virtual and physical address limits from
4729 			 * cpuid override previously guessed values.
4730 			 */
4731 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4732 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4733 			break;
4734 		default:
4735 			break;
4736 		}
4737 
4738 		/*
4739 		 * Get CPUID data about TSC Invariance in Deep C-State.
4740 		 */
4741 		switch (cpi->cpi_vendor) {
4742 		case X86_VENDOR_Intel:
4743 		case X86_VENDOR_AMD:
4744 		case X86_VENDOR_HYGON:
4745 			if (cpi->cpi_maxeax >= 7) {
4746 				cp = &cpi->cpi_extd[7];
4747 				cp->cp_eax = 0x80000007;
4748 				cp->cp_ecx = 0;
4749 				(void) __cpuid_insn(cp);
4750 			}
4751 			break;
4752 		default:
4753 			break;
4754 		}
4755 	}
4756 
4757 	/*
4758 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4759 	 * run and thus gathered some of its dependent leaves.
4760 	 */
4761 	cpuid_basic_topology(cpu, featureset);
4762 	cpuid_basic_thermal(cpu, featureset);
4763 #if !defined(__xpv)
4764 	cpuid_basic_ppin(cpu, featureset);
4765 #endif
4766 
4767 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4768 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4769 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4770 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4771 			/* Special handling for AMD FP not necessary. */
4772 			cpi->cpi_fp_amd_save = 0;
4773 		} else {
4774 			cpi->cpi_fp_amd_save = 1;
4775 		}
4776 	}
4777 
4778 	/*
4779 	 * Check (and potentially set) if lfence is serializing.
4780 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4781 	 */
4782 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4783 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4784 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4785 		/*
4786 		 * The AMD white paper Software Techniques For Managing
4787 		 * Speculation on AMD Processors details circumstances for when
4788 		 * lfence instructions are serializing.
4789 		 *
4790 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4791 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4792 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4793 		 * committed to supporting that MSR on all later CPUs.
4794 		 */
4795 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4796 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4797 		} else if (cpi->cpi_family >= 0x10) {
4798 #if !defined(__xpv)
4799 			uint64_t val;
4800 
4801 			/*
4802 			 * Be careful when attempting to enable the bit, and
4803 			 * verify that it was actually set in case we are
4804 			 * running in a hypervisor which is less than faithful
4805 			 * about its emulation of this feature.
4806 			 */
4807 			on_trap_data_t otd;
4808 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4809 				val = rdmsr(MSR_AMD_DE_CFG);
4810 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4811 				wrmsr(MSR_AMD_DE_CFG, val);
4812 				val = rdmsr(MSR_AMD_DE_CFG);
4813 			} else {
4814 				val = 0;
4815 			}
4816 			no_trap();
4817 
4818 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4819 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4820 			}
4821 #endif
4822 		}
4823 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4824 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4825 		/*
4826 		 * Documentation and other OSes indicate that lfence is always
4827 		 * serializing on Intel CPUs.
4828 		 */
4829 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4830 	}
4831 
4832 
4833 	/*
4834 	 * Check the processor leaves that are used for security features. Grab
4835 	 * any additional processor-specific leaves that we may not have yet.
4836 	 */
4837 	switch (cpi->cpi_vendor) {
4838 	case X86_VENDOR_AMD:
4839 	case X86_VENDOR_HYGON:
4840 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4841 			cp = &cpi->cpi_extd[0x21];
4842 			cp->cp_eax = CPUID_LEAF_EXT_21;
4843 			cp->cp_ecx = 0;
4844 			(void) __cpuid_insn(cp);
4845 		}
4846 		break;
4847 	default:
4848 		break;
4849 	}
4850 
4851 	cpuid_scan_security(cpu, featureset);
4852 }
4853 
4854 /*
4855  * Make copies of the cpuid table entries we depend on, in
4856  * part for ease of parsing now, in part so that we have only
4857  * one place to correct any of it, in part for ease of
4858  * later export to userland, and in part so we can look at
4859  * this stuff in a crash dump.
4860  */
4861 
4862 static void
cpuid_pass_extended(cpu_t * cpu,void * _arg __unused)4863 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4864 {
4865 	uint_t n, nmax;
4866 	int i;
4867 	struct cpuid_regs *cp;
4868 	uint8_t *dp;
4869 	uint32_t *iptr;
4870 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4871 
4872 	if (cpi->cpi_maxeax < 1)
4873 		return;
4874 
4875 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4876 		nmax = NMAX_CPI_STD;
4877 	/*
4878 	 * (We already handled n == 0 and n == 1 in the basic pass)
4879 	 */
4880 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4881 		/*
4882 		 * leaves 6 and 7 were handled in the basic pass
4883 		 */
4884 		if (n == 6 || n == 7)
4885 			continue;
4886 
4887 		cp->cp_eax = n;
4888 
4889 		/*
4890 		 * CPUID function 4 expects %ecx to be initialized
4891 		 * with an index which indicates which cache to return
4892 		 * information about. The OS is expected to call function 4
4893 		 * with %ecx set to 0, 1, 2, ... until it returns with
4894 		 * EAX[4:0] set to 0, which indicates there are no more
4895 		 * caches.
4896 		 *
4897 		 * Here, populate cpi_std[4] with the information returned by
4898 		 * function 4 when %ecx == 0, and do the rest in a later pass
4899 		 * when dynamic memory allocation becomes available.
4900 		 *
4901 		 * Note: we need to explicitly initialize %ecx here, since
4902 		 * function 4 may have been previously invoked.
4903 		 */
4904 		if (n == 4)
4905 			cp->cp_ecx = 0;
4906 
4907 		(void) __cpuid_insn(cp);
4908 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4909 		switch (n) {
4910 		case 2:
4911 			/*
4912 			 * "the lower 8 bits of the %eax register
4913 			 * contain a value that identifies the number
4914 			 * of times the cpuid [instruction] has to be
4915 			 * executed to obtain a complete image of the
4916 			 * processor's caching systems."
4917 			 *
4918 			 * How *do* they make this stuff up?
4919 			 */
4920 			cpi->cpi_ncache = sizeof (*cp) *
4921 			    BITX(cp->cp_eax, 7, 0);
4922 			if (cpi->cpi_ncache == 0)
4923 				break;
4924 			cpi->cpi_ncache--;	/* skip count byte */
4925 
4926 			/*
4927 			 * Well, for now, rather than attempt to implement
4928 			 * this slightly dubious algorithm, we just look
4929 			 * at the first 15 ..
4930 			 */
4931 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4932 				cpi->cpi_ncache = sizeof (*cp) - 1;
4933 
4934 			dp = cpi->cpi_cacheinfo;
4935 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4936 				uint8_t *p = (void *)&cp->cp_eax;
4937 				for (i = 1; i < 4; i++)
4938 					if (p[i] != 0)
4939 						*dp++ = p[i];
4940 			}
4941 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4942 				uint8_t *p = (void *)&cp->cp_ebx;
4943 				for (i = 0; i < 4; i++)
4944 					if (p[i] != 0)
4945 						*dp++ = p[i];
4946 			}
4947 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4948 				uint8_t *p = (void *)&cp->cp_ecx;
4949 				for (i = 0; i < 4; i++)
4950 					if (p[i] != 0)
4951 						*dp++ = p[i];
4952 			}
4953 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4954 				uint8_t *p = (void *)&cp->cp_edx;
4955 				for (i = 0; i < 4; i++)
4956 					if (p[i] != 0)
4957 						*dp++ = p[i];
4958 			}
4959 			break;
4960 
4961 		case 3:	/* Processor serial number, if PSN supported */
4962 			break;
4963 
4964 		case 4:	/* Deterministic cache parameters */
4965 			break;
4966 
4967 		case 5:	/* Monitor/Mwait parameters */
4968 		{
4969 			size_t mwait_size;
4970 
4971 			/*
4972 			 * check cpi_mwait.support which was set in
4973 			 * cpuid_pass_basic()
4974 			 */
4975 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4976 				break;
4977 
4978 			/*
4979 			 * Protect ourself from insane mwait line size.
4980 			 * Workaround for incomplete hardware emulator(s).
4981 			 */
4982 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4983 			if (mwait_size < sizeof (uint32_t) ||
4984 			    !ISP2(mwait_size)) {
4985 #if DEBUG
4986 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4987 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4988 #endif
4989 				break;
4990 			}
4991 
4992 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4993 			cpi->cpi_mwait.mon_max = mwait_size;
4994 			if (MWAIT_EXTENSION(cpi)) {
4995 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4996 				if (MWAIT_INT_ENABLE(cpi))
4997 					cpi->cpi_mwait.support |=
4998 					    MWAIT_ECX_INT_ENABLE;
4999 			}
5000 			break;
5001 		}
5002 		default:
5003 			break;
5004 		}
5005 	}
5006 
5007 	/*
5008 	 * XSAVE enumeration
5009 	 */
5010 	if (cpi->cpi_maxeax >= 0xD) {
5011 		struct cpuid_regs regs;
5012 		boolean_t cpuid_d_valid = B_TRUE;
5013 
5014 		cp = &regs;
5015 		cp->cp_eax = 0xD;
5016 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
5017 
5018 		(void) __cpuid_insn(cp);
5019 
5020 		/*
5021 		 * Sanity checks for debug
5022 		 */
5023 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
5024 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
5025 			cpuid_d_valid = B_FALSE;
5026 		}
5027 
5028 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
5029 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
5030 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
5031 
5032 		/*
5033 		 * If the hw supports AVX, get the size and offset in the save
5034 		 * area for the ymm state.
5035 		 */
5036 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
5037 			cp->cp_eax = 0xD;
5038 			cp->cp_ecx = 2;
5039 			cp->cp_edx = cp->cp_ebx = 0;
5040 
5041 			(void) __cpuid_insn(cp);
5042 
5043 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
5044 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
5045 				cpuid_d_valid = B_FALSE;
5046 			}
5047 
5048 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
5049 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
5050 		}
5051 
5052 		/*
5053 		 * If the hw supports MPX, get the size and offset in the
5054 		 * save area for BNDREGS and BNDCSR.
5055 		 */
5056 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
5057 			cp->cp_eax = 0xD;
5058 			cp->cp_ecx = 3;
5059 			cp->cp_edx = cp->cp_ebx = 0;
5060 
5061 			(void) __cpuid_insn(cp);
5062 
5063 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
5064 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
5065 
5066 			cp->cp_eax = 0xD;
5067 			cp->cp_ecx = 4;
5068 			cp->cp_edx = cp->cp_ebx = 0;
5069 
5070 			(void) __cpuid_insn(cp);
5071 
5072 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
5073 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
5074 		}
5075 
5076 		/*
5077 		 * If the hw supports AVX512, get the size and offset in the
5078 		 * save area for the opmask registers and zmm state.
5079 		 */
5080 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
5081 			cp->cp_eax = 0xD;
5082 			cp->cp_ecx = 5;
5083 			cp->cp_edx = cp->cp_ebx = 0;
5084 
5085 			(void) __cpuid_insn(cp);
5086 
5087 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
5088 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
5089 
5090 			cp->cp_eax = 0xD;
5091 			cp->cp_ecx = 6;
5092 			cp->cp_edx = cp->cp_ebx = 0;
5093 
5094 			(void) __cpuid_insn(cp);
5095 
5096 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
5097 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
5098 
5099 			cp->cp_eax = 0xD;
5100 			cp->cp_ecx = 7;
5101 			cp->cp_edx = cp->cp_ebx = 0;
5102 
5103 			(void) __cpuid_insn(cp);
5104 
5105 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
5106 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
5107 		}
5108 
5109 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
5110 			xsave_state_size = 0;
5111 		} else if (cpuid_d_valid) {
5112 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
5113 		} else {
5114 			/* Broken CPUID 0xD, probably in HVM */
5115 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
5116 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
5117 			    ", ymm_size = %d, ymm_offset = %d\n",
5118 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
5119 			    cpi->cpi_xsave.xsav_hw_features_high,
5120 			    (int)cpi->cpi_xsave.xsav_max_size,
5121 			    (int)cpi->cpi_xsave.ymm_size,
5122 			    (int)cpi->cpi_xsave.ymm_offset);
5123 
5124 			if (xsave_state_size != 0) {
5125 				/*
5126 				 * This must be a non-boot CPU. We cannot
5127 				 * continue, because boot cpu has already
5128 				 * enabled XSAVE.
5129 				 */
5130 				ASSERT(cpu->cpu_id != 0);
5131 				cmn_err(CE_PANIC, "cpu%d: we have already "
5132 				    "enabled XSAVE on boot cpu, cannot "
5133 				    "continue.", cpu->cpu_id);
5134 			} else {
5135 				/*
5136 				 * If we reached here on the boot CPU, it's also
5137 				 * almost certain that we'll reach here on the
5138 				 * non-boot CPUs. When we're here on a boot CPU
5139 				 * we should disable the feature, on a non-boot
5140 				 * CPU we need to confirm that we have.
5141 				 */
5142 				if (cpu->cpu_id == 0) {
5143 					remove_x86_feature(x86_featureset,
5144 					    X86FSET_XSAVE);
5145 					remove_x86_feature(x86_featureset,
5146 					    X86FSET_AVX);
5147 					remove_x86_feature(x86_featureset,
5148 					    X86FSET_F16C);
5149 					remove_x86_feature(x86_featureset,
5150 					    X86FSET_BMI1);
5151 					remove_x86_feature(x86_featureset,
5152 					    X86FSET_BMI2);
5153 					remove_x86_feature(x86_featureset,
5154 					    X86FSET_FMA);
5155 					remove_x86_feature(x86_featureset,
5156 					    X86FSET_AVX2);
5157 					remove_x86_feature(x86_featureset,
5158 					    X86FSET_MPX);
5159 					remove_x86_feature(x86_featureset,
5160 					    X86FSET_AVX512F);
5161 					remove_x86_feature(x86_featureset,
5162 					    X86FSET_AVX512DQ);
5163 					remove_x86_feature(x86_featureset,
5164 					    X86FSET_AVX512PF);
5165 					remove_x86_feature(x86_featureset,
5166 					    X86FSET_AVX512ER);
5167 					remove_x86_feature(x86_featureset,
5168 					    X86FSET_AVX512CD);
5169 					remove_x86_feature(x86_featureset,
5170 					    X86FSET_AVX512BW);
5171 					remove_x86_feature(x86_featureset,
5172 					    X86FSET_AVX512VL);
5173 					remove_x86_feature(x86_featureset,
5174 					    X86FSET_AVX512FMA);
5175 					remove_x86_feature(x86_featureset,
5176 					    X86FSET_AVX512VBMI);
5177 					remove_x86_feature(x86_featureset,
5178 					    X86FSET_AVX512VNNI);
5179 					remove_x86_feature(x86_featureset,
5180 					    X86FSET_AVX512VPOPCDQ);
5181 					remove_x86_feature(x86_featureset,
5182 					    X86FSET_AVX512NNIW);
5183 					remove_x86_feature(x86_featureset,
5184 					    X86FSET_AVX512FMAPS);
5185 					remove_x86_feature(x86_featureset,
5186 					    X86FSET_VAES);
5187 					remove_x86_feature(x86_featureset,
5188 					    X86FSET_VPCLMULQDQ);
5189 					remove_x86_feature(x86_featureset,
5190 					    X86FSET_GFNI);
5191 					remove_x86_feature(x86_featureset,
5192 					    X86FSET_AVX512_VP2INT);
5193 					remove_x86_feature(x86_featureset,
5194 					    X86FSET_AVX512_BITALG);
5195 					remove_x86_feature(x86_featureset,
5196 					    X86FSET_AVX512_VBMI2);
5197 					remove_x86_feature(x86_featureset,
5198 					    X86FSET_AVX512_BF16);
5199 
5200 					xsave_force_disable = B_TRUE;
5201 				} else {
5202 					VERIFY(is_x86_feature(x86_featureset,
5203 					    X86FSET_XSAVE) == B_FALSE);
5204 				}
5205 			}
5206 		}
5207 	}
5208 
5209 
5210 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5211 		return;
5212 
5213 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5214 		nmax = NMAX_CPI_EXTD;
5215 	/*
5216 	 * Copy the extended properties, fixing them as we go. While we start at
5217 	 * 2 because we've already handled a few cases in the basic pass, the
5218 	 * rest we let ourselves just grab again (e.g. 0x8, 0x21).
5219 	 */
5220 	iptr = (void *)cpi->cpi_brandstr;
5221 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5222 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5223 		(void) __cpuid_insn(cp);
5224 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5225 		    cp);
5226 		switch (n) {
5227 		case 2:
5228 		case 3:
5229 		case 4:
5230 			/*
5231 			 * Extract the brand string
5232 			 */
5233 			*iptr++ = cp->cp_eax;
5234 			*iptr++ = cp->cp_ebx;
5235 			*iptr++ = cp->cp_ecx;
5236 			*iptr++ = cp->cp_edx;
5237 			break;
5238 		case 5:
5239 			switch (cpi->cpi_vendor) {
5240 			case X86_VENDOR_AMD:
5241 				/*
5242 				 * The Athlon and Duron were the first
5243 				 * parts to report the sizes of the
5244 				 * TLB for large pages. Before then,
5245 				 * we don't trust the data.
5246 				 */
5247 				if (cpi->cpi_family < 6 ||
5248 				    (cpi->cpi_family == 6 &&
5249 				    cpi->cpi_model < 1))
5250 					cp->cp_eax = 0;
5251 				break;
5252 			default:
5253 				break;
5254 			}
5255 			break;
5256 		case 6:
5257 			switch (cpi->cpi_vendor) {
5258 			case X86_VENDOR_AMD:
5259 				/*
5260 				 * The Athlon and Duron were the first
5261 				 * AMD parts with L2 TLB's.
5262 				 * Before then, don't trust the data.
5263 				 */
5264 				if (cpi->cpi_family < 6 ||
5265 				    (cpi->cpi_family == 6 &&
5266 				    cpi->cpi_model < 1))
5267 					cp->cp_eax = cp->cp_ebx = 0;
5268 				/*
5269 				 * AMD Duron rev A0 reports L2
5270 				 * cache size incorrectly as 1K
5271 				 * when it is really 64K
5272 				 */
5273 				if (cpi->cpi_family == 6 &&
5274 				    cpi->cpi_model == 3 &&
5275 				    cpi->cpi_step == 0) {
5276 					cp->cp_ecx &= 0xffff;
5277 					cp->cp_ecx |= 0x400000;
5278 				}
5279 				break;
5280 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5281 				/*
5282 				 * VIA C3 processors are a bit messed
5283 				 * up w.r.t. encoding cache sizes in %ecx
5284 				 */
5285 				if (cpi->cpi_family != 6)
5286 					break;
5287 				/*
5288 				 * model 7 and 8 were incorrectly encoded
5289 				 *
5290 				 * xxx is model 8 really broken?
5291 				 */
5292 				if (cpi->cpi_model == 7 ||
5293 				    cpi->cpi_model == 8)
5294 					cp->cp_ecx =
5295 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5296 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5297 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5298 					    BITX(cp->cp_ecx, 7, 0);
5299 				/*
5300 				 * model 9 stepping 1 has wrong associativity
5301 				 */
5302 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5303 					cp->cp_ecx |= 8 << 12;
5304 				break;
5305 			case X86_VENDOR_Intel:
5306 				/*
5307 				 * Extended L2 Cache features function.
5308 				 * First appeared on Prescott.
5309 				 */
5310 			default:
5311 				break;
5312 			}
5313 			break;
5314 		default:
5315 			break;
5316 		}
5317 	}
5318 }
5319 
5320 static const char *
intel_cpubrand(const struct cpuid_info * cpi)5321 intel_cpubrand(const struct cpuid_info *cpi)
5322 {
5323 	int i;
5324 
5325 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5326 
5327 	switch (cpi->cpi_family) {
5328 	case 5:
5329 		return ("Intel Pentium(r)");
5330 	case 6:
5331 		switch (cpi->cpi_model) {
5332 			uint_t celeron, xeon;
5333 			const struct cpuid_regs *cp;
5334 		case 0:
5335 		case 1:
5336 		case 2:
5337 			return ("Intel Pentium(r) Pro");
5338 		case 3:
5339 		case 4:
5340 			return ("Intel Pentium(r) II");
5341 		case 6:
5342 			return ("Intel Celeron(r)");
5343 		case 5:
5344 		case 7:
5345 			celeron = xeon = 0;
5346 			cp = &cpi->cpi_std[2];	/* cache info */
5347 
5348 			for (i = 1; i < 4; i++) {
5349 				uint_t tmp;
5350 
5351 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5352 				if (tmp == 0x40)
5353 					celeron++;
5354 				if (tmp >= 0x44 && tmp <= 0x45)
5355 					xeon++;
5356 			}
5357 
5358 			for (i = 0; i < 2; i++) {
5359 				uint_t tmp;
5360 
5361 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5362 				if (tmp == 0x40)
5363 					celeron++;
5364 				else if (tmp >= 0x44 && tmp <= 0x45)
5365 					xeon++;
5366 			}
5367 
5368 			for (i = 0; i < 4; i++) {
5369 				uint_t tmp;
5370 
5371 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5372 				if (tmp == 0x40)
5373 					celeron++;
5374 				else if (tmp >= 0x44 && tmp <= 0x45)
5375 					xeon++;
5376 			}
5377 
5378 			for (i = 0; i < 4; i++) {
5379 				uint_t tmp;
5380 
5381 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5382 				if (tmp == 0x40)
5383 					celeron++;
5384 				else if (tmp >= 0x44 && tmp <= 0x45)
5385 					xeon++;
5386 			}
5387 
5388 			if (celeron)
5389 				return ("Intel Celeron(r)");
5390 			if (xeon)
5391 				return (cpi->cpi_model == 5 ?
5392 				    "Intel Pentium(r) II Xeon(tm)" :
5393 				    "Intel Pentium(r) III Xeon(tm)");
5394 			return (cpi->cpi_model == 5 ?
5395 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5396 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5397 		default:
5398 			break;
5399 		}
5400 	default:
5401 		break;
5402 	}
5403 
5404 	/* BrandID is present if the field is nonzero */
5405 	if (cpi->cpi_brandid != 0) {
5406 		static const struct {
5407 			uint_t bt_bid;
5408 			const char *bt_str;
5409 		} brand_tbl[] = {
5410 			{ 0x1,	"Intel(r) Celeron(r)" },
5411 			{ 0x2,	"Intel(r) Pentium(r) III" },
5412 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5413 			{ 0x4,	"Intel(r) Pentium(r) III" },
5414 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5415 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5416 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5417 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5418 			{ 0xa,	"Intel(r) Celeron(r)" },
5419 			{ 0xb,	"Intel(r) Xeon(tm)" },
5420 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5421 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5422 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5423 			{ 0x11, "Mobile Genuine Intel(r)" },
5424 			{ 0x12, "Intel(r) Celeron(r) M" },
5425 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5426 			{ 0x14, "Intel(r) Celeron(r)" },
5427 			{ 0x15, "Mobile Genuine Intel(r)" },
5428 			{ 0x16,	"Intel(r) Pentium(r) M" },
5429 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5430 		};
5431 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5432 		uint_t sgn;
5433 
5434 		sgn = (cpi->cpi_family << 8) |
5435 		    (cpi->cpi_model << 4) | cpi->cpi_step;
5436 
5437 		for (i = 0; i < btblmax; i++)
5438 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5439 				break;
5440 		if (i < btblmax) {
5441 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5442 				return ("Intel(r) Celeron(r)");
5443 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5444 				return ("Intel(r) Xeon(tm) MP");
5445 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5446 				return ("Intel(r) Xeon(tm)");
5447 			return (brand_tbl[i].bt_str);
5448 		}
5449 	}
5450 
5451 	return (NULL);
5452 }
5453 
5454 static const char *
amd_cpubrand(const struct cpuid_info * cpi)5455 amd_cpubrand(const struct cpuid_info *cpi)
5456 {
5457 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5458 
5459 	switch (cpi->cpi_family) {
5460 	case 5:
5461 		switch (cpi->cpi_model) {
5462 		case 0:
5463 		case 1:
5464 		case 2:
5465 		case 3:
5466 		case 4:
5467 		case 5:
5468 			return ("AMD-K5(r)");
5469 		case 6:
5470 		case 7:
5471 			return ("AMD-K6(r)");
5472 		case 8:
5473 			return ("AMD-K6(r)-2");
5474 		case 9:
5475 			return ("AMD-K6(r)-III");
5476 		default:
5477 			return ("AMD (family 5)");
5478 		}
5479 	case 6:
5480 		switch (cpi->cpi_model) {
5481 		case 1:
5482 			return ("AMD-K7(tm)");
5483 		case 0:
5484 		case 2:
5485 		case 4:
5486 			return ("AMD Athlon(tm)");
5487 		case 3:
5488 		case 7:
5489 			return ("AMD Duron(tm)");
5490 		case 6:
5491 		case 8:
5492 		case 10:
5493 			/*
5494 			 * Use the L2 cache size to distinguish
5495 			 */
5496 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5497 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5498 		default:
5499 			return ("AMD (family 6)");
5500 		}
5501 	default:
5502 		break;
5503 	}
5504 
5505 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5506 	    cpi->cpi_brandid != 0) {
5507 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5508 		case 3:
5509 			return ("AMD Opteron(tm) UP 1xx");
5510 		case 4:
5511 			return ("AMD Opteron(tm) DP 2xx");
5512 		case 5:
5513 			return ("AMD Opteron(tm) MP 8xx");
5514 		default:
5515 			return ("AMD Opteron(tm)");
5516 		}
5517 	}
5518 
5519 	return (NULL);
5520 }
5521 
5522 static const char *
cyrix_cpubrand(struct cpuid_info * cpi,uint_t type)5523 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5524 {
5525 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5526 
5527 	switch (type) {
5528 	case X86_TYPE_CYRIX_6x86:
5529 		return ("Cyrix 6x86");
5530 	case X86_TYPE_CYRIX_6x86L:
5531 		return ("Cyrix 6x86L");
5532 	case X86_TYPE_CYRIX_6x86MX:
5533 		return ("Cyrix 6x86MX");
5534 	case X86_TYPE_CYRIX_GXm:
5535 		return ("Cyrix GXm");
5536 	case X86_TYPE_CYRIX_MediaGX:
5537 		return ("Cyrix MediaGX");
5538 	case X86_TYPE_CYRIX_MII:
5539 		return ("Cyrix M2");
5540 	case X86_TYPE_VIA_CYRIX_III:
5541 		return ("VIA Cyrix M3");
5542 	default:
5543 		/*
5544 		 * Have another wild guess ..
5545 		 */
5546 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5547 			return ("Cyrix 5x86");
5548 		else if (cpi->cpi_family == 5) {
5549 			switch (cpi->cpi_model) {
5550 			case 2:
5551 				return ("Cyrix 6x86");	/* Cyrix M1 */
5552 			case 4:
5553 				return ("Cyrix MediaGX");
5554 			default:
5555 				break;
5556 			}
5557 		} else if (cpi->cpi_family == 6) {
5558 			switch (cpi->cpi_model) {
5559 			case 0:
5560 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5561 			case 5:
5562 			case 6:
5563 			case 7:
5564 			case 8:
5565 			case 9:
5566 				return ("VIA C3");
5567 			default:
5568 				break;
5569 			}
5570 		}
5571 		break;
5572 	}
5573 	return (NULL);
5574 }
5575 
5576 /*
5577  * This only gets called in the case that the CPU extended
5578  * feature brand string (0x80000002, 0x80000003, 0x80000004)
5579  * aren't available, or contain null bytes for some reason.
5580  */
5581 static void
fabricate_brandstr(struct cpuid_info * cpi)5582 fabricate_brandstr(struct cpuid_info *cpi)
5583 {
5584 	const char *brand = NULL;
5585 
5586 	switch (cpi->cpi_vendor) {
5587 	case X86_VENDOR_Intel:
5588 		brand = intel_cpubrand(cpi);
5589 		break;
5590 	case X86_VENDOR_AMD:
5591 		brand = amd_cpubrand(cpi);
5592 		break;
5593 	case X86_VENDOR_Cyrix:
5594 		brand = cyrix_cpubrand(cpi, x86_type);
5595 		break;
5596 	case X86_VENDOR_NexGen:
5597 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5598 			brand = "NexGen Nx586";
5599 		break;
5600 	case X86_VENDOR_Centaur:
5601 		if (cpi->cpi_family == 5)
5602 			switch (cpi->cpi_model) {
5603 			case 4:
5604 				brand = "Centaur C6";
5605 				break;
5606 			case 8:
5607 				brand = "Centaur C2";
5608 				break;
5609 			case 9:
5610 				brand = "Centaur C3";
5611 				break;
5612 			default:
5613 				break;
5614 			}
5615 		break;
5616 	case X86_VENDOR_Rise:
5617 		if (cpi->cpi_family == 5 &&
5618 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5619 			brand = "Rise mP6";
5620 		break;
5621 	case X86_VENDOR_SiS:
5622 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5623 			brand = "SiS 55x";
5624 		break;
5625 	case X86_VENDOR_TM:
5626 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5627 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5628 		break;
5629 	case X86_VENDOR_NSC:
5630 	case X86_VENDOR_UMC:
5631 	default:
5632 		break;
5633 	}
5634 	if (brand) {
5635 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5636 		return;
5637 	}
5638 
5639 	/*
5640 	 * If all else fails ...
5641 	 */
5642 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5643 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5644 	    cpi->cpi_model, cpi->cpi_step);
5645 }
5646 
5647 /*
5648  * This routine is called just after kernel memory allocation
5649  * becomes available on cpu0, and as part of mp_startup() on
5650  * the other cpus.
5651  *
5652  * Fixup the brand string, and collect any information from cpuid
5653  * that requires dynamically allocated storage to represent.
5654  */
5655 
5656 static void
cpuid_pass_dynamic(cpu_t * cpu,void * _arg __unused)5657 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5658 {
5659 	int	i, max, shft, level, size;
5660 	struct cpuid_regs regs;
5661 	struct cpuid_regs *cp;
5662 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5663 
5664 	/*
5665 	 * Deterministic cache parameters
5666 	 *
5667 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5668 	 * values that are present are currently defined to be the same. This
5669 	 * means we can use the same logic to parse it as long as we use the
5670 	 * appropriate leaf to get the data. If you're updating this, make sure
5671 	 * you're careful about which vendor supports which aspect.
5672 	 *
5673 	 * Take this opportunity to detect the number of threads sharing the
5674 	 * last level cache, and construct a corresponding cache id. The
5675 	 * respective cpuid_info members are initialized to the default case of
5676 	 * "no last level cache sharing".
5677 	 */
5678 	cpi->cpi_ncpu_shr_last_cache = 1;
5679 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5680 
5681 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5682 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5683 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5684 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5685 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5686 		uint32_t leaf;
5687 
5688 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5689 			leaf = 4;
5690 		} else {
5691 			leaf = CPUID_LEAF_EXT_1d;
5692 		}
5693 
5694 		/*
5695 		 * Find the # of elements (size) returned by the leaf and along
5696 		 * the way detect last level cache sharing details.
5697 		 */
5698 		bzero(&regs, sizeof (regs));
5699 		cp = &regs;
5700 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5701 			cp->cp_eax = leaf;
5702 			cp->cp_ecx = i;
5703 
5704 			(void) __cpuid_insn(cp);
5705 
5706 			if (CPI_CACHE_TYPE(cp) == 0)
5707 				break;
5708 			level = CPI_CACHE_LVL(cp);
5709 			if (level > max) {
5710 				max = level;
5711 				cpi->cpi_ncpu_shr_last_cache =
5712 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5713 			}
5714 		}
5715 		cpi->cpi_cache_leaf_size = size = i;
5716 
5717 		/*
5718 		 * Allocate the cpi_cache_leaves array. The first element
5719 		 * references the regs for the corresponding leaf with %ecx set
5720 		 * to 0. This was gathered in cpuid_pass_extended().
5721 		 */
5722 		if (size > 0) {
5723 			cpi->cpi_cache_leaves =
5724 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5725 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5726 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5727 			} else {
5728 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5729 			}
5730 
5731 			/*
5732 			 * Allocate storage to hold the additional regs
5733 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5734 			 *
5735 			 * The regs for the leaf, %ecx == 0 has already
5736 			 * been allocated as indicated above.
5737 			 */
5738 			for (i = 1; i < size; i++) {
5739 				cp = cpi->cpi_cache_leaves[i] =
5740 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5741 				cp->cp_eax = leaf;
5742 				cp->cp_ecx = i;
5743 
5744 				(void) __cpuid_insn(cp);
5745 			}
5746 		}
5747 		/*
5748 		 * Determine the number of bits needed to represent
5749 		 * the number of CPUs sharing the last level cache.
5750 		 *
5751 		 * Shift off that number of bits from the APIC id to
5752 		 * derive the cache id.
5753 		 */
5754 		shft = 0;
5755 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5756 			shft++;
5757 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5758 	}
5759 
5760 	/*
5761 	 * Now fixup the brand string
5762 	 */
5763 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5764 		fabricate_brandstr(cpi);
5765 	} else {
5766 
5767 		/*
5768 		 * If we successfully extracted a brand string from the cpuid
5769 		 * instruction, clean it up by removing leading spaces and
5770 		 * similar junk.
5771 		 */
5772 		if (cpi->cpi_brandstr[0]) {
5773 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5774 			char *src, *dst;
5775 
5776 			dst = src = (char *)cpi->cpi_brandstr;
5777 			src[maxlen - 1] = '\0';
5778 			/*
5779 			 * strip leading spaces
5780 			 */
5781 			while (*src == ' ')
5782 				src++;
5783 			/*
5784 			 * Remove any 'Genuine' or "Authentic" prefixes
5785 			 */
5786 			if (strncmp(src, "Genuine ", 8) == 0)
5787 				src += 8;
5788 			if (strncmp(src, "Authentic ", 10) == 0)
5789 				src += 10;
5790 
5791 			/*
5792 			 * Now do an in-place copy.
5793 			 * Map (R) to (r) and (TM) to (tm).
5794 			 * The era of teletypes is long gone, and there's
5795 			 * -really- no need to shout.
5796 			 */
5797 			while (*src != '\0') {
5798 				if (src[0] == '(') {
5799 					if (strncmp(src + 1, "R)", 2) == 0) {
5800 						(void) strncpy(dst, "(r)", 3);
5801 						src += 3;
5802 						dst += 3;
5803 						continue;
5804 					}
5805 					if (strncmp(src + 1, "TM)", 3) == 0) {
5806 						(void) strncpy(dst, "(tm)", 4);
5807 						src += 4;
5808 						dst += 4;
5809 						continue;
5810 					}
5811 				}
5812 				*dst++ = *src++;
5813 			}
5814 			*dst = '\0';
5815 
5816 			/*
5817 			 * Finally, remove any trailing spaces
5818 			 */
5819 			while (--dst > cpi->cpi_brandstr)
5820 				if (*dst == ' ')
5821 					*dst = '\0';
5822 				else
5823 					break;
5824 		} else
5825 			fabricate_brandstr(cpi);
5826 	}
5827 }
5828 
5829 typedef struct {
5830 	uint32_t avm_av;
5831 	uint32_t avm_feat;
5832 } av_feat_map_t;
5833 
5834 /*
5835  * These arrays are used to map features that we should add based on x86
5836  * features that are present. As a large number depend on kernel features,
5837  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5838  * There is an array of these for each hwcap word. Some features aren't tracked
5839  * in the kernel x86 featureset and that's ok. They will not show up in here.
5840  */
5841 static const av_feat_map_t x86fset_to_av1[] = {
5842 	{ AV_386_CX8, X86FSET_CX8 },
5843 	{ AV_386_SEP, X86FSET_SEP },
5844 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5845 	{ AV_386_CMOV, X86FSET_CMOV },
5846 	{ AV_386_FXSR, X86FSET_SSE },
5847 	{ AV_386_SSE, X86FSET_SSE },
5848 	{ AV_386_SSE2, X86FSET_SSE2 },
5849 	{ AV_386_SSE3, X86FSET_SSE3 },
5850 	{ AV_386_CX16, X86FSET_CX16 },
5851 	{ AV_386_TSCP, X86FSET_TSCP },
5852 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5853 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5854 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5855 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5856 	{ AV_386_AES, X86FSET_AES },
5857 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5858 	{ AV_386_XSAVE, X86FSET_XSAVE },
5859 	{ AV_386_AVX, X86FSET_AVX },
5860 	{ AV_386_VMX, X86FSET_VMX },
5861 	{ AV_386_AMD_SVM, X86FSET_SVM }
5862 };
5863 
5864 static const av_feat_map_t x86fset_to_av2[] = {
5865 	{ AV_386_2_F16C, X86FSET_F16C },
5866 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5867 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5868 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5869 	{ AV_386_2_FMA, X86FSET_FMA },
5870 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5871 	{ AV_386_2_ADX, X86FSET_ADX },
5872 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5873 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5874 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5875 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5876 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5877 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5878 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5879 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5880 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5881 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5882 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5883 	{ AV_386_2_SHA, X86FSET_SHA },
5884 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5885 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5886 	{ AV_386_2_CLWB, X86FSET_CLWB },
5887 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5888 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5889 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5890 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5891 	{ AV_386_2_VAES, X86FSET_VAES },
5892 	{ AV_386_2_GFNI, X86FSET_GFNI },
5893 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5894 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5895 };
5896 
5897 static const av_feat_map_t x86fset_to_av3[] = {
5898 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5899 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5900 };
5901 
5902 /*
5903  * This routine is called out of bind_hwcap() much later in the life
5904  * of the kernel (post_startup()).  The job of this routine is to resolve
5905  * the hardware feature support and kernel support for those features into
5906  * what we're actually going to tell applications via the aux vector.
5907  *
5908  * Most of the aux vector is derived from the x86_featureset array vector where
5909  * a given feature indicates that an aux vector should be plumbed through. This
5910  * allows the kernel to use one tracking mechanism for these based on whether or
5911  * not it has the required hardware support (most often xsave). Most newer
5912  * features are added there in case we need them in the kernel. Otherwise,
5913  * features are evaluated based on looking at the cpuid features that remain. If
5914  * you find yourself wanting to clear out cpuid features for some reason, they
5915  * should instead be driven by the feature set so we have a consistent view.
5916  */
5917 
5918 static void
cpuid_pass_resolve(cpu_t * cpu,void * arg)5919 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5920 {
5921 	uint_t *hwcap_out = (uint_t *)arg;
5922 	struct cpuid_info *cpi;
5923 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5924 
5925 	cpi = cpu->cpu_m.mcpu_cpi;
5926 
5927 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5928 		if (is_x86_feature(x86_featureset,
5929 		    x86fset_to_av1[i].avm_feat)) {
5930 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5931 		}
5932 	}
5933 
5934 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5935 		if (is_x86_feature(x86_featureset,
5936 		    x86fset_to_av2[i].avm_feat)) {
5937 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5938 		}
5939 	}
5940 
5941 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5942 		if (is_x86_feature(x86_featureset,
5943 		    x86fset_to_av3[i].avm_feat)) {
5944 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5945 		}
5946 	}
5947 
5948 	/*
5949 	 * From here on out we're working through features that don't have
5950 	 * corresponding kernel feature flags for various reasons that are
5951 	 * mostly just due to the historical implementation.
5952 	 */
5953 	if (cpi->cpi_maxeax >= 1) {
5954 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5955 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5956 
5957 		*edx = CPI_FEATURES_EDX(cpi);
5958 		*ecx = CPI_FEATURES_ECX(cpi);
5959 
5960 		/*
5961 		 * [no explicit support required beyond x87 fp context]
5962 		 */
5963 		if (!fpu_exists)
5964 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5965 
5966 		/*
5967 		 * Now map the supported feature vector to things that we
5968 		 * think userland will care about.
5969 		 */
5970 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5971 			hwcap_flags |= AV_386_MOVBE;
5972 
5973 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5974 			hwcap_flags |= AV_386_POPCNT;
5975 		if (*edx & CPUID_INTC_EDX_FPU)
5976 			hwcap_flags |= AV_386_FPU;
5977 		if (*edx & CPUID_INTC_EDX_MMX)
5978 			hwcap_flags |= AV_386_MMX;
5979 		if (*edx & CPUID_INTC_EDX_TSC)
5980 			hwcap_flags |= AV_386_TSC;
5981 	}
5982 
5983 	/*
5984 	 * Check a few miscellaneous features.
5985 	 */
5986 	if (cpi->cpi_xmaxeax < 0x80000001)
5987 		goto resolve_done;
5988 
5989 	switch (cpi->cpi_vendor) {
5990 		uint32_t *edx, *ecx;
5991 
5992 	case X86_VENDOR_Intel:
5993 		/*
5994 		 * Seems like Intel duplicated what we necessary
5995 		 * here to make the initial crop of 64-bit OS's work.
5996 		 * Hopefully, those are the only "extended" bits
5997 		 * they'll add.
5998 		 */
5999 		/*FALLTHROUGH*/
6000 
6001 	case X86_VENDOR_AMD:
6002 	case X86_VENDOR_HYGON:
6003 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
6004 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
6005 
6006 		*edx = CPI_FEATURES_XTD_EDX(cpi);
6007 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
6008 
6009 		/*
6010 		 * [no explicit support required beyond
6011 		 * x87 fp context and exception handlers]
6012 		 */
6013 		if (!fpu_exists)
6014 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
6015 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
6016 
6017 		/*
6018 		 * Now map the supported feature vector to
6019 		 * things that we think userland will care about.
6020 		 */
6021 		if (*edx & CPUID_AMD_EDX_MMXamd)
6022 			hwcap_flags |= AV_386_AMD_MMX;
6023 		if (*edx & CPUID_AMD_EDX_3DNow)
6024 			hwcap_flags |= AV_386_AMD_3DNow;
6025 		if (*edx & CPUID_AMD_EDX_3DNowx)
6026 			hwcap_flags |= AV_386_AMD_3DNowx;
6027 
6028 		switch (cpi->cpi_vendor) {
6029 		case X86_VENDOR_AMD:
6030 		case X86_VENDOR_HYGON:
6031 			if (*ecx & CPUID_AMD_ECX_AHF64)
6032 				hwcap_flags |= AV_386_AHF;
6033 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6034 				hwcap_flags |= AV_386_AMD_LZCNT;
6035 			break;
6036 
6037 		case X86_VENDOR_Intel:
6038 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6039 				hwcap_flags |= AV_386_AMD_LZCNT;
6040 			/*
6041 			 * Aarrgh.
6042 			 * Intel uses a different bit in the same word.
6043 			 */
6044 			if (*ecx & CPUID_INTC_ECX_AHF64)
6045 				hwcap_flags |= AV_386_AHF;
6046 			break;
6047 		default:
6048 			break;
6049 		}
6050 		break;
6051 
6052 	default:
6053 		break;
6054 	}
6055 
6056 resolve_done:
6057 	if (hwcap_out != NULL) {
6058 		hwcap_out[0] = hwcap_flags;
6059 		hwcap_out[1] = hwcap_flags_2;
6060 		hwcap_out[2] = hwcap_flags_3;
6061 	}
6062 }
6063 
6064 
6065 /*
6066  * Simulate the cpuid instruction using the data we previously
6067  * captured about this CPU.  We try our best to return the truth
6068  * about the hardware, independently of kernel support.
6069  */
6070 uint32_t
cpuid_insn(cpu_t * cpu,struct cpuid_regs * cp)6071 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
6072 {
6073 	struct cpuid_info *cpi;
6074 	struct cpuid_regs *xcp;
6075 
6076 	if (cpu == NULL)
6077 		cpu = CPU;
6078 	cpi = cpu->cpu_m.mcpu_cpi;
6079 
6080 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6081 
6082 	/*
6083 	 * CPUID data is cached in two separate places: cpi_std for standard
6084 	 * CPUID leaves , and cpi_extd for extended CPUID leaves.
6085 	 */
6086 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
6087 		xcp = &cpi->cpi_std[cp->cp_eax];
6088 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
6089 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
6090 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
6091 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
6092 	} else {
6093 		/*
6094 		 * The caller is asking for data from an input parameter which
6095 		 * the kernel has not cached.  In this case we go fetch from
6096 		 * the hardware and return the data directly to the user.
6097 		 */
6098 		return (__cpuid_insn(cp));
6099 	}
6100 
6101 	cp->cp_eax = xcp->cp_eax;
6102 	cp->cp_ebx = xcp->cp_ebx;
6103 	cp->cp_ecx = xcp->cp_ecx;
6104 	cp->cp_edx = xcp->cp_edx;
6105 	return (cp->cp_eax);
6106 }
6107 
6108 boolean_t
cpuid_checkpass(const cpu_t * const cpu,const cpuid_pass_t pass)6109 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
6110 {
6111 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
6112 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
6113 }
6114 
6115 int
cpuid_getbrandstr(cpu_t * cpu,char * s,size_t n)6116 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
6117 {
6118 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6119 
6120 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
6121 }
6122 
6123 int
cpuid_is_cmt(cpu_t * cpu)6124 cpuid_is_cmt(cpu_t *cpu)
6125 {
6126 	if (cpu == NULL)
6127 		cpu = CPU;
6128 
6129 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6130 
6131 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
6132 }
6133 
6134 /*
6135  * AMD and Intel both implement the 64-bit variant of the syscall
6136  * instruction (syscallq), so if there's -any- support for syscall,
6137  * cpuid currently says "yes, we support this".
6138  *
6139  * However, Intel decided to -not- implement the 32-bit variant of the
6140  * syscall instruction, so we provide a predicate to allow our caller
6141  * to test that subtlety here.
6142  *
6143  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
6144  *	even in the case where the hardware would in fact support it.
6145  */
6146 /*ARGSUSED*/
6147 int
cpuid_syscall32_insn(cpu_t * cpu)6148 cpuid_syscall32_insn(cpu_t *cpu)
6149 {
6150 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
6151 
6152 #if !defined(__xpv)
6153 	if (cpu == NULL)
6154 		cpu = CPU;
6155 
6156 	/*CSTYLED*/
6157 	{
6158 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6159 
6160 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
6161 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
6162 		    cpi->cpi_xmaxeax >= 0x80000001 &&
6163 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
6164 			return (1);
6165 	}
6166 #endif
6167 	return (0);
6168 }
6169 
6170 int
cpuid_getidstr(cpu_t * cpu,char * s,size_t n)6171 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
6172 {
6173 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6174 
6175 	static const char fmt[] =
6176 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
6177 	static const char fmt_ht[] =
6178 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6179 
6180 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6181 
6182 	if (cpuid_is_cmt(cpu))
6183 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6184 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6185 		    cpi->cpi_family, cpi->cpi_model,
6186 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6187 	return (snprintf(s, n, fmt,
6188 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6189 	    cpi->cpi_family, cpi->cpi_model,
6190 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6191 }
6192 
6193 const char *
cpuid_getvendorstr(cpu_t * cpu)6194 cpuid_getvendorstr(cpu_t *cpu)
6195 {
6196 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6197 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6198 }
6199 
6200 uint_t
cpuid_getvendor(cpu_t * cpu)6201 cpuid_getvendor(cpu_t *cpu)
6202 {
6203 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6204 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6205 }
6206 
6207 uint_t
cpuid_getfamily(cpu_t * cpu)6208 cpuid_getfamily(cpu_t *cpu)
6209 {
6210 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6211 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
6212 }
6213 
6214 uint_t
cpuid_getmodel(cpu_t * cpu)6215 cpuid_getmodel(cpu_t *cpu)
6216 {
6217 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6218 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
6219 }
6220 
6221 uint_t
cpuid_get_ncpu_per_chip(cpu_t * cpu)6222 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6223 {
6224 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6225 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6226 }
6227 
6228 uint_t
cpuid_get_ncore_per_chip(cpu_t * cpu)6229 cpuid_get_ncore_per_chip(cpu_t *cpu)
6230 {
6231 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6232 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6233 }
6234 
6235 uint_t
cpuid_get_ncpu_sharing_last_cache(cpu_t * cpu)6236 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6237 {
6238 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6239 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6240 }
6241 
6242 id_t
cpuid_get_last_lvl_cacheid(cpu_t * cpu)6243 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6244 {
6245 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6246 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6247 }
6248 
6249 uint_t
cpuid_getstep(cpu_t * cpu)6250 cpuid_getstep(cpu_t *cpu)
6251 {
6252 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6253 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
6254 }
6255 
6256 uint_t
cpuid_getsig(struct cpu * cpu)6257 cpuid_getsig(struct cpu *cpu)
6258 {
6259 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6260 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6261 }
6262 
6263 x86_chiprev_t
cpuid_getchiprev(struct cpu * cpu)6264 cpuid_getchiprev(struct cpu *cpu)
6265 {
6266 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6267 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6268 }
6269 
6270 const char *
cpuid_getchiprevstr(struct cpu * cpu)6271 cpuid_getchiprevstr(struct cpu *cpu)
6272 {
6273 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6274 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6275 }
6276 
6277 uint32_t
cpuid_getsockettype(struct cpu * cpu)6278 cpuid_getsockettype(struct cpu *cpu)
6279 {
6280 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6281 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6282 }
6283 
6284 const char *
cpuid_getsocketstr(cpu_t * cpu)6285 cpuid_getsocketstr(cpu_t *cpu)
6286 {
6287 	static const char *socketstr = NULL;
6288 	struct cpuid_info *cpi;
6289 
6290 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6291 	cpi = cpu->cpu_m.mcpu_cpi;
6292 
6293 	/* Assume that socket types are the same across the system */
6294 	if (socketstr == NULL)
6295 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6296 		    cpi->cpi_model, cpi->cpi_step);
6297 
6298 
6299 	return (socketstr);
6300 }
6301 
6302 x86_uarchrev_t
cpuid_getuarchrev(cpu_t * cpu)6303 cpuid_getuarchrev(cpu_t *cpu)
6304 {
6305 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6306 }
6307 
6308 int
cpuid_get_chipid(cpu_t * cpu)6309 cpuid_get_chipid(cpu_t *cpu)
6310 {
6311 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6312 
6313 	if (cpuid_is_cmt(cpu))
6314 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6315 	return (cpu->cpu_id);
6316 }
6317 
6318 id_t
cpuid_get_coreid(cpu_t * cpu)6319 cpuid_get_coreid(cpu_t *cpu)
6320 {
6321 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6322 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6323 }
6324 
6325 int
cpuid_get_pkgcoreid(cpu_t * cpu)6326 cpuid_get_pkgcoreid(cpu_t *cpu)
6327 {
6328 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6329 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6330 }
6331 
6332 int
cpuid_get_clogid(cpu_t * cpu)6333 cpuid_get_clogid(cpu_t *cpu)
6334 {
6335 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6336 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6337 }
6338 
6339 int
cpuid_get_cacheid(cpu_t * cpu)6340 cpuid_get_cacheid(cpu_t *cpu)
6341 {
6342 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6343 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6344 }
6345 
6346 uint_t
cpuid_get_procnodeid(cpu_t * cpu)6347 cpuid_get_procnodeid(cpu_t *cpu)
6348 {
6349 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6350 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6351 }
6352 
6353 uint_t
cpuid_get_procnodes_per_pkg(cpu_t * cpu)6354 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6355 {
6356 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6357 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6358 }
6359 
6360 uint_t
cpuid_get_compunitid(cpu_t * cpu)6361 cpuid_get_compunitid(cpu_t *cpu)
6362 {
6363 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6364 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6365 }
6366 
6367 uint_t
cpuid_get_cores_per_compunit(cpu_t * cpu)6368 cpuid_get_cores_per_compunit(cpu_t *cpu)
6369 {
6370 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6371 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6372 }
6373 
6374 uint32_t
cpuid_get_apicid(cpu_t * cpu)6375 cpuid_get_apicid(cpu_t *cpu)
6376 {
6377 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6378 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6379 		return (UINT32_MAX);
6380 	} else {
6381 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6382 	}
6383 }
6384 
6385 void
cpuid_get_addrsize(cpu_t * cpu,uint_t * pabits,uint_t * vabits)6386 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6387 {
6388 	struct cpuid_info *cpi;
6389 
6390 	if (cpu == NULL)
6391 		cpu = CPU;
6392 	cpi = cpu->cpu_m.mcpu_cpi;
6393 
6394 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6395 
6396 	if (pabits)
6397 		*pabits = cpi->cpi_pabits;
6398 	if (vabits)
6399 		*vabits = cpi->cpi_vabits;
6400 }
6401 
6402 size_t
cpuid_get_xsave_size(void)6403 cpuid_get_xsave_size(void)
6404 {
6405 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6406 	    sizeof (struct xsave_state)));
6407 }
6408 
6409 /*
6410  * Export information about known offsets to the kernel. We only care about
6411  * things we have actually enabled support for in %xcr0.
6412  */
6413 void
cpuid_get_xsave_info(uint64_t bit,size_t * sizep,size_t * offp)6414 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6415 {
6416 	size_t size, off;
6417 
6418 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6419 
6420 	if (sizep == NULL)
6421 		sizep = &size;
6422 	if (offp == NULL)
6423 		offp = &off;
6424 
6425 	switch (bit) {
6426 	case XFEATURE_LEGACY_FP:
6427 	case XFEATURE_SSE:
6428 		*sizep = sizeof (struct fxsave_state);
6429 		*offp = 0;
6430 		break;
6431 	case XFEATURE_AVX:
6432 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6433 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6434 		break;
6435 	case XFEATURE_AVX512_OPMASK:
6436 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6437 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6438 		break;
6439 	case XFEATURE_AVX512_ZMM:
6440 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6441 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6442 		break;
6443 	case XFEATURE_AVX512_HI_ZMM:
6444 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6445 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6446 		break;
6447 	default:
6448 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6449 	}
6450 }
6451 
6452 /*
6453  * Return true if the CPUs on this system require 'pointer clearing' for the
6454  * floating point error pointer exception handling. In the past, this has been
6455  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6456  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6457  * feature bit and is reflected in the cpi_fp_amd_save member.
6458  */
6459 boolean_t
cpuid_need_fp_excp_handling(void)6460 cpuid_need_fp_excp_handling(void)
6461 {
6462 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6463 	    cpuid_info0.cpi_fp_amd_save != 0);
6464 }
6465 
6466 /*
6467  * Returns the number of data TLB entries for a corresponding
6468  * pagesize.  If it can't be computed, or isn't known, the
6469  * routine returns zero.  If you ask about an architecturally
6470  * impossible pagesize, the routine will panic (so that the
6471  * hat implementor knows that things are inconsistent.)
6472  */
6473 uint_t
cpuid_get_dtlb_nent(cpu_t * cpu,size_t pagesize)6474 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6475 {
6476 	struct cpuid_info *cpi;
6477 	uint_t dtlb_nent = 0;
6478 
6479 	if (cpu == NULL)
6480 		cpu = CPU;
6481 	cpi = cpu->cpu_m.mcpu_cpi;
6482 
6483 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6484 
6485 	/*
6486 	 * Check the L2 TLB info
6487 	 */
6488 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6489 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6490 
6491 		switch (pagesize) {
6492 
6493 		case 4 * 1024:
6494 			/*
6495 			 * All zero in the top 16 bits of the register
6496 			 * indicates a unified TLB. Size is in low 16 bits.
6497 			 */
6498 			if ((cp->cp_ebx & 0xffff0000) == 0)
6499 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6500 			else
6501 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6502 			break;
6503 
6504 		case 2 * 1024 * 1024:
6505 			if ((cp->cp_eax & 0xffff0000) == 0)
6506 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6507 			else
6508 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6509 			break;
6510 
6511 		default:
6512 			panic("unknown L2 pagesize");
6513 			/*NOTREACHED*/
6514 		}
6515 	}
6516 
6517 	if (dtlb_nent != 0)
6518 		return (dtlb_nent);
6519 
6520 	/*
6521 	 * No L2 TLB support for this size, try L1.
6522 	 */
6523 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6524 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6525 
6526 		switch (pagesize) {
6527 		case 4 * 1024:
6528 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6529 			break;
6530 		case 2 * 1024 * 1024:
6531 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6532 			break;
6533 		default:
6534 			panic("unknown L1 d-TLB pagesize");
6535 			/*NOTREACHED*/
6536 		}
6537 	}
6538 
6539 	return (dtlb_nent);
6540 }
6541 
6542 /*
6543  * Return 0 if the erratum is not present or not applicable, positive
6544  * if it is, and negative if the status of the erratum is unknown.
6545  *
6546  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6547  * Processors" #25759, Rev 3.57, August 2005
6548  */
6549 int
cpuid_opteron_erratum(cpu_t * cpu,uint_t erratum)6550 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6551 {
6552 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6553 	uint_t eax;
6554 
6555 	/*
6556 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6557 	 * a legacy (32-bit) AMD CPU.
6558 	 */
6559 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6560 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6561 	    cpi->cpi_family == 6) {
6562 		return (0);
6563 	}
6564 
6565 	eax = cpi->cpi_std[1].cp_eax;
6566 
6567 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6568 #define	SH_B3(eax)	(eax == 0xf51)
6569 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6570 
6571 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6572 
6573 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6574 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6575 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6576 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6577 
6578 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6579 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6580 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6581 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6582 
6583 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6584 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6585 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6586 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6587 #define	BH_E4(eax)	(eax == 0x20fb1)
6588 #define	SH_E5(eax)	(eax == 0x20f42)
6589 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6590 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6591 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6592 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6593 			    DH_E6(eax) || JH_E6(eax))
6594 
6595 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6596 #define	DR_B0(eax)	(eax == 0x100f20)
6597 #define	DR_B1(eax)	(eax == 0x100f21)
6598 #define	DR_BA(eax)	(eax == 0x100f2a)
6599 #define	DR_B2(eax)	(eax == 0x100f22)
6600 #define	DR_B3(eax)	(eax == 0x100f23)
6601 #define	RB_C0(eax)	(eax == 0x100f40)
6602 
6603 	switch (erratum) {
6604 	case 1:
6605 		return (cpi->cpi_family < 0x10);
6606 	case 51:	/* what does the asterisk mean? */
6607 		return (B(eax) || SH_C0(eax) || CG(eax));
6608 	case 52:
6609 		return (B(eax));
6610 	case 57:
6611 		return (cpi->cpi_family <= 0x11);
6612 	case 58:
6613 		return (B(eax));
6614 	case 60:
6615 		return (cpi->cpi_family <= 0x11);
6616 	case 61:
6617 	case 62:
6618 	case 63:
6619 	case 64:
6620 	case 65:
6621 	case 66:
6622 	case 68:
6623 	case 69:
6624 	case 70:
6625 	case 71:
6626 		return (B(eax));
6627 	case 72:
6628 		return (SH_B0(eax));
6629 	case 74:
6630 		return (B(eax));
6631 	case 75:
6632 		return (cpi->cpi_family < 0x10);
6633 	case 76:
6634 		return (B(eax));
6635 	case 77:
6636 		return (cpi->cpi_family <= 0x11);
6637 	case 78:
6638 		return (B(eax) || SH_C0(eax));
6639 	case 79:
6640 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6641 	case 80:
6642 	case 81:
6643 	case 82:
6644 		return (B(eax));
6645 	case 83:
6646 		return (B(eax) || SH_C0(eax) || CG(eax));
6647 	case 85:
6648 		return (cpi->cpi_family < 0x10);
6649 	case 86:
6650 		return (SH_C0(eax) || CG(eax));
6651 	case 88:
6652 		return (B(eax) || SH_C0(eax));
6653 	case 89:
6654 		return (cpi->cpi_family < 0x10);
6655 	case 90:
6656 		return (B(eax) || SH_C0(eax) || CG(eax));
6657 	case 91:
6658 	case 92:
6659 		return (B(eax) || SH_C0(eax));
6660 	case 93:
6661 		return (SH_C0(eax));
6662 	case 94:
6663 		return (B(eax) || SH_C0(eax) || CG(eax));
6664 	case 95:
6665 		return (B(eax) || SH_C0(eax));
6666 	case 96:
6667 		return (B(eax) || SH_C0(eax) || CG(eax));
6668 	case 97:
6669 	case 98:
6670 		return (SH_C0(eax) || CG(eax));
6671 	case 99:
6672 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6673 	case 100:
6674 		return (B(eax) || SH_C0(eax));
6675 	case 101:
6676 	case 103:
6677 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6678 	case 104:
6679 		return (SH_C0(eax) || CG(eax) || D0(eax));
6680 	case 105:
6681 	case 106:
6682 	case 107:
6683 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6684 	case 108:
6685 		return (DH_CG(eax));
6686 	case 109:
6687 		return (SH_C0(eax) || CG(eax) || D0(eax));
6688 	case 110:
6689 		return (D0(eax) || EX(eax));
6690 	case 111:
6691 		return (CG(eax));
6692 	case 112:
6693 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6694 	case 113:
6695 		return (eax == 0x20fc0);
6696 	case 114:
6697 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6698 	case 115:
6699 		return (SH_E0(eax) || JH_E1(eax));
6700 	case 116:
6701 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6702 	case 117:
6703 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6704 	case 118:
6705 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6706 		    JH_E6(eax));
6707 	case 121:
6708 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6709 	case 122:
6710 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6711 	case 123:
6712 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6713 	case 131:
6714 		return (cpi->cpi_family < 0x10);
6715 	case 6336786:
6716 
6717 		/*
6718 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6719 		 * if this is a K8 family or newer processor. We're testing for
6720 		 * this 'erratum' to determine whether or not we have a constant
6721 		 * TSC.
6722 		 *
6723 		 * Our current fix for this is to disable the C1-Clock ramping.
6724 		 * However, this doesn't work on newer processor families nor
6725 		 * does it work when virtualized as those devices don't exist.
6726 		 */
6727 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6728 			return (0);
6729 		}
6730 
6731 		if (CPI_FAMILY(cpi) == 0xf) {
6732 			struct cpuid_regs regs;
6733 			regs.cp_eax = 0x80000007;
6734 			(void) __cpuid_insn(&regs);
6735 			return (!(regs.cp_edx & 0x100));
6736 		}
6737 		return (0);
6738 	case 147:
6739 		/*
6740 		 * This erratum (K8 #147) is not present on family 10 and newer.
6741 		 */
6742 		if (cpi->cpi_family >= 0x10) {
6743 			return (0);
6744 		}
6745 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6746 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6747 
6748 	case 6671130:
6749 		/*
6750 		 * check for processors (pre-Shanghai) that do not provide
6751 		 * optimal management of 1gb ptes in its tlb.
6752 		 */
6753 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6754 
6755 	case 298:
6756 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6757 		    DR_B2(eax) || RB_C0(eax));
6758 
6759 	case 721:
6760 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6761 
6762 	default:
6763 		return (-1);
6764 
6765 	}
6766 }
6767 
6768 /*
6769  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6770  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6771  */
6772 int
osvw_opteron_erratum(cpu_t * cpu,uint_t erratum)6773 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6774 {
6775 	struct cpuid_info	*cpi;
6776 	uint_t			osvwid;
6777 	static int		osvwfeature = -1;
6778 	uint64_t		osvwlength;
6779 
6780 
6781 	cpi = cpu->cpu_m.mcpu_cpi;
6782 
6783 	/* confirm OSVW supported */
6784 	if (osvwfeature == -1) {
6785 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6786 	} else {
6787 		/* assert that osvw feature setting is consistent on all cpus */
6788 		ASSERT(osvwfeature ==
6789 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6790 	}
6791 	if (!osvwfeature)
6792 		return (-1);
6793 
6794 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6795 
6796 	switch (erratum) {
6797 	case 298:	/* osvwid is 0 */
6798 		osvwid = 0;
6799 		if (osvwlength <= (uint64_t)osvwid) {
6800 			/* osvwid 0 is unknown */
6801 			return (-1);
6802 		}
6803 
6804 		/*
6805 		 * Check the OSVW STATUS MSR to determine the state
6806 		 * of the erratum where:
6807 		 *   0 - fixed by HW
6808 		 *   1 - BIOS has applied the workaround when BIOS
6809 		 *   workaround is available. (Or for other errata,
6810 		 *   OS workaround is required.)
6811 		 * For a value of 1, caller will confirm that the
6812 		 * erratum 298 workaround has indeed been applied by BIOS.
6813 		 *
6814 		 * A 1 may be set in cpus that have a HW fix
6815 		 * in a mixed cpu system. Regarding erratum 298:
6816 		 *   In a multiprocessor platform, the workaround above
6817 		 *   should be applied to all processors regardless of
6818 		 *   silicon revision when an affected processor is
6819 		 *   present.
6820 		 */
6821 
6822 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6823 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6824 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6825 
6826 	default:
6827 		return (-1);
6828 	}
6829 }
6830 
6831 static const char assoc_str[] = "associativity";
6832 static const char line_str[] = "line-size";
6833 static const char size_str[] = "size";
6834 
6835 static void
add_cache_prop(dev_info_t * devi,const char * label,const char * type,uint32_t val)6836 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6837     uint32_t val)
6838 {
6839 	char buf[128];
6840 
6841 	/*
6842 	 * ndi_prop_update_int() is used because it is desirable for
6843 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6844 	 */
6845 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6846 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6847 }
6848 
6849 /*
6850  * Intel-style cache/tlb description
6851  *
6852  * Standard cpuid level 2 gives a randomly ordered
6853  * selection of tags that index into a table that describes
6854  * cache and tlb properties.
6855  */
6856 
6857 static const char l1_icache_str[] = "l1-icache";
6858 static const char l1_dcache_str[] = "l1-dcache";
6859 static const char l2_cache_str[] = "l2-cache";
6860 static const char l3_cache_str[] = "l3-cache";
6861 static const char itlb4k_str[] = "itlb-4K";
6862 static const char dtlb4k_str[] = "dtlb-4K";
6863 static const char itlb2M_str[] = "itlb-2M";
6864 static const char itlb4M_str[] = "itlb-4M";
6865 static const char dtlb4M_str[] = "dtlb-4M";
6866 static const char dtlb24_str[] = "dtlb0-2M-4M";
6867 static const char itlb424_str[] = "itlb-4K-2M-4M";
6868 static const char itlb24_str[] = "itlb-2M-4M";
6869 static const char dtlb44_str[] = "dtlb-4K-4M";
6870 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6871 static const char sl2_cache_str[] = "sectored-l2-cache";
6872 static const char itrace_str[] = "itrace-cache";
6873 static const char sl3_cache_str[] = "sectored-l3-cache";
6874 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6875 
6876 static const struct cachetab {
6877 	uint8_t		ct_code;
6878 	uint8_t		ct_assoc;
6879 	uint16_t	ct_line_size;
6880 	size_t		ct_size;
6881 	const char	*ct_label;
6882 } intel_ctab[] = {
6883 	/*
6884 	 * maintain descending order!
6885 	 *
6886 	 * Codes ignored - Reason
6887 	 * ----------------------
6888 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6889 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6890 	 */
6891 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6892 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6893 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6894 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6895 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6896 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6897 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6898 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6899 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6900 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6901 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6902 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6903 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6904 	{ 0xc0, 4, 0, 8, dtlb44_str },
6905 	{ 0xba, 4, 0, 64, dtlb4k_str },
6906 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6907 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6908 	{ 0xb2, 4, 0, 64, itlb4k_str },
6909 	{ 0xb0, 4, 0, 128, itlb4k_str },
6910 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6911 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6912 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6913 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6914 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6915 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6916 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6917 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6918 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6919 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6920 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6921 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6922 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6923 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6924 	{ 0x73, 8, 0, 64*1024, itrace_str},
6925 	{ 0x72, 8, 0, 32*1024, itrace_str},
6926 	{ 0x71, 8, 0, 16*1024, itrace_str},
6927 	{ 0x70, 8, 0, 12*1024, itrace_str},
6928 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6929 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6930 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6931 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6932 	{ 0x5d, 0, 0, 256, dtlb44_str},
6933 	{ 0x5c, 0, 0, 128, dtlb44_str},
6934 	{ 0x5b, 0, 0, 64, dtlb44_str},
6935 	{ 0x5a, 4, 0, 32, dtlb24_str},
6936 	{ 0x59, 0, 0, 16, dtlb4k_str},
6937 	{ 0x57, 4, 0, 16, dtlb4k_str},
6938 	{ 0x56, 4, 0, 16, dtlb4M_str},
6939 	{ 0x55, 0, 0, 7, itlb24_str},
6940 	{ 0x52, 0, 0, 256, itlb424_str},
6941 	{ 0x51, 0, 0, 128, itlb424_str},
6942 	{ 0x50, 0, 0, 64, itlb424_str},
6943 	{ 0x4f, 0, 0, 32, itlb4k_str},
6944 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6945 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6946 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6947 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6948 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6949 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6950 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6951 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6952 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6953 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6954 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6955 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6956 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6957 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6958 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6959 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6960 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6961 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6962 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6963 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6964 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6965 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6966 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6967 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6968 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6969 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6970 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6971 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6972 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6973 	{ 0x0b, 4, 0, 4, itlb4M_str},
6974 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6975 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6976 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6977 	{ 0x05, 4, 0, 32, dtlb4M_str},
6978 	{ 0x04, 4, 0, 8, dtlb4M_str},
6979 	{ 0x03, 4, 0, 64, dtlb4k_str},
6980 	{ 0x02, 4, 0, 2, itlb4M_str},
6981 	{ 0x01, 4, 0, 32, itlb4k_str},
6982 	{ 0 }
6983 };
6984 
6985 static const struct cachetab cyrix_ctab[] = {
6986 	{ 0x70, 4, 0, 32, "tlb-4K" },
6987 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6988 	{ 0 }
6989 };
6990 
6991 /*
6992  * Search a cache table for a matching entry
6993  */
6994 static const struct cachetab *
find_cacheent(const struct cachetab * ct,uint_t code)6995 find_cacheent(const struct cachetab *ct, uint_t code)
6996 {
6997 	if (code != 0) {
6998 		for (; ct->ct_code != 0; ct++)
6999 			if (ct->ct_code <= code)
7000 				break;
7001 		if (ct->ct_code == code)
7002 			return (ct);
7003 	}
7004 	return (NULL);
7005 }
7006 
7007 /*
7008  * Populate cachetab entry with L2 or L3 cache-information using
7009  * cpuid function 4. This function is called from intel_walk_cacheinfo()
7010  * when descriptor 0x49 is encountered. It returns 0 if no such cache
7011  * information is found.
7012  */
7013 static int
intel_cpuid_4_cache_info(struct cachetab * ct,struct cpuid_info * cpi)7014 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
7015 {
7016 	uint32_t level, i;
7017 	int ret = 0;
7018 
7019 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
7020 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
7021 
7022 		if (level == 2 || level == 3) {
7023 			ct->ct_assoc =
7024 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
7025 			ct->ct_line_size =
7026 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
7027 			ct->ct_size = ct->ct_assoc *
7028 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
7029 			    ct->ct_line_size *
7030 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
7031 
7032 			if (level == 2) {
7033 				ct->ct_label = l2_cache_str;
7034 			} else if (level == 3) {
7035 				ct->ct_label = l3_cache_str;
7036 			}
7037 			ret = 1;
7038 		}
7039 	}
7040 
7041 	return (ret);
7042 }
7043 
7044 /*
7045  * Walk the cacheinfo descriptor, applying 'func' to every valid element
7046  * The walk is terminated if the walker returns non-zero.
7047  */
7048 static void
intel_walk_cacheinfo(struct cpuid_info * cpi,void * arg,int (* func)(void *,const struct cachetab *))7049 intel_walk_cacheinfo(struct cpuid_info *cpi,
7050     void *arg, int (*func)(void *, const struct cachetab *))
7051 {
7052 	const struct cachetab *ct;
7053 	struct cachetab des_49_ct, des_b1_ct;
7054 	uint8_t *dp;
7055 	int i;
7056 
7057 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7058 		return;
7059 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7060 		/*
7061 		 * For overloaded descriptor 0x49 we use cpuid function 4
7062 		 * if supported by the current processor, to create
7063 		 * cache information.
7064 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
7065 		 * to disambiguate the cache information.
7066 		 */
7067 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
7068 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
7069 				ct = &des_49_ct;
7070 		} else if (*dp == 0xb1) {
7071 			des_b1_ct.ct_code = 0xb1;
7072 			des_b1_ct.ct_assoc = 4;
7073 			des_b1_ct.ct_line_size = 0;
7074 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
7075 				des_b1_ct.ct_size = 8;
7076 				des_b1_ct.ct_label = itlb2M_str;
7077 			} else {
7078 				des_b1_ct.ct_size = 4;
7079 				des_b1_ct.ct_label = itlb4M_str;
7080 			}
7081 			ct = &des_b1_ct;
7082 		} else {
7083 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
7084 				continue;
7085 			}
7086 		}
7087 
7088 		if (func(arg, ct) != 0) {
7089 			break;
7090 		}
7091 	}
7092 }
7093 
7094 /*
7095  * (Like the Intel one, except for Cyrix CPUs)
7096  */
7097 static void
cyrix_walk_cacheinfo(struct cpuid_info * cpi,void * arg,int (* func)(void *,const struct cachetab *))7098 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
7099     void *arg, int (*func)(void *, const struct cachetab *))
7100 {
7101 	const struct cachetab *ct;
7102 	uint8_t *dp;
7103 	int i;
7104 
7105 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7106 		return;
7107 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7108 		/*
7109 		 * Search Cyrix-specific descriptor table first ..
7110 		 */
7111 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
7112 			if (func(arg, ct) != 0)
7113 				break;
7114 			continue;
7115 		}
7116 		/*
7117 		 * .. else fall back to the Intel one
7118 		 */
7119 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
7120 			if (func(arg, ct) != 0)
7121 				break;
7122 			continue;
7123 		}
7124 	}
7125 }
7126 
7127 /*
7128  * A cacheinfo walker that adds associativity, line-size, and size properties
7129  * to the devinfo node it is passed as an argument.
7130  */
7131 static int
add_cacheent_props(void * arg,const struct cachetab * ct)7132 add_cacheent_props(void *arg, const struct cachetab *ct)
7133 {
7134 	dev_info_t *devi = arg;
7135 
7136 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
7137 	if (ct->ct_line_size != 0)
7138 		add_cache_prop(devi, ct->ct_label, line_str,
7139 		    ct->ct_line_size);
7140 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
7141 	return (0);
7142 }
7143 
7144 
7145 static const char fully_assoc[] = "fully-associative?";
7146 
7147 /*
7148  * AMD style cache/tlb description
7149  *
7150  * Extended functions 5 and 6 directly describe properties of
7151  * tlbs and various cache levels.
7152  */
7153 static void
add_amd_assoc(dev_info_t * devi,const char * label,uint_t assoc)7154 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7155 {
7156 	switch (assoc) {
7157 	case 0:	/* reserved; ignore */
7158 		break;
7159 	default:
7160 		add_cache_prop(devi, label, assoc_str, assoc);
7161 		break;
7162 	case 0xff:
7163 		add_cache_prop(devi, label, fully_assoc, 1);
7164 		break;
7165 	}
7166 }
7167 
7168 static void
add_amd_tlb(dev_info_t * devi,const char * label,uint_t assoc,uint_t size)7169 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7170 {
7171 	if (size == 0)
7172 		return;
7173 	add_cache_prop(devi, label, size_str, size);
7174 	add_amd_assoc(devi, label, assoc);
7175 }
7176 
7177 static void
add_amd_cache(dev_info_t * devi,const char * label,uint_t size,uint_t assoc,uint_t lines_per_tag,uint_t line_size)7178 add_amd_cache(dev_info_t *devi, const char *label,
7179     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7180 {
7181 	if (size == 0 || line_size == 0)
7182 		return;
7183 	add_amd_assoc(devi, label, assoc);
7184 	/*
7185 	 * Most AMD parts have a sectored cache. Multiple cache lines are
7186 	 * associated with each tag. A sector consists of all cache lines
7187 	 * associated with a tag. For example, the AMD K6-III has a sector
7188 	 * size of 2 cache lines per tag.
7189 	 */
7190 	if (lines_per_tag != 0)
7191 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7192 	add_cache_prop(devi, label, line_str, line_size);
7193 	add_cache_prop(devi, label, size_str, size * 1024);
7194 }
7195 
7196 static void
add_amd_l2_assoc(dev_info_t * devi,const char * label,uint_t assoc)7197 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7198 {
7199 	switch (assoc) {
7200 	case 0:	/* off */
7201 		break;
7202 	case 1:
7203 	case 2:
7204 	case 4:
7205 		add_cache_prop(devi, label, assoc_str, assoc);
7206 		break;
7207 	case 6:
7208 		add_cache_prop(devi, label, assoc_str, 8);
7209 		break;
7210 	case 8:
7211 		add_cache_prop(devi, label, assoc_str, 16);
7212 		break;
7213 	case 0xf:
7214 		add_cache_prop(devi, label, fully_assoc, 1);
7215 		break;
7216 	default: /* reserved; ignore */
7217 		break;
7218 	}
7219 }
7220 
7221 static void
add_amd_l2_tlb(dev_info_t * devi,const char * label,uint_t assoc,uint_t size)7222 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7223 {
7224 	if (size == 0 || assoc == 0)
7225 		return;
7226 	add_amd_l2_assoc(devi, label, assoc);
7227 	add_cache_prop(devi, label, size_str, size);
7228 }
7229 
7230 static void
add_amd_l2_cache(dev_info_t * devi,const char * label,uint_t size,uint_t assoc,uint_t lines_per_tag,uint_t line_size)7231 add_amd_l2_cache(dev_info_t *devi, const char *label,
7232     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7233 {
7234 	if (size == 0 || assoc == 0 || line_size == 0)
7235 		return;
7236 	add_amd_l2_assoc(devi, label, assoc);
7237 	if (lines_per_tag != 0)
7238 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7239 	add_cache_prop(devi, label, line_str, line_size);
7240 	add_cache_prop(devi, label, size_str, size * 1024);
7241 }
7242 
7243 static void
amd_cache_info(struct cpuid_info * cpi,dev_info_t * devi)7244 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7245 {
7246 	struct cpuid_regs *cp;
7247 
7248 	if (cpi->cpi_xmaxeax < 0x80000005)
7249 		return;
7250 	cp = &cpi->cpi_extd[5];
7251 
7252 	/*
7253 	 * 4M/2M L1 TLB configuration
7254 	 *
7255 	 * We report the size for 2M pages because AMD uses two
7256 	 * TLB entries for one 4M page.
7257 	 */
7258 	add_amd_tlb(devi, "dtlb-2M",
7259 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7260 	add_amd_tlb(devi, "itlb-2M",
7261 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7262 
7263 	/*
7264 	 * 4K L1 TLB configuration
7265 	 */
7266 
7267 	switch (cpi->cpi_vendor) {
7268 		uint_t nentries;
7269 	case X86_VENDOR_TM:
7270 		if (cpi->cpi_family >= 5) {
7271 			/*
7272 			 * Crusoe processors have 256 TLB entries, but
7273 			 * cpuid data format constrains them to only
7274 			 * reporting 255 of them.
7275 			 */
7276 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7277 				nentries = 256;
7278 			/*
7279 			 * Crusoe processors also have a unified TLB
7280 			 */
7281 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7282 			    nentries);
7283 			break;
7284 		}
7285 		/*FALLTHROUGH*/
7286 	default:
7287 		add_amd_tlb(devi, itlb4k_str,
7288 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7289 		add_amd_tlb(devi, dtlb4k_str,
7290 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7291 		break;
7292 	}
7293 
7294 	/*
7295 	 * data L1 cache configuration
7296 	 */
7297 
7298 	add_amd_cache(devi, l1_dcache_str,
7299 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7300 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7301 
7302 	/*
7303 	 * code L1 cache configuration
7304 	 */
7305 
7306 	add_amd_cache(devi, l1_icache_str,
7307 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7308 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7309 
7310 	if (cpi->cpi_xmaxeax < 0x80000006)
7311 		return;
7312 	cp = &cpi->cpi_extd[6];
7313 
7314 	/* Check for a unified L2 TLB for large pages */
7315 
7316 	if (BITX(cp->cp_eax, 31, 16) == 0)
7317 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7318 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7319 	else {
7320 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7321 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7322 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7323 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7324 	}
7325 
7326 	/* Check for a unified L2 TLB for 4K pages */
7327 
7328 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7329 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7330 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7331 	} else {
7332 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7333 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7334 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7335 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7336 	}
7337 
7338 	add_amd_l2_cache(devi, l2_cache_str,
7339 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7340 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7341 }
7342 
7343 /*
7344  * There are two basic ways that the x86 world describes it cache
7345  * and tlb architecture - Intel's way and AMD's way.
7346  *
7347  * Return which flavor of cache architecture we should use
7348  */
7349 static int
x86_which_cacheinfo(struct cpuid_info * cpi)7350 x86_which_cacheinfo(struct cpuid_info *cpi)
7351 {
7352 	switch (cpi->cpi_vendor) {
7353 	case X86_VENDOR_Intel:
7354 		if (cpi->cpi_maxeax >= 2)
7355 			return (X86_VENDOR_Intel);
7356 		break;
7357 	case X86_VENDOR_AMD:
7358 		/*
7359 		 * The K5 model 1 was the first part from AMD that reported
7360 		 * cache sizes via extended cpuid functions.
7361 		 */
7362 		if (cpi->cpi_family > 5 ||
7363 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7364 			return (X86_VENDOR_AMD);
7365 		break;
7366 	case X86_VENDOR_HYGON:
7367 		return (X86_VENDOR_AMD);
7368 	case X86_VENDOR_TM:
7369 		if (cpi->cpi_family >= 5)
7370 			return (X86_VENDOR_AMD);
7371 		/*FALLTHROUGH*/
7372 	default:
7373 		/*
7374 		 * If they have extended CPU data for 0x80000005
7375 		 * then we assume they have AMD-format cache
7376 		 * information.
7377 		 *
7378 		 * If not, and the vendor happens to be Cyrix,
7379 		 * then try our-Cyrix specific handler.
7380 		 *
7381 		 * If we're not Cyrix, then assume we're using Intel's
7382 		 * table-driven format instead.
7383 		 */
7384 		if (cpi->cpi_xmaxeax >= 0x80000005)
7385 			return (X86_VENDOR_AMD);
7386 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7387 			return (X86_VENDOR_Cyrix);
7388 		else if (cpi->cpi_maxeax >= 2)
7389 			return (X86_VENDOR_Intel);
7390 		break;
7391 	}
7392 	return (-1);
7393 }
7394 
7395 void
cpuid_set_cpu_properties(void * dip,processorid_t cpu_id,struct cpuid_info * cpi)7396 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7397     struct cpuid_info *cpi)
7398 {
7399 	dev_info_t *cpu_devi;
7400 	int create;
7401 
7402 	cpu_devi = (dev_info_t *)dip;
7403 
7404 	/* device_type */
7405 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7406 	    "device_type", "cpu");
7407 
7408 	/* reg */
7409 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7410 	    "reg", cpu_id);
7411 
7412 	/* cpu-mhz, and clock-frequency */
7413 	if (cpu_freq > 0) {
7414 		long long mul;
7415 
7416 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7417 		    "cpu-mhz", cpu_freq);
7418 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7419 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7420 			    "clock-frequency", (int)mul);
7421 	}
7422 
7423 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7424 
7425 	/* vendor-id */
7426 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7427 	    "vendor-id", cpi->cpi_vendorstr);
7428 
7429 	if (cpi->cpi_maxeax == 0) {
7430 		return;
7431 	}
7432 
7433 	/*
7434 	 * family, model, and step
7435 	 */
7436 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7437 	    "family", CPI_FAMILY(cpi));
7438 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7439 	    "cpu-model", CPI_MODEL(cpi));
7440 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7441 	    "stepping-id", CPI_STEP(cpi));
7442 
7443 	/* type */
7444 	switch (cpi->cpi_vendor) {
7445 	case X86_VENDOR_Intel:
7446 		create = 1;
7447 		break;
7448 	default:
7449 		create = 0;
7450 		break;
7451 	}
7452 	if (create)
7453 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7454 		    "type", CPI_TYPE(cpi));
7455 
7456 	/* ext-family */
7457 	switch (cpi->cpi_vendor) {
7458 	case X86_VENDOR_Intel:
7459 	case X86_VENDOR_AMD:
7460 		create = cpi->cpi_family >= 0xf;
7461 		break;
7462 	case X86_VENDOR_HYGON:
7463 		create = 1;
7464 		break;
7465 	default:
7466 		create = 0;
7467 		break;
7468 	}
7469 	if (create)
7470 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7471 		    "ext-family", CPI_FAMILY_XTD(cpi));
7472 
7473 	/* ext-model */
7474 	switch (cpi->cpi_vendor) {
7475 	case X86_VENDOR_Intel:
7476 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7477 		break;
7478 	case X86_VENDOR_AMD:
7479 		create = CPI_FAMILY(cpi) == 0xf;
7480 		break;
7481 	case X86_VENDOR_HYGON:
7482 		create = 1;
7483 		break;
7484 	default:
7485 		create = 0;
7486 		break;
7487 	}
7488 	if (create)
7489 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7490 		    "ext-model", CPI_MODEL_XTD(cpi));
7491 
7492 	/* generation */
7493 	switch (cpi->cpi_vendor) {
7494 	case X86_VENDOR_AMD:
7495 	case X86_VENDOR_HYGON:
7496 		/*
7497 		 * AMD K5 model 1 was the first part to support this
7498 		 */
7499 		create = cpi->cpi_xmaxeax >= 0x80000001;
7500 		break;
7501 	default:
7502 		create = 0;
7503 		break;
7504 	}
7505 	if (create)
7506 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7507 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7508 
7509 	/* brand-id */
7510 	switch (cpi->cpi_vendor) {
7511 	case X86_VENDOR_Intel:
7512 		/*
7513 		 * brand id first appeared on Pentium III Xeon model 8,
7514 		 * and Celeron model 8 processors and Opteron
7515 		 */
7516 		create = cpi->cpi_family > 6 ||
7517 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7518 		break;
7519 	case X86_VENDOR_AMD:
7520 		create = cpi->cpi_family >= 0xf;
7521 		break;
7522 	case X86_VENDOR_HYGON:
7523 		create = 1;
7524 		break;
7525 	default:
7526 		create = 0;
7527 		break;
7528 	}
7529 	if (create && cpi->cpi_brandid != 0) {
7530 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7531 		    "brand-id", cpi->cpi_brandid);
7532 	}
7533 
7534 	/* chunks, and apic-id */
7535 	switch (cpi->cpi_vendor) {
7536 		/*
7537 		 * first available on Pentium IV and Opteron (K8)
7538 		 */
7539 	case X86_VENDOR_Intel:
7540 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7541 		break;
7542 	case X86_VENDOR_AMD:
7543 		create = cpi->cpi_family >= 0xf;
7544 		break;
7545 	case X86_VENDOR_HYGON:
7546 		create = 1;
7547 		break;
7548 	default:
7549 		create = 0;
7550 		break;
7551 	}
7552 	if (create) {
7553 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7554 		    "chunks", CPI_CHUNKS(cpi));
7555 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7556 		    "apic-id", cpi->cpi_apicid);
7557 		if (cpi->cpi_chipid >= 0) {
7558 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7559 			    "chip#", cpi->cpi_chipid);
7560 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7561 			    "clog#", cpi->cpi_clogid);
7562 		}
7563 	}
7564 
7565 	/* cpuid-features */
7566 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7567 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7568 
7569 
7570 	/* cpuid-features-ecx */
7571 	switch (cpi->cpi_vendor) {
7572 	case X86_VENDOR_Intel:
7573 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7574 		break;
7575 	case X86_VENDOR_AMD:
7576 		create = cpi->cpi_family >= 0xf;
7577 		break;
7578 	case X86_VENDOR_HYGON:
7579 		create = 1;
7580 		break;
7581 	default:
7582 		create = 0;
7583 		break;
7584 	}
7585 	if (create)
7586 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7587 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7588 
7589 	/* ext-cpuid-features */
7590 	switch (cpi->cpi_vendor) {
7591 	case X86_VENDOR_Intel:
7592 	case X86_VENDOR_AMD:
7593 	case X86_VENDOR_HYGON:
7594 	case X86_VENDOR_Cyrix:
7595 	case X86_VENDOR_TM:
7596 	case X86_VENDOR_Centaur:
7597 		create = cpi->cpi_xmaxeax >= 0x80000001;
7598 		break;
7599 	default:
7600 		create = 0;
7601 		break;
7602 	}
7603 	if (create) {
7604 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7605 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7606 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7607 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7608 	}
7609 
7610 	/*
7611 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7612 	 * model 1, and Cyrix GXm.  On earlier models we try and
7613 	 * simulate something similar .. so this string should always
7614 	 * same -something- about the processor, however lame.
7615 	 */
7616 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7617 	    "brand-string", cpi->cpi_brandstr);
7618 
7619 	/*
7620 	 * Finally, cache and tlb information
7621 	 */
7622 	switch (x86_which_cacheinfo(cpi)) {
7623 	case X86_VENDOR_Intel:
7624 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7625 		break;
7626 	case X86_VENDOR_Cyrix:
7627 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7628 		break;
7629 	case X86_VENDOR_AMD:
7630 		amd_cache_info(cpi, cpu_devi);
7631 		break;
7632 	default:
7633 		break;
7634 	}
7635 }
7636 
7637 struct l2info {
7638 	int *l2i_csz;
7639 	int *l2i_lsz;
7640 	int *l2i_assoc;
7641 	int l2i_ret;
7642 };
7643 
7644 /*
7645  * A cacheinfo walker that fetches the size, line-size and associativity
7646  * of the L2 cache
7647  */
7648 static int
intel_l2cinfo(void * arg,const struct cachetab * ct)7649 intel_l2cinfo(void *arg, const struct cachetab *ct)
7650 {
7651 	struct l2info *l2i = arg;
7652 	int *ip;
7653 
7654 	if (ct->ct_label != l2_cache_str &&
7655 	    ct->ct_label != sl2_cache_str)
7656 		return (0);	/* not an L2 -- keep walking */
7657 
7658 	if ((ip = l2i->l2i_csz) != NULL)
7659 		*ip = ct->ct_size;
7660 	if ((ip = l2i->l2i_lsz) != NULL)
7661 		*ip = ct->ct_line_size;
7662 	if ((ip = l2i->l2i_assoc) != NULL)
7663 		*ip = ct->ct_assoc;
7664 	l2i->l2i_ret = ct->ct_size;
7665 	return (1);		/* was an L2 -- terminate walk */
7666 }
7667 
7668 /*
7669  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7670  *
7671  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7672  *	value is the associativity, the associativity for the L2 cache and
7673  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7674  *	an index into the amd_afd[] array to determine the associativity.
7675  *	-1 is undefined. 0 is fully associative.
7676  */
7677 
7678 static int amd_afd[] =
7679 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7680 
7681 static void
amd_l2cacheinfo(struct cpuid_info * cpi,struct l2info * l2i)7682 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7683 {
7684 	struct cpuid_regs *cp;
7685 	uint_t size, assoc;
7686 	int i;
7687 	int *ip;
7688 
7689 	if (cpi->cpi_xmaxeax < 0x80000006)
7690 		return;
7691 	cp = &cpi->cpi_extd[6];
7692 
7693 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7694 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7695 		uint_t cachesz = size * 1024;
7696 		assoc = amd_afd[i];
7697 
7698 		ASSERT(assoc != -1);
7699 
7700 		if ((ip = l2i->l2i_csz) != NULL)
7701 			*ip = cachesz;
7702 		if ((ip = l2i->l2i_lsz) != NULL)
7703 			*ip = BITX(cp->cp_ecx, 7, 0);
7704 		if ((ip = l2i->l2i_assoc) != NULL)
7705 			*ip = assoc;
7706 		l2i->l2i_ret = cachesz;
7707 	}
7708 }
7709 
7710 int
getl2cacheinfo(cpu_t * cpu,int * csz,int * lsz,int * assoc)7711 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7712 {
7713 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7714 	struct l2info __l2info, *l2i = &__l2info;
7715 
7716 	l2i->l2i_csz = csz;
7717 	l2i->l2i_lsz = lsz;
7718 	l2i->l2i_assoc = assoc;
7719 	l2i->l2i_ret = -1;
7720 
7721 	switch (x86_which_cacheinfo(cpi)) {
7722 	case X86_VENDOR_Intel:
7723 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7724 		break;
7725 	case X86_VENDOR_Cyrix:
7726 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7727 		break;
7728 	case X86_VENDOR_AMD:
7729 		amd_l2cacheinfo(cpi, l2i);
7730 		break;
7731 	default:
7732 		break;
7733 	}
7734 	return (l2i->l2i_ret);
7735 }
7736 
7737 #if !defined(__xpv)
7738 
7739 uint32_t *
cpuid_mwait_alloc(cpu_t * cpu)7740 cpuid_mwait_alloc(cpu_t *cpu)
7741 {
7742 	uint32_t	*ret;
7743 	size_t		mwait_size;
7744 
7745 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7746 
7747 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7748 	if (mwait_size == 0)
7749 		return (NULL);
7750 
7751 	/*
7752 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7753 	 * allocations.  mwait_size is currently cache line sized.  Neither
7754 	 * of these implementation details are guarantied to be true in the
7755 	 * future.
7756 	 *
7757 	 * First try allocating mwait_size as kmem_alloc() currently returns
7758 	 * correctly aligned memory.  If kmem_alloc() does not return
7759 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7760 	 *
7761 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7762 	 * decide to free this memory.
7763 	 */
7764 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7765 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7766 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7767 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7768 		*ret = MWAIT_RUNNING;
7769 		return (ret);
7770 	} else {
7771 		kmem_free(ret, mwait_size);
7772 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7773 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7774 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7775 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7776 		*ret = MWAIT_RUNNING;
7777 		return (ret);
7778 	}
7779 }
7780 
7781 void
cpuid_mwait_free(cpu_t * cpu)7782 cpuid_mwait_free(cpu_t *cpu)
7783 {
7784 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7785 		return;
7786 	}
7787 
7788 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7789 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7790 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7791 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7792 	}
7793 
7794 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7795 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7796 }
7797 
7798 void
patch_tsc_read(int flag)7799 patch_tsc_read(int flag)
7800 {
7801 	size_t cnt;
7802 
7803 	switch (flag) {
7804 	case TSC_NONE:
7805 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7806 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7807 		break;
7808 	case TSC_RDTSC_LFENCE:
7809 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7810 		(void) memcpy((void *)tsc_read,
7811 		    (void *)&_tsc_lfence_start, cnt);
7812 		break;
7813 	case TSC_TSCP:
7814 		cnt = &_tscp_end - &_tscp_start;
7815 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7816 		break;
7817 	default:
7818 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7819 		cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
7820 		break;
7821 	}
7822 	tsc_type = flag;
7823 }
7824 
7825 int
cpuid_deep_cstates_supported(void)7826 cpuid_deep_cstates_supported(void)
7827 {
7828 	struct cpuid_info *cpi;
7829 	struct cpuid_regs regs;
7830 
7831 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7832 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7833 
7834 	cpi = CPU->cpu_m.mcpu_cpi;
7835 
7836 	switch (cpi->cpi_vendor) {
7837 	case X86_VENDOR_Intel:
7838 	case X86_VENDOR_AMD:
7839 	case X86_VENDOR_HYGON:
7840 		if (cpi->cpi_xmaxeax < 0x80000007)
7841 			return (0);
7842 
7843 		/*
7844 		 * Does TSC run at a constant rate in all C-states?
7845 		 */
7846 		regs.cp_eax = 0x80000007;
7847 		(void) __cpuid_insn(&regs);
7848 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7849 
7850 	default:
7851 		return (0);
7852 	}
7853 }
7854 
7855 #endif	/* !__xpv */
7856 
7857 void
post_startup_cpu_fixups(void)7858 post_startup_cpu_fixups(void)
7859 {
7860 #ifndef __xpv
7861 	/*
7862 	 * Some AMD processors support C1E state. Entering this state will
7863 	 * cause the local APIC timer to stop, which we can't deal with at
7864 	 * this time.
7865 	 */
7866 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7867 		on_trap_data_t otd;
7868 		uint64_t reg;
7869 
7870 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7871 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7872 			/* Disable C1E state if it is enabled by BIOS */
7873 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7874 			    AMD_ACTONCMPHALT_MASK) {
7875 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7876 				    AMD_ACTONCMPHALT_SHIFT);
7877 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7878 			}
7879 		}
7880 		no_trap();
7881 	}
7882 #endif	/* !__xpv */
7883 }
7884 
7885 void
enable_pcid(void)7886 enable_pcid(void)
7887 {
7888 	if (x86_use_pcid == -1)
7889 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7890 
7891 	if (x86_use_invpcid == -1) {
7892 		x86_use_invpcid = is_x86_feature(x86_featureset,
7893 		    X86FSET_INVPCID);
7894 	}
7895 
7896 	if (!x86_use_pcid)
7897 		return;
7898 
7899 	/*
7900 	 * Intel say that on setting PCIDE, it immediately starts using the PCID
7901 	 * bits; better make sure there's nothing there.
7902 	 */
7903 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7904 
7905 	setcr4(getcr4() | CR4_PCIDE);
7906 }
7907 
7908 /*
7909  * Setup necessary registers to enable XSAVE feature on this processor.
7910  * This function needs to be called early enough, so that no xsave/xrstor
7911  * ops will execute on the processor before the MSRs are properly set up.
7912  *
7913  * Current implementation has the following assumption:
7914  * - cpuid_pass_basic() is done, so that X86 features are known.
7915  * - fpu_probe() is done, so that fp_save_mech is chosen.
7916  */
7917 void
xsave_setup_msr(cpu_t * cpu)7918 xsave_setup_msr(cpu_t *cpu)
7919 {
7920 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7921 	ASSERT(fp_save_mech == FP_XSAVE);
7922 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7923 
7924 	/* Enable OSXSAVE in CR4. */
7925 	setcr4(getcr4() | CR4_OSXSAVE);
7926 	/*
7927 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7928 	 * correct value.
7929 	 */
7930 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7931 	setup_xfem();
7932 }
7933 
7934 /*
7935  * Starting with the Westmere processor the local
7936  * APIC timer will continue running in all C-states,
7937  * including the deepest C-states.
7938  */
7939 int
cpuid_arat_supported(void)7940 cpuid_arat_supported(void)
7941 {
7942 	struct cpuid_info *cpi;
7943 	struct cpuid_regs regs;
7944 
7945 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7946 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7947 
7948 	cpi = CPU->cpu_m.mcpu_cpi;
7949 
7950 	switch (cpi->cpi_vendor) {
7951 	case X86_VENDOR_Intel:
7952 	case X86_VENDOR_AMD:
7953 	case X86_VENDOR_HYGON:
7954 		/*
7955 		 * Always-running Local APIC Timer is
7956 		 * indicated by CPUID.6.EAX[2].
7957 		 */
7958 		if (cpi->cpi_maxeax >= 6) {
7959 			regs.cp_eax = 6;
7960 			(void) cpuid_insn(NULL, &regs);
7961 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7962 		} else {
7963 			return (0);
7964 		}
7965 	default:
7966 		return (0);
7967 	}
7968 }
7969 
7970 /*
7971  * Check support for Intel ENERGY_PERF_BIAS feature
7972  */
7973 int
cpuid_iepb_supported(struct cpu * cp)7974 cpuid_iepb_supported(struct cpu *cp)
7975 {
7976 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7977 	struct cpuid_regs regs;
7978 
7979 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7980 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7981 
7982 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7983 		return (0);
7984 	}
7985 
7986 	/*
7987 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7988 	 * capability bit CPUID.6.ECX.3
7989 	 */
7990 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7991 		return (0);
7992 
7993 	regs.cp_eax = 0x6;
7994 	(void) cpuid_insn(NULL, &regs);
7995 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7996 }
7997 
7998 /*
7999  * Check support for TSC deadline timer
8000  *
8001  * TSC deadline timer provides a superior software programming
8002  * model over local APIC timer that eliminates "time drifts".
8003  * Instead of specifying a relative time, software specifies an
8004  * absolute time as the target at which the processor should
8005  * generate a timer event.
8006  */
8007 int
cpuid_deadline_tsc_supported(void)8008 cpuid_deadline_tsc_supported(void)
8009 {
8010 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
8011 	struct cpuid_regs regs;
8012 
8013 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8014 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8015 
8016 	switch (cpi->cpi_vendor) {
8017 	case X86_VENDOR_Intel:
8018 		if (cpi->cpi_maxeax >= 1) {
8019 			regs.cp_eax = 1;
8020 			(void) cpuid_insn(NULL, &regs);
8021 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
8022 		} else {
8023 			return (0);
8024 		}
8025 	default:
8026 		return (0);
8027 	}
8028 }
8029 
8030 #if !defined(__xpv)
8031 /*
8032  * Patch in versions of bcopy for high performance Intel Nhm processors
8033  * and later...
8034  */
8035 void
patch_memops(uint_t vendor)8036 patch_memops(uint_t vendor)
8037 {
8038 	size_t cnt, i;
8039 	caddr_t to, from;
8040 
8041 	if ((vendor == X86_VENDOR_Intel) &&
8042 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
8043 		cnt = &bcopy_patch_end - &bcopy_patch_start;
8044 		to = &bcopy_ck_size;
8045 		from = &bcopy_patch_start;
8046 		for (i = 0; i < cnt; i++) {
8047 			*to++ = *from++;
8048 		}
8049 	}
8050 }
8051 #endif  /*  !__xpv */
8052 
8053 /*
8054  * We're being asked to tell the system how many bits are required to represent
8055  * the various thread and strand IDs. While it's tempting to derive this based
8056  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
8057  * correct. Instead, this needs to be based on the number of bits that the APIC
8058  * allows for these different configurations. We only update these to a larger
8059  * value if we find one.
8060  */
8061 void
cpuid_get_ext_topo(cpu_t * cpu,uint_t * core_nbits,uint_t * strand_nbits)8062 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
8063 {
8064 	struct cpuid_info *cpi;
8065 
8066 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8067 	cpi = cpu->cpu_m.mcpu_cpi;
8068 
8069 	if (cpi->cpi_ncore_bits > *core_nbits) {
8070 		*core_nbits = cpi->cpi_ncore_bits;
8071 	}
8072 
8073 	if (cpi->cpi_nthread_bits > *strand_nbits) {
8074 		*strand_nbits = cpi->cpi_nthread_bits;
8075 	}
8076 }
8077 
8078 void
cpuid_pass_ucode(cpu_t * cpu,uchar_t * fset)8079 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
8080 {
8081 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
8082 	struct cpuid_regs cp;
8083 
8084 	/*
8085 	 * Reread the CPUID portions that we need for various security
8086 	 * information.
8087 	 */
8088 	switch (cpi->cpi_vendor) {
8089 	case X86_VENDOR_Intel:
8090 		/*
8091 		 * Check if we now have leaf 7 available to us.
8092 		 */
8093 		if (cpi->cpi_maxeax < 7) {
8094 			bzero(&cp, sizeof (cp));
8095 			cp.cp_eax = 0;
8096 			cpi->cpi_maxeax = __cpuid_insn(&cp);
8097 			if (cpi->cpi_maxeax < 7)
8098 				break;
8099 		}
8100 
8101 		bzero(&cp, sizeof (cp));
8102 		cp.cp_eax = 7;
8103 		cp.cp_ecx = 0;
8104 		(void) __cpuid_insn(&cp);
8105 		cpi->cpi_std[7] = cp;
8106 		break;
8107 
8108 	case X86_VENDOR_AMD:
8109 	case X86_VENDOR_HYGON:
8110 		/* No xcpuid support */
8111 		if (cpi->cpi_family < 5 ||
8112 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
8113 			break;
8114 
8115 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
8116 			bzero(&cp, sizeof (cp));
8117 			cp.cp_eax = CPUID_LEAF_EXT_0;
8118 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
8119 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
8120 				break;
8121 		}
8122 
8123 		/*
8124 		 * Most AMD features are in leaf 8. Automatic IBRS was added in
8125 		 * leaf 0x21. So we also check that.
8126 		 */
8127 		bzero(&cp, sizeof (cp));
8128 		cp.cp_eax = CPUID_LEAF_EXT_8;
8129 		(void) __cpuid_insn(&cp);
8130 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
8131 		cpi->cpi_extd[8] = cp;
8132 
8133 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21)
8134 			break;
8135 
8136 		bzero(&cp, sizeof (cp));
8137 		cp.cp_eax = CPUID_LEAF_EXT_21;
8138 		(void) __cpuid_insn(&cp);
8139 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
8140 		cpi->cpi_extd[0x21] = cp;
8141 		break;
8142 
8143 	default:
8144 		/*
8145 		 * Nothing to do here. Return an empty set which has already
8146 		 * been zeroed for us.
8147 		 */
8148 		return;
8149 	}
8150 
8151 	cpuid_scan_security(cpu, fset);
8152 }
8153 
8154 /* ARGSUSED */
8155 static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0,xc_arg_t arg1,xc_arg_t arg2)8156 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
8157 {
8158 	uchar_t *fset;
8159 	boolean_t first_pass = (boolean_t)arg1;
8160 
8161 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
8162 	if (first_pass && CPU->cpu_id != 0)
8163 		return (0);
8164 	if (!first_pass && CPU->cpu_id == 0)
8165 		return (0);
8166 	cpuid_pass_ucode(CPU, fset);
8167 
8168 	return (0);
8169 }
8170 
8171 /*
8172  * After a microcode update where the version has changed, then we need to
8173  * rescan CPUID. To do this we check every CPU to make sure that they have the
8174  * same microcode. Then we perform a cross call to all such CPUs. It's the
8175  * caller's job to make sure that no one else can end up doing an update while
8176  * this is going on.
8177  *
8178  * We assume that the system is microcode capable if we're called.
8179  */
8180 void
cpuid_post_ucodeadm(void)8181 cpuid_post_ucodeadm(void)
8182 {
8183 	uint32_t rev;
8184 	int i;
8185 	struct cpu *cpu;
8186 	cpuset_t cpuset;
8187 	void *argdata;
8188 	uchar_t *f0;
8189 
8190 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
8191 
8192 	mutex_enter(&cpu_lock);
8193 	cpu = cpu_get(0);
8194 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
8195 	CPUSET_ONLY(cpuset, 0);
8196 	for (i = 1; i < max_ncpus; i++) {
8197 		if ((cpu = cpu_get(i)) == NULL)
8198 			continue;
8199 
8200 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
8201 			panic("post microcode update CPU %d has differing "
8202 			    "microcode revision (%u) from CPU 0 (%u)",
8203 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
8204 		}
8205 		CPUSET_ADD(cpuset, i);
8206 	}
8207 
8208 	/*
8209 	 * We do the cross calls in two passes. The first pass is only for the
8210 	 * boot CPU. The second pass is for all of the other CPUs. This allows
8211 	 * the boot CPU to go through and change behavior related to patching or
8212 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
8213 	 * other CPUs to follow suit.
8214 	 */
8215 	kpreempt_disable();
8216 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
8217 	    cpuid_post_ucodeadm_xc);
8218 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
8219 	    cpuid_post_ucodeadm_xc);
8220 	kpreempt_enable();
8221 
8222 	/*
8223 	 * OK, now look at each CPU and see if their feature sets are equal.
8224 	 */
8225 	f0 = argdata;
8226 	for (i = 1; i < max_ncpus; i++) {
8227 		uchar_t *fset;
8228 		if (!CPU_IN_SET(cpuset, i))
8229 			continue;
8230 
8231 		fset = (uchar_t *)((uintptr_t)argdata +
8232 		    sizeof (x86_featureset) * i);
8233 
8234 		if (!compare_x86_featureset(f0, fset)) {
8235 			panic("Post microcode update CPU %d has "
8236 			    "differing security feature (%p) set from CPU 0 "
8237 			    "(%p), not appending to feature set", i,
8238 			    (void *)fset, (void *)f0);
8239 		}
8240 	}
8241 
8242 	mutex_exit(&cpu_lock);
8243 
8244 	for (i = 0; i < NUM_X86_FEATURES; i++) {
8245 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8246 		    x86_feature_names[i]);
8247 		if (is_x86_feature(f0, i)) {
8248 			add_x86_feature(x86_featureset, i);
8249 		}
8250 	}
8251 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8252 }
8253 
8254 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8255 
8256 typedef struct cpuid_pass_def {
8257 	cpuid_pass_t cpd_pass;
8258 	cpuid_pass_f cpd_func;
8259 } cpuid_pass_def_t;
8260 
8261 /*
8262  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8263  * normal sense and should not appear here.
8264  */
8265 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8266 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8267 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
8268 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8269 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8270 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8271 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8272 };
8273 
8274 void
cpuid_execpass(cpu_t * cp,cpuid_pass_t pass,void * arg)8275 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8276 {
8277 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8278 
8279 	if (cp == NULL)
8280 		cp = CPU;
8281 
8282 	/*
8283 	 * Space statically allocated for BSP, ensure pointer is set
8284 	 */
8285 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8286 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8287 
8288 	ASSERT(cpuid_checkpass(cp, pass - 1));
8289 
8290 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8291 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8292 			cpuid_pass_defs[i].cpd_func(cp, arg);
8293 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8294 			return;
8295 		}
8296 	}
8297 
8298 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8299 	    pass, cp->cpu_id);
8300 }
8301 
8302 /*
8303  * Extract the processor family from a chiprev.  Processor families are not the
8304  * same as cpuid families; see comments above and in x86_archext.h.
8305  */
8306 x86_processor_family_t
chiprev_family(const x86_chiprev_t cr)8307 chiprev_family(const x86_chiprev_t cr)
8308 {
8309 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8310 }
8311 
8312 /*
8313  * A chiprev matches its template if the vendor and family are identical and the
8314  * revision of the chiprev matches one of the bits set in the template.  Callers
8315  * may bitwise-OR together chiprevs of the same vendor and family to form the
8316  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8317  * multiple vendors or processor families with a single call.  Note that this
8318  * function operates on processor families, not cpuid families.
8319  */
8320 boolean_t
chiprev_matches(const x86_chiprev_t cr,const x86_chiprev_t template)8321 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8322 {
8323 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8324 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8325 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8326 }
8327 
8328 /*
8329  * A chiprev is at least min if the vendor and family are identical and the
8330  * revision of the chiprev is at least as recent as that of min.  Processor
8331  * families are considered unordered and cannot be compared using this function.
8332  * Note that this function operates on processor families, not cpuid families.
8333  * Use of the _ANY chiprev variant with this function is not useful; it will
8334  * always return B_FALSE if the _ANY variant is supplied as the minimum
8335  * revision.  To determine only whether a chiprev is of a given processor
8336  * family, test the return value of chiprev_family() instead.
8337  */
8338 boolean_t
chiprev_at_least(const x86_chiprev_t cr,const x86_chiprev_t min)8339 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8340 {
8341 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8342 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8343 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8344 }
8345 
8346 /*
8347  * The uarch functions operate in a manner similar to the chiprev functions
8348  * above.  While it is tempting to allow these to operate on microarchitectures
8349  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8350  * than ZEN2), we elect not to do so because a manufacturer may supply
8351  * processors of multiple different microarchitecture families each of which may
8352  * be internally ordered but unordered with respect to those of other families.
8353  */
8354 x86_uarch_t
uarchrev_uarch(const x86_uarchrev_t ur)8355 uarchrev_uarch(const x86_uarchrev_t ur)
8356 {
8357 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8358 }
8359 
8360 boolean_t
uarchrev_matches(const x86_uarchrev_t ur,const x86_uarchrev_t template)8361 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8362 {
8363 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8364 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8365 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8366 }
8367 
8368 boolean_t
uarchrev_at_least(const x86_uarchrev_t ur,const x86_uarchrev_t min)8369 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8370 {
8371 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8372 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8373 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8374 }
8375 
8376 /*
8377  * Topology cache related information. This is yet another cache interface that
8378  * we're exposing out intended to be used when we have either Intel Leaf 4 or
8379  * AMD Leaf 8x1D (introduced with Zen 1).
8380  */
8381 static boolean_t
cpuid_cache_topo_sup(const struct cpuid_info * cpi)8382 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8383 {
8384 	switch (cpi->cpi_vendor) {
8385 	case X86_VENDOR_Intel:
8386 		if (cpi->cpi_maxeax >= 4) {
8387 			return (B_TRUE);
8388 		}
8389 		break;
8390 	case X86_VENDOR_AMD:
8391 	case X86_VENDOR_HYGON:
8392 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8393 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8394 			return (B_TRUE);
8395 		}
8396 		break;
8397 	default:
8398 		break;
8399 	}
8400 
8401 	return (B_FALSE);
8402 }
8403 
8404 int
cpuid_getncaches(struct cpu * cpu,uint32_t * ncache)8405 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8406 {
8407 	const struct cpuid_info *cpi;
8408 
8409 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8410 	cpi = cpu->cpu_m.mcpu_cpi;
8411 
8412 	if (!cpuid_cache_topo_sup(cpi)) {
8413 		return (ENOTSUP);
8414 	}
8415 
8416 	*ncache = cpi->cpi_cache_leaf_size;
8417 	return (0);
8418 }
8419 
8420 int
cpuid_getcache(struct cpu * cpu,uint32_t cno,x86_cache_t * cache)8421 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8422 {
8423 	const struct cpuid_info *cpi;
8424 	const struct cpuid_regs *cp;
8425 
8426 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8427 	cpi = cpu->cpu_m.mcpu_cpi;
8428 
8429 	if (!cpuid_cache_topo_sup(cpi)) {
8430 		return (ENOTSUP);
8431 	}
8432 
8433 	if (cno >= cpi->cpi_cache_leaf_size) {
8434 		return (EINVAL);
8435 	}
8436 
8437 	bzero(cache, sizeof (x86_cache_t));
8438 	cp = cpi->cpi_cache_leaves[cno];
8439 	switch (CPI_CACHE_TYPE(cp)) {
8440 	case CPI_CACHE_TYPE_DATA:
8441 		cache->xc_type = X86_CACHE_TYPE_DATA;
8442 		break;
8443 	case CPI_CACHE_TYPE_INSTR:
8444 		cache->xc_type = X86_CACHE_TYPE_INST;
8445 		break;
8446 	case CPI_CACHE_TYPE_UNIFIED:
8447 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8448 		break;
8449 	case CPI_CACHE_TYPE_DONE:
8450 	default:
8451 		return (EINVAL);
8452 	}
8453 	cache->xc_level = CPI_CACHE_LVL(cp);
8454 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8455 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8456 	}
8457 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8458 	/*
8459 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8460 	 * associative, where as it is considered valid on Intel.
8461 	 */
8462 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8463 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8464 		cache->xc_nsets = 1;
8465 	} else {
8466 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8467 	}
8468 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8469 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8470 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8471 	    cache->xc_line_size;
8472 	/*
8473 	 * We're looking for the number of bits to cover the number of CPUs that
8474 	 * are being shared. Normally this would be the value - 1, but the CPUID
8475 	 * value is encoded as the actual value minus one, so we don't modify
8476 	 * this at all.
8477 	 */
8478 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8479 
8480 	/*
8481 	 * To construct a unique ID we construct a uint64_t that looks as
8482 	 * follows:
8483 	 *
8484 	 * [47:40] cache level
8485 	 * [39:32] CPUID cache type
8486 	 * [31:00] shifted APIC ID
8487 	 *
8488 	 * The shifted APIC ID gives us a guarantee that a given cache entry is
8489 	 * unique within its peers. The other two numbers give us something that
8490 	 * ensures that something is unique within the CPU. If we just had the
8491 	 * APIC ID shifted over by the indicated number of bits we'd end up with
8492 	 * an ID of zero for the L1I, L1D, L2, and L3.
8493 	 *
8494 	 * The format of this ID is private to the system and can change across
8495 	 * a reboot for the time being.
8496 	 */
8497 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8498 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8499 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8500 
8501 	return (0);
8502 }
8503