1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 * Copyright 2020 Joyent, Inc.
27 * Copyright 2025 Oxide Computer Company
28 * Copyright 2024 MNX Cloud, Inc.
29 */
30 /*
31 * Copyright (c) 2010, Intel Corporation.
32 * All rights reserved.
33 */
34 /*
35 * Portions Copyright 2009 Advanced Micro Devices, Inc.
36 */
37
38 /*
39 * CPU Identification logic
40 *
41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 * with the identification of CPUs, their features, and their topologies. More
43 * specifically, this file helps drive the following:
44 *
45 * 1. Enumeration of features of the processor which are used by the kernel to
46 * determine what features to enable or disable. These may be instruction set
47 * enhancements or features that we use.
48 *
49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 * will be told about through the auxiliary vector.
51 *
 * 3. Understanding the physical topology of the CPU such as the number of
 * caches, how many cores it has, whether or not it supports simultaneous
 * multi-threading (SMT), etc.
55 *
56 * ------------------------
57 * CPUID History and Basics
58 * ------------------------
59 *
60 * The cpuid instruction was added by Intel roughly around the time that the
61 * original Pentium was introduced. The purpose of cpuid was to tell in a
62 * programmatic fashion information about the CPU that previously was guessed
63 * at. For example, an important part of cpuid is that we can know what
64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 * #UD, so this method allows a program (whether a user program or the kernel)
66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 * name shows up first in cpuid for a reason.
69 *
 * cpuid information is broken down into units called 'leaves'. Each leaf puts
71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 * its own meaning. The different leaves are broken down into different regions:
73 *
74 * [ 0, 7fffffff ] This region is called the 'basic'
75 * region. This region is generally defined
76 * by Intel, though some of the original
77 * portions have different meanings based
78 * on the manufacturer. These days, Intel
79 * adds most new features to this region.
80 * AMD adds non-Intel compatible
81 * information in the third, extended
82 * region. Intel uses this for everything
83 * including ISA extensions, CPU
84 * features, cache information, topology,
85 * and more.
86 *
87 * There is a hole carved out of this
88 * region which is reserved for
89 * hypervisors.
90 *
91 * [ 40000000, 4fffffff ] This region, which is found in the
92 * middle of the previous region, is
93 * explicitly promised to never be used by
94 * CPUs. Instead, it is used by hypervisors
95 * to communicate information about
96 * themselves to the operating system. The
97 * values and details are unique for each
98 * hypervisor.
99 *
100 * [ 80000000, ffffffff ] This region is called the 'extended'
101 * region. Some of the low leaves mirror
102 * parts of the basic leaves. This region
103 * has generally been used by AMD for
104 * various extensions. For example, AMD-
105 * specific information about caches,
106 * features, and topology are found in this
107 * region.
108 *
109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111 * the ranges, one of the primary things returned is the maximum valid leaf in
112 * that range. This allows for discovery of what range of CPUID is valid.
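 *
 * As a brief, illustrative sketch of the calling pattern (and not the exact
 * code used later in this file), a leaf can be queried in the kernel with the
 * struct cpuid_regs and __cpuid_insn() interfaces that appear throughout this
 * file. After the first call, cp.cp_eax holds the maximum valid basic leaf and
 * the other cp_* members hold the vendor string described below:
 *
 *	struct cpuid_regs cp = { 0 };
 *
 *	cp.cp_eax = 0;
 *	(void) __cpuid_insn(&cp);
 *
 *	if (cp.cp_eax >= 1) {
 *		cp.cp_eax = 1;
 *		cp.cp_ecx = 0;
 *		(void) __cpuid_insn(&cp);
 *	}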
113 *
114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 * an invalid extended leaf will return the information for leaf 3.
121 *
122 * Some leaves are broken down into sub-leaves. This means that the value
123 * depends on both the leaf asked for in %eax and a secondary register. For
124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 * additional information. Or when getting topology information in leaf 0xb, the
126 * initial value in %ecx changes which level of the topology that you are
127 * getting information about.
128 *
129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
131 * 32 bits of the register are always set to zero so that way the values are the
132 * same regardless of execution mode.
133 *
134 * ----------------------
135 * Identifying Processors
136 * ----------------------
137 *
138 * We can identify a processor in two steps. The first step looks at cpuid leaf
139 * 0. Leaf 0 contains the processor's vendor information. This is done by
140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
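 *
 * Note that the bytes of the string are laid out in the register order %ebx,
 * %edx, %ecx. Reusing the struct cpuid_regs pattern shown earlier, a sketch of
 * assembling the string (the variable names here are illustrative only) looks
 * roughly like:
 *
 *	char vendor[13];
 *
 *	cp.cp_eax = 0;
 *	(void) __cpuid_insn(&cp);
 *	bcopy(&cp.cp_ebx, &vendor[0], 4);
 *	bcopy(&cp.cp_edx, &vendor[4], 4);
 *	bcopy(&cp.cp_ecx, &vendor[8], 4);
 *	vendor[12] = '\0';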
142 *
143 * From there, a processor is identified by a combination of three different
144 * values:
145 *
146 * 1. Family
147 * 2. Model
148 * 3. Stepping
149 *
150 * Each vendor uses the family and model to uniquely identify a processor. The
151 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 * identify the exact processor. Different models are often used for the client
155 * (consumer) and server parts. Even though each processor often has major
156 * architectural differences, they still are considered the same family by
157 * Intel.
158 *
159 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
 * family, the model number helps identify specific processors. As AMD's
162 * product lines have expanded, they have started putting a mixed bag of
163 * processors into the same family, with each processor under a single
164 * identifying banner (e.g., Milan, Cezanne) using a range of model numbers. We
165 * refer to each such collection as a processor family, distinct from cpuid
166 * family. Importantly, each processor family has a BIOS and Kernel Developer's
167 * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168 * defines the processor family's non-architectural features. In general, we'll
169 * use "family" here to mean the family number reported by the cpuid instruction
170 * and distinguish the processor family from it where appropriate.
171 *
172 * The stepping is used to refer to a revision of a specific microprocessor. The
173 * term comes from equipment used to produce masks that are used to create
174 * integrated circuits.
175 *
176 * The information is present in leaf 1, %eax. In technical documentation you
177 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the base family field
 * is 0xf, the extended family field (which occupies previously reserved bits)
 * is added to it; similarly, when the base family calls for it (0xf on AMD,
 * 0x6 or 0xf on Intel), the extended model bits are prepended to the base
 * model, allowing for a larger number of models.
182 *
183 * When we process this information, we store the full family, model, and
184 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185 * cpi_step, respectively. Whenever you are performing comparisons with the
186 * family, model, and stepping, you should use these members and not the raw
187 * values from cpuid. If you must use the raw values from cpuid directly, you
188 * must make sure that you add the extended model and family to the base model
189 * and family.
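 *
 * To make the encoding concrete, here is a rough sketch of how the full values
 * can be derived from leaf 1 %eax (the cp.cp_eax value from the earlier
 * sketch); the variable names, including the hypothetical is_intel flag, are
 * illustrative and this is not the exact code used below:
 *
 *	uint_t step = cp.cp_eax & 0xf;
 *	uint_t model = (cp.cp_eax >> 4) & 0xf;
 *	uint_t family = (cp.cp_eax >> 8) & 0xf;
 *	uint_t xmodel = (cp.cp_eax >> 16) & 0xf;
 *	uint_t xfamily = (cp.cp_eax >> 20) & 0xff;
 *	uint_t full_family = family, full_model = model;
 *
 *	if (family == 0xf)
 *		full_family += xfamily;
 *	if (family == 0xf || (is_intel && family == 6))
 *		full_model += xmodel << 4;
 *
 * The stepping needs no such extension and is used as-is.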
190 *
191 * In general, we do not use information about the family, model, and stepping
192 * to determine whether or not a feature is present; that is generally driven by
193 * specific leaves. However, when something we care about on the processor is
194 * not considered 'architectural' meaning that it is specific to a set of
195 * processors and not promised in the architecture model to be consistent from
196 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, are dealing with processor-specific features such as CPU
199 * performance counters, or we want to provide additional information for things
200 * such as fault management.
201 *
202 * While processors also do have a brand string, which is the name that people
203 * are familiar with when buying the processor, they are not meant for
204 * programmatic consumption. That is what the family, model, and stepping are
205 * for.
206 *
207 * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208 * and stepping(s) that refer to a single or very closely related set of silicon
209 * implementations; while there are sometimes more specific ways to learn of the
210 * presence or absence of a particular erratum or workaround, one may generally
211 * assume that all processors of the same chiprev have the same errata and we
212 * have chosen to represent them this way precisely because that is how AMD
213 * groups them in their revision guides (errata documentation). The processor
214 * family (x86_processor_family_t) may be extracted from the chiprev if that
215 * level of detail is not needed. Processor families are considered unordered
216 * but revisions within a family may be compared for either an exact match or at
217 * least as recent as a reference revision. See the chiprev_xxx() functions
218 * below.
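 *
 * As a hypothetical illustration of that comparison style, assuming the
 * cpuid_getchiprev() accessor and the chiprev_at_least() comparator from the
 * chiprev_xxx() family referenced above (the chiprev constant and workaround
 * function named below are stand-ins rather than real definitions; the actual
 * prototypes live in x86_archext.h), a consumer might do something like:
 *
 *	x86_chiprev_t rev = cpuid_getchiprev(CPU);
 *
 *	if (!chiprev_at_least(rev, X86_CHIPREV_AMD_EXAMPLE_B1)) {
 *		apply_example_workaround();
 *	}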
219 *
220 * Similarly, each processor family implements a particular microarchitecture,
221 * which itself may have multiple revisions. In general, non-architectural
222 * features are specific to a processor family, but some may exist across
223 * families containing cores that implement the same microarchitectural revision
224 * (and, such cores share common bugs, too). We provide utility routines
225 * analogous to those for extracting and comparing chiprevs for
226 * microarchitectures as well; see the uarch_xxx() functions.
227 *
228 * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229 * present used and available only for AMD and AMD-like processors.
230 *
231 * ------------
232 * CPUID Passes
233 * ------------
234 *
235 * As part of performing feature detection, we break this into several different
236 * passes. There used to be a pass 0 that was done from assembly in locore.s to
237 * support processors that have a missing or broken cpuid instruction (notably
238 * certain Cyrix processors) but those were all 32-bit processors which are no
239 * longer supported. Passes are no longer numbered explicitly to make it easier
240 * to break them up or move them around as needed; however, they still have a
241 * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242 * x86_archext.h. The external interface to execute a cpuid pass or determine
243 * whether a pass has been completed consists of cpuid_execpass() and
244 * cpuid_checkpass() respectively. The passes now, in that execution order,
245 * are as follows:
246 *
247 * PRELUDE This pass does not have any dependencies on system
248 * setup; in particular, unlike all subsequent passes it is
249 * guaranteed not to require PCI config space access. It
250 * sets the flag indicating that the processor we are
251 * running on supports the cpuid instruction, which all
252 * 64-bit processors do. This would also be the place to
253 * add any other basic state that is required later on and
254 * can be learned without dependencies.
255 *
256 * IDENT Determine which vendor manufactured the CPU, the family,
257 * model, and stepping information, and compute basic
258 * identifying tags from those values. This is done first
259 * so that machine-dependent code can control the features
260 * the cpuid instruction will report during subsequent
261 * passes if needed, and so that any intervening
262 * machine-dependent code that needs basic identity will
263 * have it available. This includes synthesised
264 * identifiers such as chiprev and uarchrev as well as the
265 * values obtained directly from cpuid. Prior to executing
 * this pass, machine-dependent boot code is responsible for
267 * ensuring that the PCI configuration space access
268 * functions have been set up and, if necessary, that
269 * determine_platform() has been called.
270 *
271 * BASIC This is the primary pass and is responsible for doing a
272 * large number of different things:
273 *
274 * 1. Gathering a large number of feature flags to
 * determine which features the CPU supports and which
276 * indicate things that we need to do other work in the OS
277 * to enable. Features detected this way are added to the
278 * x86_featureset which can be queried to
279 * determine what we should do. This includes processing
280 * all of the basic and extended CPU features that we care
281 * about.
282 *
283 * 2. Determining the CPU's topology. This includes
284 * information about how many cores and threads are present
285 * in the package. It also is responsible for figuring out
286 * which logical CPUs are potentially part of the same core
287 * and what other resources they might share. For more
288 * information see the 'Topology' section.
289 *
290 * 3. Determining the set of CPU security-specific features
291 * that we need to worry about and determine the
292 * appropriate set of workarounds.
293 *
 * The BASIC pass on the boot CPU occurs before KMDB is started.
295 *
 * EXTENDED This pass is done after startup(). Here, we check
297 * other miscellaneous features. Most of this is gathering
298 * additional basic and extended features that we'll use in
299 * later passes or for debugging support.
300 *
 * DYNAMIC This pass occurs after the kernel memory allocator
302 * has been fully initialized. This gathers information
303 * where we might need dynamic memory available for our
304 * uses. This includes several varying width leaves that
305 * have cache information and the processor's brand string.
306 *
 * RESOLVE The final normal pass is performed after the
308 * kernel has brought most everything online. This is
309 * invoked from post_startup(). In this pass, we go through
310 * the set of features that we have enabled and turn that
311 * into the hardware auxiliary vector features that
312 * userland receives. This is used by userland, primarily
313 * by the run-time link-editor (RTLD), though userland
314 * software could also refer to it directly.
315 *
316 * The function that performs a pass is currently assumed to be infallible, and
 * all existing implementations are. This simplifies callers by allowing
318 * cpuid_execpass() to return void. Similarly, implementers do not need to check
319 * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320 * Both of these assumptions can be relaxed if needed by future developments.
321 * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322 * error to attempt to execute a pass before all previous passes have been
323 * completed on the specified CPU, or to request cpuid information before the
324 * pass that captures it has been executed. These conditions can be tested
325 * using cpuid_checkpass().
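 *
 * As a sketch of how callers interact with the pass machinery (the exact
 * prototypes and the cpuid_pass_t constants live in x86_archext.h, so treat
 * this as illustrative rather than the precise interface):
 *
 *	cpuid_execpass(cpu, CPUID_PASS_BASIC, NULL);
 *
 *	...
 *
 *	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));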
326 *
327 * ---------
328 * Microcode
329 * ---------
330 *
331 * Microcode updates may be applied by the firmware (BIOS/UEFI) and/or by the
332 * operating system and may result in architecturally visible changes (e.g.,
333 * changed MSR or CPUID bits). As such, we want to apply any updates as early
334 * as possible during the boot process -- right after the IDENT pass.
335 *
336 * Microcode may also be updated at runtime via ucodeadm(8), after which we do
337 * a selective rescan of the cpuid leaves to determine what features have
338 * changed. Microcode updates can provide more details about security related
339 * features to deal with issues like Spectre and L1TF. On occasion, vendors have
340 * violated their contract and removed bits. However, we don't try to detect
341 * that because that puts us in a situation that we really can't deal with. As
 * such, the only things we rescan today are security-related features. See
343 * cpuid_pass_ucode(). This is not a pass in the same sense as the others and
344 * is run on demand, via cpuid_post_ucodeadm().
345 *
346 *
347 * All of the passes are run on all CPUs. However, for the most part we only
348 * care about what the boot CPU says about this information and use the other
349 * CPUs as a rough guide to sanity check that we have the same feature set.
350 *
 * We do not support running multiple logical CPUs with different, let alone
 * disjoint, feature sets.
353 *
354 * ------------------
355 * Processor Topology
356 * ------------------
357 *
358 * One of the important things that we need to do is to understand the topology
359 * of the underlying processor. When we say topology in this case, we're trying
360 * to understand the relationship between the logical CPUs that the operating
361 * system sees and the underlying physical layout. Different logical CPUs may
362 * share different resources which can have important consequences for the
363 * performance of the system. For example, they may share caches, execution
364 * units, and more.
365 *
366 * The topology of the processor changes from generation to generation and
367 * vendor to vendor. Along with that, different vendors use different
368 * terminology, and the operating system itself uses occasionally overlapping
369 * terminology. It's important to understand what this topology looks like so
370 * one can understand the different things that we try to calculate and
371 * determine.
372 *
373 * To get started, let's talk about a little bit of terminology that we've used
374 * so far, is used throughout this file, and is fairly generic across multiple
375 * vendors:
376 *
377 * CPU
378 * A central processing unit (CPU) refers to a logical and/or virtual
379 * entity that the operating system can execute instructions on. The
380 * underlying resources for this CPU may be shared between multiple
381 * entities; however, to the operating system it is a discrete unit.
382 *
383 * PROCESSOR and PACKAGE
384 *
385 * Generally, when we use the term 'processor' on its own, we are referring
386 * to the physical entity that one buys and plugs into a board. However,
387 * because processor has been overloaded and one might see it used to mean
388 * multiple different levels, we will instead use the term 'package' for
389 * the rest of this file. The term package comes from the electrical
390 * engineering side and refers to the physical entity that encloses the
391 * electronics inside. Strictly speaking the package can contain more than
392 * just the CPU, for example, on many processors it may also have what's
393 * called an 'integrated graphical processing unit (GPU)'. Because the
394 * package can encapsulate multiple units, it is the largest physical unit
395 * that we refer to.
396 *
397 * SOCKET
398 *
 * A socket refers to a unit on a system board (generally the motherboard)
400 * that can receive a package. A single package, or processor, is plugged
401 * into a single socket. A system may have multiple sockets. Often times,
402 * the term socket is used interchangeably with package and refers to the
 * electrical component that has been plugged in, and not the receptacle itself.
404 *
405 * CORE
406 *
407 * A core refers to the physical instantiation of a CPU, generally, with a
408 * full set of hardware resources available to it. A package may contain
409 * multiple cores inside of it or it may just have a single one. A
410 * processor with more than one core is often referred to as 'multi-core'.
411 * In illumos, we will use the feature X86FSET_CMP to refer to a system
412 * that has 'multi-core' processors.
413 *
414 * A core may expose a single logical CPU to the operating system, or it
415 * may expose multiple CPUs, which we call threads, defined below.
416 *
417 * Some resources may still be shared by cores in the same package. For
418 * example, many processors will share the level 3 cache between cores.
419 * Some AMD generations share hardware resources between cores. For more
420 * information on that see the section 'AMD Topology'.
421 *
422 * THREAD and STRAND
423 *
 * In this file, generally a thread refers to a hardware resource and not
425 * the operating system's logical abstraction. A thread is always exposed
426 * as an independent logical CPU to the operating system. A thread belongs
427 * to a specific core. A core may have more than one thread. When that is
428 * the case, the threads that are part of the same core are often referred
429 * to as 'siblings'.
430 *
431 * When multiple threads exist, this is generally referred to as
432 * simultaneous multi-threading (SMT). When Intel introduced this in their
433 * processors they called it hyper-threading (HT). When multiple threads
434 * are active in a core, they split the resources of the core. For example,
435 * two threads may share the same set of hardware execution units.
436 *
437 * The operating system often uses the term 'strand' to refer to a thread.
438 * This helps disambiguate it from the software concept.
439 *
440 * CHIP
441 *
442 * Unfortunately, the term 'chip' is dramatically overloaded. At its most
443 * base meaning, it is used to refer to a single integrated circuit, which
444 * may or may not be the only thing in the package. In illumos, when you
445 * see the term 'chip' it is almost always referring to the same thing as
446 * the 'package'. However, many vendors may use chip to refer to one of
447 * many integrated circuits that have been placed in the package. As an
448 * example, see the subsequent definition.
449 *
450 * To try and keep things consistent, we will only use chip when referring
451 * to the entire integrated circuit package, with the exception of the
452 * definition of multi-chip module (because it is in the name) and use the
453 * term 'die' when we want the more general, potential sub-component
454 * definition.
455 *
456 * DIE
457 *
458 * A die refers to an integrated circuit. Inside of the package there may
459 * be a single die or multiple dies. This is sometimes called a 'chip' in
 * vendors' parlance, but in this file, we use the term die to refer to a
461 * subcomponent.
462 *
463 * MULTI-CHIP MODULE
464 *
465 * A multi-chip module (MCM) refers to putting multiple distinct chips that
466 * are connected together in the same package. When a multi-chip design is
467 * used, generally each chip is manufactured independently and then joined
468 * together in the package. For example, on AMD's Zen microarchitecture
469 * (family 0x17), the package contains several dies (the second meaning of
470 * chip from above) that are connected together.
471 *
472 * CACHE
473 *
474 * A cache is a part of the processor that maintains copies of recently
475 * accessed memory. Caches are split into levels and then into types.
476 * Commonly there are one to three levels, called level one, two, and
477 * three. The lower the level, the smaller it is, the closer it is to the
478 * execution units of the CPU, and the faster it is to access. The layout
479 * and design of the cache come in many different flavors, consult other
480 * resources for a discussion of those.
481 *
482 * Caches are generally split into two types, the instruction and data
483 * cache. The caches contain what their names suggest, the instruction
484 * cache has executable program text, while the data cache has all other
485 * memory that the processor accesses. As of this writing, data is kept
486 * coherent between all of the caches on x86, so if one modifies program
487 * text before it is executed, that will be in the data cache, and the
488 * instruction cache will be synchronized with that change when the
489 * processor actually executes those instructions. This coherency also
490 * covers the fact that data could show up in multiple caches.
491 *
492 * Generally, the lowest level caches are specific to a core. However, the
 * last level cache is shared between some number of cores. The number of
494 * CPUs sharing this last level cache is important. This has implications
495 * for the choices that the scheduler makes, as accessing memory that might
496 * be in a remote cache after thread migration can be quite expensive.
497 *
498 * Sometimes, the word cache is abbreviated with a '$', because in US
499 * English the word cache is pronounced the same as cash. So L1D$ refers to
500 * the L1 data cache, and L2$ would be the L2 cache. This will not be used
501 * in the rest of this theory statement for clarity.
502 *
503 * MEMORY CONTROLLER
504 *
505 * The memory controller is a component that provides access to DRAM. Each
506 * memory controller can access a set number of DRAM channels. Each channel
507 * can have a number of DIMMs (sticks of memory) associated with it. A
508 * given package may have more than one memory controller. The association
509 * of the memory controller to a group of cores is important as it is
510 * cheaper to access memory on the controller that you are associated with.
511 *
512 * NUMA
513 *
514 * NUMA or non-uniform memory access, describes a way that systems are
515 * built. On x86, any processor core can address all of the memory in the
 * system. However, when using multiple sockets or possibly within a
517 * multi-chip module, some of that memory is physically closer and some of
518 * it is further. Memory that is further away is more expensive to access.
519 * Consider the following image of multiple sockets with memory:
520 *
521 * +--------+ +--------+
522 * | DIMM A | +----------+ +----------+ | DIMM D |
523 * +--------+-+ | | | | +-+------+-+
524 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
525 * +--------+-+ | | | | +-+------+-+
526 * | DIMM C | +----------+ +----------+ | DIMM F |
527 * +--------+ +--------+
528 *
529 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
530 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
531 * access DIMMs A-C and more expensive to access D-F as it has to go
532 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
533 * D-F are cheaper than A-C. While the socket form is the most common, when
534 * using multi-chip modules, this can also sometimes occur. For another
535 * example of this that's more involved, see the AMD topology section.
536 *
537 *
538 * Intel Topology
539 * --------------
540 *
 * Most Intel processors since Nehalem (as of this writing the current gen
542 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
543 * the package is a single monolithic die. MCMs currently aren't used. Most
544 * parts have three levels of caches, with the L3 cache being shared between
545 * all of the cores on the package. The L1/L2 cache is generally specific to
546 * an individual core. The following image shows at a simplified level what
547 * this looks like. The memory controller is commonly part of something called
 * the 'Uncore', which used to consist of separate physical chips outside the
 * package, but is now part of the same chip.
550 *
551 * +-----------------------------------------------------------------------+
552 * | Package |
553 * | +-------------------+ +-------------------+ +-------------------+ |
554 * | | Core | | Core | | Core | |
555 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
556 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
557 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
558 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
559 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
560 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
561 * | | +--------------+ | | +--------------+ | | +--------------+ | |
562 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
563 * | | +--------------+ | | +--------------+ | | +--------------+ | |
564 * | +-------------------+ +-------------------+ +-------------------+ |
565 * | +-------------------------------------------------------------------+ |
566 * | | Shared L3 Cache | |
567 * | +-------------------------------------------------------------------+ |
568 * | +-------------------------------------------------------------------+ |
569 * | | Memory Controller | |
570 * | +-------------------------------------------------------------------+ |
571 * +-----------------------------------------------------------------------+
572 *
573 * A side effect of this current architecture is that what we care about from a
574 * scheduling and topology perspective, is simplified. In general we care about
575 * understanding which logical CPUs are part of the same core and socket.
576 *
577 * To determine the relationship between threads and cores, Intel initially used
578 * the identifier in the advanced programmable interrupt controller (APIC). They
579 * also added cpuid leaf 4 to give additional information about the number of
580 * threads and CPUs in the processor. With the addition of x2apic (which
 * increased the number of addressable logical CPUs from 8 bits to 32 bits), an
582 * additional cpuid topology leaf 0xB was added.
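 *
 * As a rough sketch of how leaf 0xB is consumed (this is not the exact code
 * used below), each sub-leaf selected by %ecx describes one topology level:
 * %eax[4:0] is the number of APIC ID bits to shift out to reach the next
 * level up and %ecx[15:8] is the level type, where 1 indicates the SMT level
 * and 2 the core level; a type of 0 terminates the walk:
 *
 *	struct cpuid_regs cp = { 0 };
 *	uint_t thread_shift = 0, core_shift = 0, i, type;
 *
 *	for (i = 0; ; i++) {
 *		cp.cp_eax = 0xb;
 *		cp.cp_ecx = i;
 *		(void) __cpuid_insn(&cp);
 *		type = (cp.cp_ecx >> 8) & 0xff;
 *		if (type == 0)
 *			break;
 *		if (type == 1)
 *			thread_shift = cp.cp_eax & 0x1f;
 *		else if (type == 2)
 *			core_shift = cp.cp_eax & 0x1f;
 *	}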
583 *
584 * AMD Topology
585 * ------------
586 *
587 * When discussing AMD topology, we want to break this into three distinct
588 * generations of topology. There's the basic topology that has been used in
589 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
590 * with family 0x15 (Bulldozer), and there's the topology that was introduced
591 * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
 * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
593 * additional terminology that's worth talking about.
594 *
595 * Until the introduction of family 0x17 (Zen), AMD did not implement something
596 * that they considered SMT. Whether or not the AMD processors have SMT
597 * influences many things including scheduling and reliability, availability,
598 * and serviceability (RAS) features.
599 *
600 * NODE
601 *
602 * AMD uses the term node to refer to a die that contains a number of cores
603 * and I/O resources. Depending on the processor family and model, more
604 * than one node can be present in the package. When there is more than one
605 * node this indicates a multi-chip module. Usually each node has its own
606 * access to memory and I/O devices. This is important and generally
607 * different from the corresponding Intel Nehalem-Skylake+ processors. As a
608 * result, we track this relationship in the operating system.
609 *
610 * In processors with an L3 cache, the L3 cache is generally shared across
611 * the entire node, though the way this is carved up varies from generation
612 * to generation.
613 *
614 * BULLDOZER
615 *
616 * Starting with the Bulldozer family (0x15) and continuing until the
617 * introduction of the Zen microarchitecture, AMD introduced the idea of a
618 * compute unit. In a compute unit, two traditional cores share a number of
619 * hardware resources. Critically, they share the FPU, L1 instruction
620 * cache, and the L2 cache. Several compute units were then combined inside
621 * of a single node. Because the integer execution units, L1 data cache,
622 * and some other resources were not shared between the cores, AMD never
623 * considered this to be SMT.
624 *
625 * ZEN
626 *
 * The Zen family (0x17) uses a multi-chip module (MCM) design; the module
 * is called Zeppelin. These modules are similar to the idea of nodes used
629 * previously. Each of these nodes has two DRAM channels which all of the
630 * cores in the node can access uniformly. These nodes are linked together
631 * in the package, creating a NUMA environment.
632 *
633 * The Zeppelin die itself contains two different 'core complexes'. Each
634 * core complex consists of four cores which each have two threads, for a
635 * total of 8 logical CPUs per complex. Unlike other generations,
636 * where all the logical CPUs in a given node share the L3 cache, here each
637 * core complex has its own shared L3 cache.
638 *
639 * A further thing that we need to consider is that in some configurations,
640 * particularly with the Threadripper line of processors, not every die
641 * actually has its memory controllers wired up to actual memory channels.
642 * This means that some cores have memory attached to them and others
643 * don't.
644 *
645 * To put Zen in perspective, consider the following images:
646 *
647 * +--------------------------------------------------------+
648 * | Core Complex |
649 * | +-------------------+ +-------------------+ +---+ |
650 * | | Core +----+ | | Core +----+ | | | |
651 * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
652 * | | | Thread | +----+ | | | Thread | +----+ | | | |
653 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
654 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
655 * | | +--------+ +--+ | | +--------+ +--+ | | | |
656 * | +-------------------+ +-------------------+ | C | |
657 * | +-------------------+ +-------------------+ | a | |
658 * | | Core +----+ | | Core +----+ | | c | |
659 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
660 * | | | Thread | +----+ | | | Thread | +----+ | | e | |
661 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
662 * | | | Thread | |L1| | | | Thread | |L1| | | | |
663 * | | +--------+ +--+ | | +--------+ +--+ | | | |
664 * | +-------------------+ +-------------------+ +---+ |
665 * | |
666 * +--------------------------------------------------------+
667 *
668 * This first image represents a single Zen core complex that consists of four
669 * cores.
670 *
671 *
672 * +--------------------------------------------------------+
673 * | Zeppelin Die |
674 * | +--------------------------------------------------+ |
675 * | | I/O Units (PCIe, SATA, USB, etc.) | |
676 * | +--------------------------------------------------+ |
677 * | HH |
678 * | +-----------+ HH +-----------+ |
679 * | | | HH | | |
680 * | | Core |==========| Core | |
681 * | | Complex |==========| Complex | |
682 * | | | HH | | |
683 * | +-----------+ HH +-----------+ |
684 * | HH |
685 * | +--------------------------------------------------+ |
686 * | | Memory Controller | |
687 * | +--------------------------------------------------+ |
688 * | |
689 * +--------------------------------------------------------+
690 *
691 * This image represents a single Zeppelin Die. Note how both cores are
692 * connected to the same memory controller and I/O units. While each core
693 * complex has its own L3 cache as seen in the first image, they both have
694 * uniform access to memory.
695 *
696 *
697 * PP PP
698 * PP PP
699 * +----------PP---------------------PP---------+
700 * | PP PP |
701 * | +-----------+ +-----------+ |
702 * | | | | | |
703 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
704 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
705 * | | | | | |
706 * | +-----------+ooo ...+-----------+ |
707 * | HH ooo ... HH |
708 * | HH oo.. HH |
709 * | HH ..oo HH |
710 * | HH ... ooo HH |
711 * | +-----------+... ooo+-----------+ |
712 * | | | | | |
713 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
714 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
715 * | | | | | |
716 * | +-----------+ +-----------+ |
717 * | PP PP |
718 * +----------PP---------------------PP---------+
719 * PP PP
720 * PP PP
721 *
722 * This image represents a single Zen package. In this example, it has four
723 * Zeppelin dies, though some configurations only have a single one. In this
724 * example, each die is directly connected to the next. Also, each die is
725 * represented as being connected to memory by the 'M' character and connected
726 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
727 * die is made up of two core complexes, we have multiple different NUMA
728 * domains that we care about for these systems.
729 *
730 * ZEN 2
731 *
732 * Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
 * each Zeppelin die contained its own I/O and memory controller logic, that
 * logic has been moved out into a separate I/O die in Zen 2. The actual core
 * complex looks pretty similar, but
735 * now the die actually looks much simpler:
736 *
737 * +--------------------------------------------------------+
738 * | Zen 2 Core Complex Die HH |
739 * | HH |
740 * | +-----------+ HH +-----------+ |
741 * | | | HH | | |
742 * | | Core |==========| Core | |
743 * | | Complex |==========| Complex | |
744 * | | | HH | | |
745 * | +-----------+ HH +-----------+ |
746 * | HH |
747 * | HH |
748 * +--------------------------------------------------------+
749 *
750 * From here, when we add the central I/O die, this changes things a bit.
751 * Each die is connected to the I/O die, rather than trying to interconnect
752 * them directly. The following image takes the same Zen 1 image that we
753 * had earlier and shows what it looks like with the I/O die instead:
754 *
755 * PP PP
756 * PP PP
757 * +---------------------PP----PP---------------------+
758 * | PP PP |
759 * | +-----------+ PP PP +-----------+ |
760 * | | | PP PP | | |
761 * | | Zen 2 | +-PP----PP-+ | Zen 2 | |
762 * | | Die _| | PP PP | |_ Die | |
763 * | | |o|oooo| |oooo|o| | |
764 * | +-----------+ | | +-----------+ |
765 * | | I/O | |
766 * MMMMMMMMMMMMMMMMMMMMMMMMMM Die MMMMMMMMMMMMMMMMMMMMMMMMMM
767 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
768 * | | | |
769 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
770 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM
771 * | | | |
772 * | +-----------+ | | +-----------+ |
773 * | | |o|oooo| PP PP |oooo|o| | |
774 * | | Zen 2 -| +-PP----PP-+ |- Zen 2 | |
775 * | | Die | PP PP | Die | |
776 * | | | PP PP | | |
777 * | +-----------+ PP PP +-----------+ |
778 * | PP PP |
779 * +---------------------PP----PP---------------------+
780 * PP PP
781 * PP PP
782 *
783 * The above has four core complex dies installed, though the Zen 2 EPYC
784 * and ThreadRipper parts allow for up to eight, while the Ryzen parts
785 * generally only have one to two. The more notable difference here is how
786 * everything communicates. Note that memory and PCIe come out of the
787 * central die. This changes the way that one die accesses a resource. It
 * basically always has to go to the I/O die, whereas in Zen 1 it may have
789 * satisfied it locally. In general, this ends up being a better strategy
790 * for most things, though it is possible to still treat everything in four
791 * distinct NUMA domains with each Zen 2 die slightly closer to some memory
792 * and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
793 * now there is only one 'node' present.
794 *
795 * ZEN 3
796 *
797 * From an architectural perspective, Zen 3 is a much smaller change from
798 * Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
799 * its microarchitectural changes. The biggest thing for us is how the die
800 * changes. In Zen 1 and Zen 2, each core complex still had its own L3
801 * cache. However, in Zen 3, the L3 is now shared between the entire core
802 * complex die and is no longer partitioned between each core complex. This
803 * means that all cores on the die can share the same L3 cache. Otherwise,
804 * the general layout of the overall package with various core complexes
805 * and an I/O die stays the same. Here's what the Core Complex Die looks
806 * like in a bit more detail:
807 *
808 * +-------------------------------------------------+
809 * | Zen 3 Core Complex Die |
810 * | +-------------------+ +-------------------+ |
811 * | | Core +----+ | | Core +----+ | |
812 * | | +--------+ | L2 | | | +--------+ | L2 | | |
813 * | | | Thread | +----+ | | | Thread | +----+ | |
814 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
815 * | | | Thread | |L1| | | | Thread | |L1| | |
816 * | | +--------+ +--+ | | +--------+ +--+ | |
817 * | +-------------------+ +-------------------+ |
818 * | +-------------------+ +-------------------+ |
819 * | | Core +----+ | | Core +----+ | |
820 * | | +--------+ | L2 | | | +--------+ | L2 | | |
821 * | | | Thread | +----+ | | | Thread | +----+ | |
822 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
823 * | | | Thread | |L1| | | | Thread | |L1| | |
824 * | | +--------+ +--+ | | +--------+ +--+ | |
825 * | +-------------------+ +-------------------+ |
826 * | |
827 * | +--------------------------------------------+ |
828 * | | L3 Cache | |
829 * | +--------------------------------------------+ |
830 * | |
831 * | +-------------------+ +-------------------+ |
832 * | | Core +----+ | | Core +----+ | |
833 * | | +--------+ | L2 | | | +--------+ | L2 | | |
834 * | | | Thread | +----+ | | | Thread | +----+ | |
835 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
836 * | | | Thread | |L1| | | | Thread | |L1| | |
837 * | | +--------+ +--+ | | +--------+ +--+ | |
838 * | +-------------------+ +-------------------+ |
839 * | +-------------------+ +-------------------+ |
840 * | | Core +----+ | | Core +----+ | |
841 * | | +--------+ | L2 | | | +--------+ | L2 | | |
842 * | | | Thread | +----+ | | | Thread | +----+ | |
843 * | | +--------+-+ +--+ | | +--------+-+ +--+ | |
844 * | | | Thread | |L1| | | | Thread | |L1| | |
845 * | | +--------+ +--+ | | +--------+ +--+ | |
846 * | +-------------------+ +-------------------+ |
847 * +-------------------------------------------------+
848 *
849 * While it is not pictured, there are connections from the die to the
850 * broader data fabric and additional functional blocks to support that
851 * communication and coherency.
852 *
853 * CPUID LEAVES
854 *
855 * There are a few different CPUID leaves that we can use to try and understand
856 * the actual state of the world. As part of the introduction of family 0xf, AMD
857 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
858 * processors that are in the system. Because families before Zen didn't have
859 * SMT, this was always the number of cores that were in the system. However, it
860 * should always be thought of as the number of logical threads to be consistent
861 * between generations. In addition we also get the size of the APIC ID that is
862 * used to represent the number of logical processors. This is important for
863 * deriving topology information.
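 *
 * Concretely, and only as a sketch reusing the struct cpuid_regs pattern from
 * earlier, the interesting fields are in %ecx of leaf 0x80000008: bits 7:0
 * hold the number of logical processors minus one and bits 15:12 hold the
 * number of APIC ID bits reserved to identify them (a value of zero there
 * means the width must instead be derived from the count itself):
 *
 *	cp.cp_eax = 0x80000008;
 *	(void) __cpuid_insn(&cp);
 *	uint_t nthreads = (cp.cp_ecx & 0xff) + 1;
 *	uint_t apic_id_bits = (cp.cp_ecx >> 12) & 0xf;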
864 *
865 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
866 * bit between Bulldozer and later families, but it is quite useful in
867 * determining the topology information. Because this information has changed
868 * across family generations, it's worth calling out what these mean
869 * explicitly. The registers have the following meanings:
870 *
871 * %eax The APIC ID. The entire register is defined to have a 32-bit
872 * APIC ID, even though on systems without x2apic support, it will
873 * be limited to 8 bits.
874 *
875 * %ebx On Bulldozer-era systems this contains information about the
876 * number of cores that are in a compute unit (cores that share
877 * resources). It also contains a per-package compute unit ID that
878 * identifies which compute unit the logical CPU is a part of.
879 *
880 * On Zen-era systems this instead contains the number of threads
881 * per core and the ID of the core that the logical CPU is a part
882 * of. Note, this ID is unique only to the package, it is not
883 * globally unique across the entire system.
884 *
885 * %ecx This contains the number of nodes that exist in the package. It
886 * also contains an ID that identifies which node the logical CPU
887 * is a part of.
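 *
 * As an illustrative sketch of the Zen-era layout just described (bit
 * positions per AMD's documentation; Bulldozer-era parts interpret %ebx
 * differently, as noted above, and the variable names are illustrative):
 *
 *	cp.cp_eax = 0x8000001e;
 *	(void) __cpuid_insn(&cp);
 *	uint_t apicid = cp.cp_eax;
 *	uint_t coreid = cp.cp_ebx & 0xff;
 *	uint_t nthr_per_core = ((cp.cp_ebx >> 8) & 0xff) + 1;
 *	uint_t nodeid = cp.cp_ecx & 0xff;
 *	uint_t nodes_per_pkg = ((cp.cp_ecx >> 8) & 0x7) + 1;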
888 *
889 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
890 * cache layout to determine which logical CPUs are sharing which caches.
891 *
892 * illumos Topology
893 * ----------------
894 *
895 * Based on the above we synthesize the information into several different
896 * variables that we store in the 'struct cpuid_info'. We'll go into the details
897 * of what each member is supposed to represent and their uniqueness. In
898 * general, there are two levels of uniqueness that we care about. We care about
899 * an ID that is globally unique. That means that it will be unique across all
900 * entities in the system. For example, the default logical CPU ID is globally
901 * unique. On the other hand, there is some information that we only care about
902 * being unique within the context of a single package / socket. Here are the
903 * variables that we keep track of and their meaning.
904 *
 * Several of the values that represent an identifier, with the exception of
 * cpi_apicid, are allowed to be synthetic.
907 *
908 *
909 * cpi_apicid
910 *
911 * This is the value of the CPU's APIC id. This should be the full 32-bit
912 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
913 * APIC ID. This value is globally unique between all logical CPUs across
914 * all packages. This is usually required by the APIC.
915 *
916 * cpi_chipid
917 *
918 * This value indicates the ID of the package that the logical CPU is a
919 * part of. This value is allowed to be synthetic. It is usually derived by
920 * taking the CPU's APIC ID and determining how many bits are used to
921 * represent CPU cores in the package. All logical CPUs that are part of
922 * the same package must have the same value.
923 *
924 * cpi_coreid
925 *
926 * This represents the ID of a CPU core. Two logical CPUs should only have
927 * the same cpi_coreid value if they are part of the same core. These
928 * values may be synthetic. On systems that support SMT, this value is
929 * usually derived from the APIC ID, otherwise it is often synthetic and
930 * just set to the value of the cpu_id in the cpu_t.
931 *
932 * cpi_pkgcoreid
933 *
934 * This is similar to the cpi_coreid in that logical CPUs that are part of
935 * the same core should have the same ID. The main difference is that these
936 * values are only required to be unique to a given socket.
937 *
938 * cpi_clogid
939 *
940 * This represents the logical ID of a logical CPU. This value should be
941 * unique within a given socket for each logical CPU. This is allowed to be
942 * synthetic, though it is usually based off of the CPU's apic ID. The
 * broader system expects that logical CPUs that are part of the same core
 * have contiguous numbers. For example, if there were two threads per core,
 * then the two siblings' logical IDs divided by two should be equal, with the
 * first ID even and the second odd. For example, IDs 4 and 5 indicate two
 * logical CPUs that are part of the same core, but IDs 5 and 6 represent two
 * logical CPUs that are part of different cores.
949 *
950 * While it is common for the cpi_coreid and the cpi_clogid to be derived
951 * from the same source, strictly speaking, they don't have to be and the
952 * two values should be considered logically independent. One should not
953 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
954 * some kind of relationship. While this is tempting, we've seen cases on
955 * AMD family 0xf where the system's cpu id is not related to its APIC ID.
956 *
957 * cpi_ncpu_per_chip
958 *
959 * This value indicates the total number of logical CPUs that exist in the
960 * physical package. Critically, this is not the number of logical CPUs
961 * that exist for just the single core.
962 *
963 * This value should be the same for all logical CPUs in the same package.
964 *
965 * cpi_ncore_per_chip
966 *
967 * This value indicates the total number of physical CPU cores that exist
968 * in the package. The system compares this value with cpi_ncpu_per_chip to
969 * determine if simultaneous multi-threading (SMT) is enabled. When
970 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 * the X86FSET_HTT feature is not set. If this value is greater than one,
 * then we consider the processor to have the feature X86FSET_CMP, to
 * indicate that there is support for more than one core. A sketch of this
 * determination follows the end of this list.
974 *
975 * This value should be the same for all logical CPUs in the same package.
976 *
977 * cpi_procnodes_per_pkg
978 *
979 * This value indicates the number of 'nodes' that exist in the package.
980 * When processors are actually a multi-chip module, this represents the
981 * number of such modules that exist in the package. Currently, on Intel
982 * based systems this member is always set to 1.
983 *
984 * This value should be the same for all logical CPUs in the same package.
985 *
986 * cpi_procnodeid
987 *
988 * This value indicates the ID of the node that the logical CPU is a part
989 * of. All logical CPUs that are in the same node must have the same value
990 * here. This value must be unique across all of the packages in the
991 * system. On Intel based systems, this is currently set to the value in
992 * cpi_chipid because there is only one node.
993 *
994 * cpi_cores_per_compunit
995 *
996 * This value indicates the number of cores that are part of a compute
997 * unit. See the AMD topology section for this. This member only has real
998 * meaning currently for AMD Bulldozer family processors. For all other
999 * processors, this should currently be set to 1.
1000 *
1001 * cpi_compunitid
1002 *
1003 * This indicates the compute unit that the logical CPU belongs to. For
1004 * processors without AMD Bulldozer-style compute units this should be set
1005 * to the value of cpi_coreid.
1006 *
1007 * cpi_ncpu_shr_last_cache
1008 *
1009 * This indicates the number of logical CPUs that are sharing the same last
1010 * level cache. This value should be the same for all CPUs that are sharing
1011 * that cache. The last cache refers to the cache that is closest to memory
1012 * and furthest away from the CPU.
1013 *
1014 * cpi_last_lvl_cacheid
1015 *
1016 * This indicates the ID of the last cache that the logical CPU uses. This
1017 * cache is often shared between multiple logical CPUs and is the cache
1018 * that is closest to memory and furthest away from the CPU. This value
1019 * should be the same for a group of logical CPUs only if they actually
1020 * share the same last level cache. IDs should not overlap between
1021 * packages.
1022 *
1023 * cpi_ncore_bits
1024 *
1025 * This indicates the number of bits that are required to represent all of
1026 * the cores in the system. As cores are derived based on their APIC IDs,
1027 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1028 * this value to be larger than the actual number of IDs that are present
1029 * in the system. This is used to size tables by the CMI framework. It is
1030 * only filled in for Intel and AMD CPUs.
1031 *
1032 * cpi_nthread_bits
1033 *
1034 * This indicates the number of bits required to represent all of the IDs
1035 * that cover the logical CPUs that exist on a given core. It's OK for this
1036 * value to be larger than the actual number of IDs that are present in the
1037 * system. This is used to size tables by the CMI framework. It is
1038 * only filled in for Intel and AMD CPUs.
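 *
 * As referenced in the cpi_ncore_per_chip description above, here is a
 * minimal sketch of how these counts translate into feature flags; the real
 * determination below is more involved, so treat this as illustrative only:
 *
 *	if (cpi->cpi_ncpu_per_chip > cpi->cpi_ncore_per_chip)
 *		add_x86_feature(featureset, X86FSET_HTT);
 *	if (cpi->cpi_ncore_per_chip > 1)
 *		add_x86_feature(featureset, X86FSET_CMP);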
1039 *
1040 * -----------
1041 * Hypervisors
1042 * -----------
1043 *
1044 * If trying to manage the differences between vendors wasn't bad enough, it can
1045 * get worse thanks to our friend hardware virtualization. Hypervisors are given
1046 * the ability to interpose on all cpuid instructions and change them to suit
1047 * their purposes. In general, this is necessary as the hypervisor wants to be
1048 * able to present a more uniform set of features or not necessarily give the
1049 * guest operating system kernel knowledge of all features so it can be
1050 * more easily migrated between systems.
1051 *
1052 * When it comes to trying to determine topology information, this can be a
1053 * double edged sword. When a hypervisor doesn't actually implement a cpuid
1054 * leaf, it'll often return all zeros. Because of that, you'll often see various
1055 * checks scattered about fields being non-zero before we assume we can use
1056 * them.
1057 *
1058 * When it comes to topology information, the hypervisor is often incentivized
 * to lie to you about topology. This is because it doesn't actually guarantee
 * that the advertised topology corresponds to the physical hardware at all.
 * The topology path we take in the system depends on how the CPU advertises
 * itself. If it advertises itself as an Intel or AMD CPU, then we basically
 * follow our normal path. However, when the hypervisor doesn't use an actual
 * vendor, that usually turns into multiple enumerated one-core CPUs that
 * appear to be on different sockets. The actual behavior
1065 * depends greatly on what the hypervisor actually exposes to us.
1066 *
1067 * --------------------
1068 * Exposing Information
1069 * --------------------
1070 *
1071 * We expose CPUID information in three different forms in the system.
1072 *
1073 * The first is through the x86_featureset variable. This is used in conjunction
1074 * with the is_x86_feature() function. This is queried by x86-specific functions
1075 * to determine which features are or aren't present in the system and to make
1076 * decisions based upon them. For example, users of this include everything from
1077 * parts of the system dedicated to reliability, availability, and
1078 * serviceability (RAS), to making decisions about how to handle security
1079 * mitigations, to various x86-specific drivers. General purpose or
1080 * architecture independent drivers should never be calling this function.
1081 *
1082 * The second means is through the auxiliary vector. The auxiliary vector is a
1083 * series of tagged data that the kernel passes down to a user program when it
1084 * begins executing. This information is used to indicate to programs what
1085 * instruction set extensions are present. For example, information about the
1086 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1087 * since user programs cannot make use of it. However, things like the AVX
1088 * instruction sets are. Programs use this information to make run-time
1089 * decisions about what features they should use. As an example, the run-time
1090 * link-editor (rtld) can relocate different functions depending on the hardware
1091 * support available.
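 *
 * For example, a userland consumer can query these bits with getisax(3C); a
 * minimal sketch, assuming the AV_386_AVX bit from <sys/auxv_386.h> is the
 * capability of interest (the function called when the bit is set is
 * hypothetical):
 *
 *	#include <sys/auxv.h>
 *
 *	uint32_t hwcap[1] = { 0 };
 *
 *	(void) getisax(hwcap, 1);
 *	if (hwcap[0] & AV_386_AVX) {
 *		use_avx_implementation();
 *	}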
1092 *
1093 * The final form is through a series of accessor functions that all have the
1094 * form cpuid_get*. This is used by a number of different subsystems in the
1095 * kernel to determine more detailed information about what we're running on,
1096 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1098 * microcode, and performance monitoring. These functions all ASSERT that the
1099 * CPU they're being called on has reached a certain cpuid pass. If the passes
1100 * are rearranged, then this needs to be adjusted.
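 *
 * As a rough sketch of the shape of these accessors (the pass check shown
 * here is illustrative rather than a verbatim copy), they generally look
 * like:
 *
 *	uint_t
 *	cpuid_get_ncore_per_chip(cpu_t *cpu)
 *	{
 *		ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
 *		return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
 *	}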
1101 *
1102 * -----------------------------------------------
1103 * Speculative Execution CPU Side Channel Security
1104 * -----------------------------------------------
1105 *
1106 * With the advent of the Spectre and Meltdown attacks which exploit speculative
1107 * execution in the CPU to create side channels there have been a number of
1108 * different attacks and corresponding issues that the operating system needs to
1109  * mitigate against. The following is a common, but not exhaustive, list of
1110  * issues that we know about and for which we have done some work, or still
1111  * need to do more work, in the system to mitigate:
1112 *
1113 * - Spectre v1
1114 * - swapgs (Spectre v1 variant)
1115 * - Spectre v2
1116 * - Branch History Injection (BHI).
1117 * - Meltdown (Spectre v3)
1118 * - Rogue Register Read (Spectre v3a)
1119 * - Speculative Store Bypass (Spectre v4)
1120 * - ret2spec, SpectreRSB
1121 * - L1 Terminal Fault (L1TF)
1122 * - Microarchitectural Data Sampling (MDS)
1123 * - Register File Data Sampling (RFDS)
1124 *
1125 * Each of these requires different sets of mitigations and has different attack
1126 * surfaces. For the most part, this discussion is about protecting the kernel
1127 * from non-kernel executing environments such as user processes and hardware
1128 * virtual machines. Unfortunately, there are a number of user vs. user
1129 * scenarios that exist with these. The rest of this section will describe the
1130 * overall approach that the system has taken to address these as well as their
1131 * shortcomings. Unfortunately, not all of the above have been handled today.
1132 *
1133 * SPECTRE v2, ret2spec, SpectreRSB
1134 *
1135 * The second variant of the spectre attack focuses on performing branch target
1136 * injection. This generally impacts indirect call instructions in the system.
1137 * There are four different ways to mitigate this issue that are commonly
1138 * described today:
1139 *
1140 * 1. Using Indirect Branch Restricted Speculation (IBRS).
1141 * 2. Using Retpolines and RSB Stuffing
1142 * 3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1143 * 4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1144 *
1145 * IBRS uses a feature added to microcode to restrict speculation, among other
1146 * things. This form of mitigation has not been used as it has been generally
1147 * seen as too expensive and requires reactivation upon various transitions in
1148 * the system.
1149 *
1150 * As a less impactful alternative to IBRS, retpolines were developed by
1151 * Google. These basically require one to replace indirect calls with a specific
1152 * trampoline that will cause speculation to fail and break the attack.
1153 * Retpolines require compiler support. We always build with retpolines in the
1154 * external thunk mode. This means that a traditional indirect call is replaced
1155 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1156 * of this is that all indirect function calls are performed through a register.
1157 *
1158 * We have to use a common external location of the thunk and not inline it into
1159 * the callsite so that way we can have a single place to patch these functions.
1160 * As it turns out, we currently have two different forms of retpolines that
1161 * exist in the system:
1162 *
1163 * 1. A full retpoline
1164 * 2. A no-op version
1165 *
1166 * The first one is used in the general case. Historically, there was an
1167  * AMD-specific optimized retpoline variant that was based around using a
1168 * serializing lfence instruction; however, in March 2022 it was announced that
1169 * this was actually still vulnerable to Spectre v2 and therefore we no longer
1170 * use it and it is no longer available in the system.
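 *
 * For reference, a full retpoline thunk conceptually looks like the
 * following (an illustrative sketch of the well-known sequence, not a
 * verbatim copy of our implementation):
 *
 *	__x86_indirect_thunk_rax:
 *		call	2f
 *	1:
 *		pause
 *		lfence
 *		jmp	1b
 *	2:
 *		movq	%rax, (%rsp)
 *		ret
 *
 * The speculated return path spins harmlessly at 1:, while the architectural
 * path overwrites the return address with the real target in %rax and
 * returns to it.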
1171 *
1172 * The third form described above is the most curious. It turns out that the way
1173 * that retpolines are implemented is that they rely on how speculation is
1174 * performed on a 'ret' instruction. Intel has continued to optimize this
1175 * process (which is partly why we need to have return stack buffer stuffing,
1176 * but more on that in a bit) and in processors starting with Cascade Lake
1177 * on the server side, it's dangerous to rely on retpolines. Instead, a new
1178 * mechanism has been introduced called Enhanced IBRS (eIBRS).
1179 *
1180 * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1181 * physical core. However, if this is the case, we don't want to use retpolines
1182 * any more. Therefore if eIBRS is present, we end up turning each retpoline
1183 * function (called a thunk) into a jmp instruction. This means that we're still
1184 * paying the cost of an extra jump to the external thunk, but it gives us
1185 * flexibility and the ability to have a single kernel image that works across a
1186 * wide variety of systems and hardware features.
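 *
 * In other words, with eIBRS (or AIBRS) active the same thunk is
 * conceptually reduced to nothing more than an indirect jump through the
 * register:
 *
 *	__x86_indirect_thunk_rax:
 *		jmp	*%rax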
1187 *
1188 * Unfortunately, this alone is insufficient. First, Skylake systems have
1189 * additional speculation for the Return Stack Buffer (RSB) which is used to
1190 * return from call instructions which retpolines take advantage of. However,
1191 * this problem is not just limited to Skylake and is actually more pernicious.
1192 * The SpectreRSB paper introduces several more problems that can arise with
1193 * dealing with this. The RSB can be poisoned just like the indirect branch
1194 * predictor. This means that one needs to clear the RSB when transitioning
1195 * between two different privilege domains. Some examples include:
1196 *
1197 * - Switching between two different user processes
1198 * - Going between user land and the kernel
1199 * - Returning to the kernel from a hardware virtual machine
1200 *
1201 * Mitigating this involves combining a couple of different things. The first is
1202 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1203 * Bridge. When an RSB entry refers to a user address and we're executing in the
1204 * kernel, speculation through it will be stopped when SMEP is enabled. This
1205 * protects against a number of the different cases that we would normally be
1206 * worried about such as when we enter the kernel from user land.
1207 *
1208 * To prevent against additional manipulation of the RSB from other contexts
1209 * such as a non-root VMX context attacking the kernel we first look to
1210 * enhanced IBRS. When eIBRS is present and enabled, then there should be
1211 * nothing else that we need to do to protect the kernel at this time.
1212 *
1213 * Unfortunately, not all eIBRS implementations are sufficient to guard
1214 * against RSB manipulations, so we still need to manually overwrite the
1215 * contents of the return stack buffer unless the hardware specifies we are
1216 * covered. We do this through the x86_rsb_stuff() function. Currently this
1217 * is employed on context switch and vmx_exit. The x86_rsb_stuff() function is
1218 * disabled only when mitigations in general are, or if we have hardware
1219 * indicating no need for post-barrier RSB protections, either in one place
1220 * (old hardware), or on both (newer hardware).
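 *
 * Conceptually, stuffing the RSB boils down to executing a series of calls
 * whose return addresses are never architecturally used, so that every RSB
 * entry points at a harmless speculation trap. A simplified sketch (the real
 * sequence is unrolled and more careful than this) is:
 *
 *	movl	$32, %ecx
 *	1:
 *	call	2f
 *	int3
 *	2:
 *	decl	%ecx
 *	jnz	1b
 *	addq	$(32 * 8), %rsp
 *
 * Each call pushes a return address that is only ever reached speculatively
 * (the int3), and the final addq discards the 32 stale return addresses from
 * the stack.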
1221 *
1222 * If SMEP is not present, then we would have to stuff the RSB every time we
1223 * transitioned from user mode to the kernel, which isn't very practical right
1224 * now.
1225 *
1226 * To fully protect user to user and vmx to vmx attacks from these classes of
1227 * issues, we would also need to allow them to opt into performing an Indirect
1228 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1229 *
1230 * The fourth form of mitigation here is specific to AMD and is called Automated
1231 * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1232 * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1233 * (extended feature enable register) MSR. This bit basically says that IBRS
1234 * acts as though it is always active when executing at CPL0 and when executing
1235 * in the 'host' context when SEV-SNP is enabled.
1236 *
1237 * When this is active, AMD states that the RSB is cleared on VMEXIT and
1238 * therefore it is unnecessary. While this handles RSB stuffing attacks from SVM
1239 * to the kernel, we must still consider the remaining cases that exist, just
1240 * like above. While traditionally AMD employed a 32 entry RSB allowing the
1241 * traditional technique to work, this is not true on all CPUs. While a write to
1242 * IBRS would clear the RSB if the processor supports more than 32 entries (but
1243  * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1244 * guard page is present between user and kernel address spaces and SMEP is
1245 * enabled, then there is no need to clear the RSB at all.
1246 *
1247 * By default, the system will enable RSB stuffing and the required variant of
1248 * retpolines and store that information in the x86_spectrev2_mitigation value.
1249 * This will be evaluated after a microcode update as well, though it is
1250 * expected that microcode updates will not take away features. This may mean
1251 * that a late loaded microcode may not end up in the optimal configuration
1252 * (though this should be rare).
1253 *
1254 * Currently we do not build kmdb with retpolines or perform any additional side
1255 * channel security mitigations for it. One complication with kmdb is that it
1256 * requires its own retpoline thunks and it would need to adjust itself based on
1257 * what the kernel does. The threat model of kmdb is more limited and therefore
1258 * it may make more sense to investigate using prediction barriers as the whole
1259 * system is only executing a single instruction at a time while in kmdb.
1260 *
1261 * Branch History Injection (BHI)
1262 *
1263 * BHI is a specific form of SPECTREv2 where an attacker may manipulate branch
1264 * history before transitioning from user to supervisor mode (or from VMX
1265 * non-root/guest to root mode). The attacker can then exploit certain
1266 * compiler-generated code-sequences ("gadgets") to disclose information from
1267 * other contexts or domains. Recent (late-2023/early-2024) research in
1268 * object code analysis discovered many more potential gadgets than what was
1269 * initially reported (which previously was confined to Linux use of
1270 * unprivileged eBPF).
1271 *
1272  * The BHI threat doesn't exist in processors that predate eIBRS, or in AMD
1273 * ones. Some eIBRS processors have the ability to disable branch history in
1274 * certain (but not all) cases using an MSR write. eIBRS processors that don't
1275 * have the ability to disable must use a software sequence to scrub the
1276 * branch history buffer.
1277 *
1278  * BHI_DIS_S (the aforementioned MSR) protects ring 0 from ring 3 (whether VMX
1279  * guest or VMX root). It does not protect different user processes from each
1280  * other, or ring 3 VMX guest from ring 3 VMX root or vice versa.
1281  *
1282  * The BHI clearing sequence prevents user code from exploiting kernel gadgets,
1283  * and user A from exploiting user B's gadgets.
1284 *
1285 * SMEP and eIBRS are a continuing defense-in-depth measure protecting the
1286 * kernel.
1287 *
1288 * SPECTRE v1, v4
1289 *
1290 * The v1 and v4 variants of spectre are not currently mitigated in the
1291 * system and require other classes of changes to occur in the code.
1292 *
1293 * SPECTRE v1 (SWAPGS VARIANT)
1294 *
1295  * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1296 * can generally affect any branch-dependent code. The swapgs issue is one
1297 * variant of this. If we are coming in from userspace, we can have code like
1298 * this:
1299 *
1300 * cmpw $KCS_SEL, REGOFF_CS(%rsp)
1301 * je 1f
1302 * movq $0, REGOFF_SAVFP(%rsp)
1303 * swapgs
1304 * 1:
1305 * movq %gs:CPU_THREAD, %rax
1306 *
1307 * If an attacker can cause a mis-speculation of the branch here, we could skip
1308 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1309 * load. If subsequent code can act as the usual Spectre cache gadget, this
1310 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1311 * any use of the %gs override.
1312 *
1313 * The other case is also an issue: if we're coming into a trap from kernel
1314 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1315 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1316 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1317 * case, and the fix is the same in both cases (an lfence at the branch target
1318 * 1: in this example), we'll just do it unconditionally.
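 *
 * With that mitigation applied, the sketch above becomes:
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax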
1319 *
1320 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1321 * harder for user-space to actually set a useful %gsbase value: although it's
1322 * not clear, it might still be feasible via lwp_setprivate(), though, so we
1323 * mitigate anyway.
1324 *
1325 * MELTDOWN
1326 *
1327 * Meltdown, or spectre v3, allowed a user process to read any data in their
1328 * address space regardless of whether or not the page tables in question
1329 * allowed the user to have the ability to read them. The solution to meltdown
1330 * is kernel page table isolation. In this world, there are two page tables that
1331 * are used for a process, one in user land and one in the kernel. To implement
1332 * this we use per-CPU page tables and switch between the user and kernel
1333 * variants when entering and exiting the kernel. For more information about
1334 * this process and how the trampolines work, please see the big theory
1335 * statements and additional comments in:
1336 *
1337 * - uts/i86pc/ml/kpti_trampolines.s
1338 * - uts/i86pc/vm/hat_i86.c
1339 *
1340 * While Meltdown only impacted Intel systems and there are also Intel systems
1341 * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1342 * kernel page table isolation enabled. While this may at first seem weird, an
1343 * important thing to remember is that you can't speculatively read an address
1344 * if it's never in your page table at all. Having user processes without kernel
1345 * pages present provides us with an important layer of defense in the kernel
1346 * against any other side channel attacks that exist and have yet to be
1347 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1348 * default, no matter the x86 system.
1349 *
1350 * L1 TERMINAL FAULT
1351 *
1352 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1353 * execution uses page table entries. Effectively, it is two different problems.
1354 * The first is that it ignores the not present bit in the page table entries
1355 * when performing speculative execution. This means that something can
1356 * speculatively read the listed physical address if it's present in the L1
1357 * cache under certain conditions (see Intel's documentation for the full set of
1358 * conditions). Secondly, this can be used to bypass hardware virtualization
1359 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1360 * instructions.
1361 *
1362 * For the non-hardware virtualized case, this is relatively easy to deal with.
1363 * We must make sure that all unmapped pages have an address of zero. This means
1364 * that they could read the first 4k of physical memory; however, we never use
1365 * that first page in the operating system and always skip putting it in our
1366 * memory map, even if firmware tells us we can use it in our memory map. While
1367 * other systems try to put extra metadata in the address and reserved bits,
1368 * which led to this being problematic in those cases, we do not.
1369 *
1370 * For hardware virtual machines things are more complicated. Because they can
1371 * construct their own page tables, it isn't hard for them to perform this
1372 * attack against any physical address. The one wrinkle is that this physical
1373 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1374 * to flush the L1 data cache. We wrap this up in the function
1375 * spec_uarch_flush(). This function is also used in the mitigation of
1376 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1377 * hypervisors such as KVM or bhyve are responsible for performing this before
1378 * entering the guest.
1379 *
1380 * Because this attack takes place in the L1 cache, there's another wrinkle
1381 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1382 * designs. This means that when a thread enters a hardware virtualized context
1383 * and flushes the L1 data cache, the other thread on the processor may then go
1384 * ahead and put new data in it that can be potentially attacked. While one
1385 * solution is to disable SMT on the system, another option that is available is
1386 * to use a feature for hardware virtualization called 'SMT exclusion'. This
1387  * goes through and makes sure that if an HVM is being scheduled on one thread,
1388 * then the thing on the other thread is from the same hardware virtual machine.
1389 * If an interrupt comes in or the guest exits to the broader system, then the
1390 * other SMT thread will be kicked out.
1391 *
1392 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1393 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1394 * perform L1TF related mitigations.
1395 *
1396 * MICROARCHITECTURAL DATA SAMPLING
1397 *
1398 * Microarchitectural data sampling (MDS) is a combination of four discrete
1399 * vulnerabilities that are similar issues affecting various parts of the CPU's
1400 * microarchitectural implementation around load, store, and fill buffers.
1401 * Specifically it is made up of the following subcomponents:
1402 *
1403 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1404 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1405 * 3. Microarchitectural Load Port Data Sampling (MLPDS)
1406 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1407 *
1408 * To begin addressing these, Intel has introduced another feature in microcode
1409 * called MD_CLEAR. This changes the verw instruction to operate in a different
1410 * way. This allows us to execute the verw instruction in a particular way to
1411 * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1412 * updated when this microcode is present to flush this state.
1413 *
1414 * Primarily we need to flush this state whenever we transition from the kernel
1415 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1416 * little bit different. Here the structures are statically sized when a logical
1417 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1418  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1419 * mwait, or another ACPI method. To perform these flushes, we call
1420 * x86_md_clear() at all of these transition points.
1421 *
1422 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1423 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1424 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1425 * a no-op.
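 *
 * A conceptual sketch of that selection (not the literal code) is:
 *
 *	if (is_x86_feature(featureset, X86FSET_RDCL_NO))
 *		spec_uarch_flush = x86_md_clear;
 *	if (is_x86_feature(featureset, X86FSET_MDS_NO))
 *		... x86_md_clear is patched into a no-op ...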
1426 *
1427 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1428 * particular, everything we've discussed above is only valid for a single
1429 * thread executing on a core. In the case where you have hyper-threading
1430 * present, this attack can be performed between threads. The theoretical fix
1431 * for this is to ensure that both threads are always in the same security
1432 * domain. This means that they are executing in the same ring and mutually
1433 * trust each other. Practically speaking, this would mean that a system call
1434 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1435 * Rather than implement this, we recommend that one disables hyper-threading
1436 * through the use of psradm -aS.
1437 *
1438 * TSX ASYNCHRONOUS ABORT
1439 *
1440 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1441 * behaves like MDS, but leverages Intel's transactional instructions as another
1442 * vector. Effectively, when a transaction hits one of these cases (unmapped
1443 * page, various cache snoop activity, etc.) then the same data can be exposed
1444 * as in the case of MDS. This means that you can attack your twin.
1445 *
1446 * Intel has described that there are two different ways that we can mitigate
1447 * this problem on affected processors:
1448 *
1449 * 1) We can use the same techniques used to deal with MDS. Flushing the
1450 * microarchitectural buffers and disabling hyperthreading will mitigate
1451 * this in the same way.
1452 *
1453 * 2) Using microcode to disable TSX.
1454 *
1455 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1456 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1457 * That's OK as we're already doing all such mitigations. On the other hand,
1458 * processors with MDS_NO are all supposed to receive microcode updates that
1459 * enumerate support for disabling TSX. In general, we'd rather use this method
1460 * when available as it doesn't require disabling hyperthreading to be
1461 * effective. Currently we basically are relying on microcode for processors
1462 * that enumerate MDS_NO.
1463 *
1464 * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1465 * Sampling: RFDS. This allows an attacker to sample values that were in any
1466 * of integer, floating point, or vector registers. This was discovered by
1467 * Intel during internal validation work. The existence of the RFDS_NO
1468 * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1469 * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1470 * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1471 * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1472 * MSR that L1D uses.
1473 *
1474 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1475 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1476 * different powers. The first allows us to cause all transactions to
1477 * immediately abort. The second gives us a means of disabling TSX completely,
1478 * which includes removing it from cpuid. If we have support for this in
1479 * microcode during the first cpuid pass, then we'll disable TSX completely such
1480 * that user land never has a chance to observe the bit. However, if we are late
1481 * loading the microcode, then we must use the functionality to cause
1482 * transactions to automatically abort. This is necessary for user land's sake.
1483 * Once a program sees a cpuid bit, it must not be taken away.
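 *
 * As a rough sketch of those two modes (the constant names here are
 * illustrative; the underlying MSR is Intel's IA32_TSX_CTRL, with one bit
 * that forces transactions to abort and one that hides the TSX cpuid bits):
 *
 *	early boot, before userland can have observed the cpuid bit:
 *		wrmsr(MSR_TSX_CTRL,
 *		    TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
 *
 *	late microcode load, after the bit may have been observed:
 *		wrmsr(MSR_TSX_CTRL, TSX_CTRL_RTM_DISABLE);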
1484 *
1485 * We track whether or not we should do this based on what cpuid pass we're in.
1486 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1487 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1488 * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1489 * second time after we do the initial microcode update. As a result we need to
1490 * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1491 * suitable microcode on the current CPU (which happens prior to
1492 * cpuid_pass_ucode()).
1493 *
1494 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1495 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1496 * unfortunate feature in a number of ways, and taking the opportunity to
1497 * finally be able to turn it off is likely to be of benefit in the future.
1498 *
1499 * SUMMARY
1500 *
1501 * The following table attempts to summarize the mitigations for various issues
1502 * and what's done in various places:
1503 *
1504 * - Spectre v1: Not currently mitigated
1505 * - swapgs: lfences after swapgs paths
1506 * - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1507 * - Meltdown: Kernel Page Table Isolation
1508 * - Spectre v3a: Updated CPU microcode
1509 * - Spectre v4: Not currently mitigated
1510 * - SpectreRSB: SMEP and RSB Stuffing
1511 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1512 * - MDS: x86_md_clear, requires microcode, disabling SMT
1513 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1514 * - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1515 * - BHI: software sequence, and use of BHI_DIS_S if microcode has it.
1516 *
1517 * The following table indicates the x86 feature set bits that indicate that a
1518 * given problem has been solved or a notable feature is present:
1519 *
1520 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1521 * - MDS_NO: All forms of MDS
1522 * - TAA_NO: TAA
1523 * - RFDS_NO: RFDS
1524 * - BHI_NO: BHI
1525 */
1526
1527 #include <sys/types.h>
1528 #include <sys/archsystm.h>
1529 #include <sys/x86_archext.h>
1530 #include <sys/kmem.h>
1531 #include <sys/systm.h>
1532 #include <sys/cmn_err.h>
1533 #include <sys/sunddi.h>
1534 #include <sys/sunndi.h>
1535 #include <sys/cpuvar.h>
1536 #include <sys/processor.h>
1537 #include <sys/sysmacros.h>
1538 #include <sys/pg.h>
1539 #include <sys/fp.h>
1540 #include <sys/controlregs.h>
1541 #include <sys/bitmap.h>
1542 #include <sys/auxv_386.h>
1543 #include <sys/memnode.h>
1544 #include <sys/pci_cfgspace.h>
1545 #include <sys/comm_page.h>
1546 #include <sys/mach_mmu.h>
1547 #include <sys/ucode.h>
1548 #include <sys/tsc.h>
1549 #include <sys/kobj.h>
1550 #include <sys/asm_misc.h>
1551 #include <sys/bitmap.h>
1552
1553 #ifdef __xpv
1554 #include <sys/hypervisor.h>
1555 #else
1556 #include <sys/ontrap.h>
1557 #endif
1558
1559 uint_t x86_vendor = X86_VENDOR_IntelClone;
1560 uint_t x86_type = X86_TYPE_OTHER;
1561 uint_t x86_clflush_size = 0;
1562
1563 #if defined(__xpv)
1564 int x86_use_pcid = 0;
1565 int x86_use_invpcid = 0;
1566 #else
1567 int x86_use_pcid = -1;
1568 int x86_use_invpcid = -1;
1569 #endif
1570
1571 typedef enum {
1572 X86_SPECTREV2_RETPOLINE,
1573 X86_SPECTREV2_ENHANCED_IBRS,
1574 X86_SPECTREV2_AUTO_IBRS,
1575 X86_SPECTREV2_DISABLED
1576 } x86_spectrev2_mitigation_t;
1577
1578 uint_t x86_disable_spectrev2 = 0;
1579 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1580 X86_SPECTREV2_RETPOLINE;
1581
1582 /*
1583 * The mitigation status for TAA:
1584 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1585 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1586 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1587 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1588 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1589 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1590 */
1591 typedef enum {
1592 X86_TAA_NOTHING,
1593 X86_TAA_DISABLED,
1594 X86_TAA_MD_CLEAR,
1595 X86_TAA_TSX_FORCE_ABORT,
1596 X86_TAA_TSX_DISABLE,
1597 X86_TAA_HW_MITIGATED
1598 } x86_taa_mitigation_t;
1599
1600 uint_t x86_disable_taa = 0;
1601 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1602
1603 uint_t pentiumpro_bug4046376;
1604
1605 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1606
1607 static char *x86_feature_names[NUM_X86_FEATURES] = {
1608 "lgpg",
1609 "tsc",
1610 "msr",
1611 "mtrr",
1612 "pge",
1613 "de",
1614 "cmov",
1615 "mmx",
1616 "mca",
1617 "pae",
1618 "cv8",
1619 "pat",
1620 "sep",
1621 "sse",
1622 "sse2",
1623 "htt",
1624 "asysc",
1625 "nx",
1626 "sse3",
1627 "cx16",
1628 "cmp",
1629 "tscp",
1630 "mwait",
1631 "sse4a",
1632 "cpuid",
1633 "ssse3",
1634 "sse4_1",
1635 "sse4_2",
1636 "1gpg",
1637 "clfsh",
1638 "64",
1639 "aes",
1640 "pclmulqdq",
1641 "xsave",
1642 "avx",
1643 "vmx",
1644 "svm",
1645 "topoext",
1646 "f16c",
1647 "rdrand",
1648 "x2apic",
1649 "avx2",
1650 "bmi1",
1651 "bmi2",
1652 "fma",
1653 "smep",
1654 "smap",
1655 "adx",
1656 "rdseed",
1657 "mpx",
1658 "avx512f",
1659 "avx512dq",
1660 "avx512pf",
1661 "avx512er",
1662 "avx512cd",
1663 "avx512bw",
1664 "avx512vl",
1665 "avx512fma",
1666 "avx512vbmi",
1667 "avx512_vpopcntdq",
1668 "avx512_4vnniw",
1669 "avx512_4fmaps",
1670 "xsaveopt",
1671 "xsavec",
1672 "xsaves",
1673 "sha",
1674 "umip",
1675 "pku",
1676 "ospke",
1677 "pcid",
1678 "invpcid",
1679 "ibrs",
1680 "ibpb",
1681 "stibp",
1682 "ssbd",
1683 "ssbd_virt",
1684 "rdcl_no",
1685 "ibrs_all",
1686 "rsba",
1687 "ssb_no",
1688 "stibp_all",
1689 "flush_cmd",
1690 "l1d_vmentry_no",
1691 "fsgsbase",
1692 "clflushopt",
1693 "clwb",
1694 "monitorx",
1695 "clzero",
1696 "xop",
1697 "fma4",
1698 "tbm",
1699 "avx512_vnni",
1700 "amd_pcec",
1701 "md_clear",
1702 "mds_no",
1703 "core_thermal",
1704 "pkg_thermal",
1705 "tsx_ctrl",
1706 "taa_no",
1707 "ppin",
1708 "vaes",
1709 "vpclmulqdq",
1710 "lfence_serializing",
1711 "gfni",
1712 "avx512_vp2intersect",
1713 "avx512_bitalg",
1714 "avx512_vbmi2",
1715 "avx512_bf16",
1716 "auto_ibrs",
1717 "rfds_no",
1718 "rfds_clear",
1719 "pbrsb_no",
1720 "bhi_no",
1721 "bhi_clear"
1722 };
1723
1724 boolean_t
1725 is_x86_feature(void *featureset, uint_t feature)
1726 {
1727 ASSERT(feature < NUM_X86_FEATURES);
1728 return (BT_TEST((ulong_t *)featureset, feature));
1729 }
1730
1731 void
1732 add_x86_feature(void *featureset, uint_t feature)
1733 {
1734 ASSERT(feature < NUM_X86_FEATURES);
1735 BT_SET((ulong_t *)featureset, feature);
1736 }
1737
1738 void
1739 remove_x86_feature(void *featureset, uint_t feature)
1740 {
1741 ASSERT(feature < NUM_X86_FEATURES);
1742 BT_CLEAR((ulong_t *)featureset, feature);
1743 }
1744
1745 boolean_t
1746 compare_x86_featureset(void *setA, void *setB)
1747 {
1748 /*
1749 * We assume that the unused bits of the bitmap are always zero.
1750 */
1751 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1752 return (B_TRUE);
1753 } else {
1754 return (B_FALSE);
1755 }
1756 }
1757
1758 void
1759 print_x86_featureset(void *featureset)
1760 {
1761 uint_t i;
1762
1763 for (i = 0; i < NUM_X86_FEATURES; i++) {
1764 if (is_x86_feature(featureset, i)) {
1765 cmn_err(CE_CONT, "?x86_feature: %s\n",
1766 x86_feature_names[i]);
1767 }
1768 }
1769 }
1770
1771 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1772 static size_t xsave_state_size = 0;
1773 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1774 boolean_t xsave_force_disable = B_FALSE;
1775 extern int disable_smap;
1776
1777 /*
1778 * This is set to platform type we are running on.
1779 */
1780 static int platform_type = -1;
1781
1782 #if !defined(__xpv)
1783 /*
1784 * Variable to patch if hypervisor platform detection needs to be
1785 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1786 */
1787 int enable_platform_detection = 1;
1788 #endif
1789
1790 /*
1791 * monitor/mwait info.
1792 *
1793 * size_actual and buf_actual are the real address and size allocated to get
1794  * proper mwait_buf alignment. buf_actual and size_actual should be passed
1795  * to kmem_free(). Currently kmem_alloc() and mwait happen to both use
1796  * processor cache-line alignment, but this is not guaranteed in the future.
1797 */
1798 struct mwait_info {
1799 size_t mon_min; /* min size to avoid missed wakeups */
1800 size_t mon_max; /* size to avoid false wakeups */
1801 size_t size_actual; /* size actually allocated */
1802 void *buf_actual; /* memory actually allocated */
1803 uint32_t support; /* processor support of monitor/mwait */
1804 };
1805
1806 /*
1807 * xsave/xrestor info.
1808 *
1809 * This structure contains HW feature bits and the size of the xsave save area.
1810 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1811 * (xsave_state) to describe the xsave layout. However, at runtime the
1812 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1813 * xsave_state structure simply represents the legacy layout of the beginning
1814 * of the xsave area.
1815 */
1816 struct xsave_info {
1817 uint32_t xsav_hw_features_low; /* Supported HW features */
1818 uint32_t xsav_hw_features_high; /* Supported HW features */
1819 size_t xsav_max_size; /* max size save area for HW features */
1820 size_t ymm_size; /* AVX: size of ymm save area */
1821 size_t ymm_offset; /* AVX: offset for ymm save area */
1822 size_t bndregs_size; /* MPX: size of bndregs save area */
1823 size_t bndregs_offset; /* MPX: offset for bndregs save area */
1824 size_t bndcsr_size; /* MPX: size of bndcsr save area */
1825 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */
1826 size_t opmask_size; /* AVX512: size of opmask save */
1827 size_t opmask_offset; /* AVX512: offset for opmask save */
1828 size_t zmmlo_size; /* AVX512: size of zmm 256 save */
1829 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */
1830 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */
1831 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */
1832 };
1833
1834
1835 /*
1836 * These constants determine how many of the elements of the
1837 * cpuid we cache in the cpuid_info data structure; the
1838 * remaining elements are accessible via the cpuid instruction.
1839 */
1840
1841 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */
1842 #define NMAX_CPI_EXTD 0x22 /* eax = 0x80000000 .. 0x80000021 */
1843 #define NMAX_CPI_TOPO 0x10 /* Sanity check on leaf 8X26, 1F */
1844
1845 /*
1846 * See the big theory statement for a more detailed explanation of what some of
1847 * these members mean.
1848 */
1849 struct cpuid_info {
1850 uint_t cpi_pass; /* last pass completed */
1851 /*
1852 * standard function information
1853 */
1854 uint_t cpi_maxeax; /* fn 0: %eax */
1855 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */
1856 uint_t cpi_vendor; /* enum of cpi_vendorstr */
1857
1858 uint_t cpi_family; /* fn 1: extended family */
1859 uint_t cpi_model; /* fn 1: extended model */
1860 uint_t cpi_step; /* fn 1: stepping */
1861 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */
1862 /* AMD: package/socket # */
1863 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */
1864 int cpi_clogid; /* fn 1: %ebx: thread # */
1865 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */
1866 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */
1867 uint_t cpi_ncache; /* fn 2: number of elements */
1868 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
1869 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
1870 uint_t cpi_cache_leaf_size; /* Number of cache elements */
1871 /* Intel fn: 4, AMD fn: 8000001d */
1872 struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
1873 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
1874 struct cpuid_regs cpi_sub7[2]; /* Leaf 7, sub-leaves 1-2 */
1875 /*
1876 * extended function information
1877 */
1878 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */
1879 char cpi_brandstr[49]; /* fn 0x8000000[234] */
1880 uint8_t cpi_pabits; /* fn 0x80000006: %eax */
1881 uint8_t cpi_vabits; /* fn 0x80000006: %eax */
1882 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */
1883 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */
1884
1885 id_t cpi_coreid; /* same coreid => strands share core */
1886 int cpi_pkgcoreid; /* core number within single package */
1887 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */
1888 /* Intel: fn 4: %eax[31-26] */
1889
1890 /*
1891 * These values represent the number of bits that are required to store
1892 * information about the number of cores and threads.
1893 */
1894 uint_t cpi_ncore_bits;
1895 uint_t cpi_nthread_bits;
1896 /*
1897 * supported feature information
1898 */
1899 uint32_t cpi_support[6];
1900 #define STD_EDX_FEATURES 0
1901 #define AMD_EDX_FEATURES 1
1902 #define TM_EDX_FEATURES 2
1903 #define STD_ECX_FEATURES 3
1904 #define AMD_ECX_FEATURES 4
1905 #define STD_EBX_FEATURES 5
1906 /*
1907 * Synthesized information, where known.
1908 */
1909 x86_chiprev_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */
1910 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */
1911 uint32_t cpi_socket; /* Chip package/socket type */
1912 x86_uarchrev_t cpi_uarchrev; /* Microarchitecture and revision */
1913
1914 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */
1915 uint32_t cpi_apicid;
1916 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */
1917 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */
1918 /* Intel: 1 */
1919 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */
1920 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */
1921
1922 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */
1923
1924 /*
1925 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1926 * eventually leaf 0x1F (Intel).
1927 */
1928 uint_t cpi_topo_nleaves;
1929 struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1930 };
1931
1932
1933 static struct cpuid_info cpuid_info0;
1934
1935 /*
1936 * These bit fields are defined by the Intel Application Note AP-485
1937 * "Intel Processor Identification and the CPUID Instruction"
1938 */
1939 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1940 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1941 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1942 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1943 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1944 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1945
1946 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx)
1947 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx)
1948 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx)
1949 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx)
1950 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx)
1951 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx)
1952 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx)
1953 #define CPI_FEATURES_7_1_EAX(cpi) ((cpi)->cpi_sub7[0].cp_eax)
1954 #define CPI_FEATURES_7_2_EDX(cpi) ((cpi)->cpi_sub7[1].cp_edx)
1955
1956 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1957 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1958 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1959 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1960
1961 #define CPI_MAXEAX_MAX 0x100 /* sanity control */
1962 #define CPI_XMAXEAX_MAX 0x80000100
1963 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */
1964 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */
1965
1966 /*
1967 * Function 4 (Deterministic Cache Parameters) macros
1968 * Defined by Intel Application Note AP-485
1969 */
1970 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26)
1971 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14)
1972 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9)
1973 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8)
1974 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5)
1975 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0)
1976 #define CPI_CACHE_TYPE_DONE 0
1977 #define CPI_CACHE_TYPE_DATA 1
1978 #define CPI_CACHE_TYPE_INSTR 2
1979 #define CPI_CACHE_TYPE_UNIFIED 3
1980 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8)
1981
1982 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22)
1983 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12)
1984 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0)
1985
1986 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0)
1987
1988 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0)
1989
1990
1991 /*
1992 * A couple of shorthand macros to identify "later" P6-family chips
1993 * like the Pentium M and Core. First, the "older" P6-based stuff
1994 * (loosely defined as "pre-Pentium-4"):
1995 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1996 */
1997 #define IS_LEGACY_P6(cpi) ( \
1998 cpi->cpi_family == 6 && \
1999 (cpi->cpi_model == 1 || \
2000 cpi->cpi_model == 3 || \
2001 cpi->cpi_model == 5 || \
2002 cpi->cpi_model == 6 || \
2003 cpi->cpi_model == 7 || \
2004 cpi->cpi_model == 8 || \
2005 cpi->cpi_model == 0xA || \
2006 cpi->cpi_model == 0xB) \
2007 )
2008
2009 /* A "new F6" is everything with family 6 that's not the above */
2010 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
2011
2012 /* Extended family/model support */
2013 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
2014 cpi->cpi_family >= 0xf)
2015
2016 /*
2017 * Info for monitor/mwait idle loop.
2018 *
2019 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
2020 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
2021 * 2006.
2022 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
2023 * Documentation Updates" #33633, Rev 2.05, December 2006.
2024 */
2025 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */
2026 #define MWAIT_EXTENSIONS (0x00000002) /* extension supported */
2027 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */
2028 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
2029 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2)
2030 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1)
2031 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
2032 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
2033 /*
2034 * Number of sub-cstates for a given c-state.
2035 */
2036 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \
2037 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
2038
2039 /*
2040 * XSAVE leaf 0xD enumeration
2041 */
2042 #define CPUID_LEAFD_2_YMM_OFFSET 576
2043 #define CPUID_LEAFD_2_YMM_SIZE 256
2044
2045 /*
2046 * Common extended leaf names to cut down on typos.
2047 */
2048 #define CPUID_LEAF_EXT_0 0x80000000
2049 #define CPUID_LEAF_EXT_8 0x80000008
2050 #define CPUID_LEAF_EXT_1d 0x8000001d
2051 #define CPUID_LEAF_EXT_1e 0x8000001e
2052 #define CPUID_LEAF_EXT_21 0x80000021
2053 #define CPUID_LEAF_EXT_26 0x80000026
2054
2055 /*
2056 * Functions we consume from cpuid_subr.c; don't publish these in a header
2057 * file to try and keep people using the expected cpuid_* interfaces.
2058 */
2059 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2060 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2061 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2062 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2063 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2064 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2065
2066 /*
2067  * Apply various platform-dependent restrictions where the
2068 * underlying platform restrictions mean the CPU can be marked
2069 * as less capable than its cpuid instruction would imply.
2070 */
2071 #if defined(__xpv)
2072 static void
2073 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2074 {
2075 switch (eax) {
2076 case 1: {
2077 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2078 0 : CPUID_INTC_EDX_MCA;
2079 cp->cp_edx &=
2080 ~(mcamask |
2081 CPUID_INTC_EDX_PSE |
2082 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2083 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2084 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2085 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2086 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2087 break;
2088 }
2089
2090 case 0x80000001:
2091 cp->cp_edx &=
2092 ~(CPUID_AMD_EDX_PSE |
2093 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2094 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2095 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2096 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2097 CPUID_AMD_EDX_TSCP);
2098 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2099 break;
2100 default:
2101 break;
2102 }
2103
2104 switch (vendor) {
2105 case X86_VENDOR_Intel:
2106 switch (eax) {
2107 case 4:
2108 /*
2109 * Zero out the (ncores-per-chip - 1) field
2110 */
2111 cp->cp_eax &= 0x03ffffff;
2112 break;
2113 default:
2114 break;
2115 }
2116 break;
2117 case X86_VENDOR_AMD:
2118 case X86_VENDOR_HYGON:
2119 switch (eax) {
2120
2121 case 0x80000001:
2122 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2123 break;
2124
2125 case CPUID_LEAF_EXT_8:
2126 /*
2127 * Zero out the (ncores-per-chip - 1) field
2128 */
2129 cp->cp_ecx &= 0xffffff00;
2130 break;
2131 default:
2132 break;
2133 }
2134 break;
2135 default:
2136 break;
2137 }
2138 }
2139 #else
2140 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */
2141 #endif
2142
2143 /*
2144 * Some undocumented ways of patching the results of the cpuid
2145 * instruction to permit running Solaris 10 on future cpus that
2146 * we don't currently support. Could be set to non-zero values
2147 * via settings in eeprom.
2148 */
2149
2150 uint32_t cpuid_feature_ecx_include;
2151 uint32_t cpuid_feature_ecx_exclude;
2152 uint32_t cpuid_feature_edx_include;
2153 uint32_t cpuid_feature_edx_exclude;
2154
2155 /*
2156 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2157 */
2158 void
2159 cpuid_alloc_space(cpu_t *cpu)
2160 {
2161 /*
2162 * By convention, cpu0 is the boot cpu, which is set up
2163 * before memory allocation is available. All other cpus get
2164 * their cpuid_info struct allocated here.
2165 */
2166 ASSERT(cpu->cpu_id != 0);
2167 ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2168 cpu->cpu_m.mcpu_cpi =
2169 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2170 }
2171
2172 void
2173 cpuid_free_space(cpu_t *cpu)
2174 {
2175 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2176 int i;
2177
2178 ASSERT(cpi != NULL);
2179 ASSERT(cpi != &cpuid_info0);
2180
2181 /*
2182 * Free up any cache leaf related dynamic storage. The first entry was
2183 * cached from the standard cpuid storage, so we should not free it.
2184 */
2185 for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2186 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2187 if (cpi->cpi_cache_leaf_size > 0)
2188 kmem_free(cpi->cpi_cache_leaves,
2189 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2190
2191 kmem_free(cpi, sizeof (*cpi));
2192 cpu->cpu_m.mcpu_cpi = NULL;
2193 }
2194
2195 #if !defined(__xpv)
2196 /*
2197 * Determine the type of the underlying platform. This is used to customize
2198 * initialization of various subsystems (e.g. TSC). determine_platform() must
2199 * only ever be called once to prevent two processors from seeing different
2200 * values of platform_type. Must be called before cpuid_pass_ident(), the
2201 * earliest consumer to execute; the identification pass will call
2202 * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2203 */
2204 void
2205 determine_platform(void)
2206 {
2207 struct cpuid_regs cp;
2208 uint32_t base;
2209 uint32_t regs[4];
2210 char *hvstr = (char *)regs;
2211
2212 ASSERT(platform_type == -1);
2213
2214 platform_type = HW_NATIVE;
2215
2216 if (!enable_platform_detection)
2217 return;
2218
2219 /*
2220 * If Hypervisor CPUID bit is set, try to determine hypervisor
2221 * vendor signature, and set platform type accordingly.
2222 *
2223 * References:
2224 * http://lkml.org/lkml/2008/10/1/246
2225 * http://kb.vmware.com/kb/1009458
2226 */
2227 cp.cp_eax = 0x1;
2228 (void) __cpuid_insn(&cp);
2229 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2230 cp.cp_eax = 0x40000000;
2231 (void) __cpuid_insn(&cp);
2232 regs[0] = cp.cp_ebx;
2233 regs[1] = cp.cp_ecx;
2234 regs[2] = cp.cp_edx;
2235 regs[3] = 0;
2236 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2237 platform_type = HW_XEN_HVM;
2238 return;
2239 }
2240 if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2241 platform_type = HW_VMWARE;
2242 return;
2243 }
2244 if (strcmp(hvstr, HVSIG_KVM) == 0) {
2245 platform_type = HW_KVM;
2246 return;
2247 }
2248 if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2249 platform_type = HW_BHYVE;
2250 return;
2251 }
2252 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2253 platform_type = HW_MICROSOFT;
2254 return;
2255 }
2256 if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2257 platform_type = HW_QEMU_TCG;
2258 return;
2259 }
2260 if (strcmp(hvstr, HVSIG_VIRTUALBOX) == 0) {
2261 platform_type = HW_VIRTUALBOX;
2262 return;
2263 }
2264 if (strcmp(hvstr, HVSIG_ACRN) == 0) {
2265 platform_type = HW_ACRN;
2266 return;
2267 }
2268 } else {
2269 /*
2270 * Check older VMware hardware versions. VMware hypervisor is
2271 * detected by performing an IN operation to VMware hypervisor
2272 * port and checking that value returned in %ebx is VMware
2273 * hypervisor magic value.
2274 *
2275 * References: http://kb.vmware.com/kb/1009458
2276 */
2277 vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2278 if (regs[1] == VMWARE_HVMAGIC) {
2279 platform_type = HW_VMWARE;
2280 return;
2281 }
2282 }
2283
2284 /*
2285 * Check Xen hypervisor. In a fully virtualized domain,
2286 * Xen's pseudo-cpuid function returns a string representing the
2287 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2288 * supported cpuid function. We need at least a (base + 2) leaf value
2289 * to do what we want to do. Try different base values, since the
2290 * hypervisor might use a different one depending on whether Hyper-V
2291 * emulation is switched on by default or not.
2292 */
2293 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2294 cp.cp_eax = base;
2295 (void) __cpuid_insn(&cp);
2296 regs[0] = cp.cp_ebx;
2297 regs[1] = cp.cp_ecx;
2298 regs[2] = cp.cp_edx;
2299 regs[3] = 0;
2300 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2301 cp.cp_eax >= (base + 2)) {
2302 platform_type &= ~HW_NATIVE;
2303 platform_type |= HW_XEN_HVM;
2304 return;
2305 }
2306 }
2307 }
2308
2309 int
2310 get_hwenv(void)
2311 {
2312 ASSERT(platform_type != -1);
2313 return (platform_type);
2314 }
2315
2316 int
2317 is_controldom(void)
2318 {
2319 return (0);
2320 }
2321
2322 #else
2323
2324 int
2325 get_hwenv(void)
2326 {
2327 return (HW_XEN_PV);
2328 }
2329
2330 int
2331 is_controldom(void)
2332 {
2333 return (DOMAIN_IS_INITDOMAIN(xen_info));
2334 }
2335
2336 #endif /* __xpv */
2337
2338 /*
2339 * Gather the extended topology information. This should be the same for both
2340 * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2341 */
2342 static void
2343 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2344 {
2345 uint_t i;
2346
2347 for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2348 struct cpuid_regs *regs = &cpi->cpi_topo[i];
2349
2350 bzero(regs, sizeof (struct cpuid_regs));
2351 regs->cp_eax = leaf;
2352 regs->cp_ecx = i;
2353
2354 (void) __cpuid_insn(regs);
2355 if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2356 CPUID_AMD_8X26_TYPE_DONE) {
2357 break;
2358 }
2359 }
2360
2361 cpi->cpi_topo_nleaves = i;
2362 }
2363
2364 /*
2365 * Make sure that we have gathered all of the CPUID leaves that we might need to
2366 * determine topology. We assume that the standard leaf 1 has already been done
2367 * and that xmaxeax has already been calculated.
2368 */
2369 static void
2370 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2371 {
2372 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2373
2374 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2375 struct cpuid_regs *cp;
2376
2377 cp = &cpi->cpi_extd[8];
2378 cp->cp_eax = CPUID_LEAF_EXT_8;
2379 (void) __cpuid_insn(cp);
2380 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2381 }
2382
2383 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2384 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2385 struct cpuid_regs *cp;
2386
2387 cp = &cpi->cpi_extd[0x1e];
2388 cp->cp_eax = CPUID_LEAF_EXT_1e;
2389 (void) __cpuid_insn(cp);
2390 }
2391
2392 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2393 cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2394 }
2395 }
2396
2397 /*
2398 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2399 * it to everything else. If not, and we're on an AMD system where 8000001e is
2400  * valid, then we use that. Otherwise, we fall back to the default value for the
2401 * APIC ID in leaf 1.
2402 */
2403 static uint32_t
2404 cpuid_gather_apicid(struct cpuid_info *cpi)
2405 {
2406 /*
2407 * Leaf B changes based on the arguments to it. Because we don't cache
2408 * it, we need to gather it again.
2409 */
2410 if (cpi->cpi_maxeax >= 0xB) {
2411 struct cpuid_regs regs;
2412 struct cpuid_regs *cp;
2413
2414 cp = &regs;
2415 cp->cp_eax = 0xB;
2416 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2417 (void) __cpuid_insn(cp);
2418
2419 if (cp->cp_ebx != 0) {
2420 return (cp->cp_edx);
2421 }
2422 }
2423
2424 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2425 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2426 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2427 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2428 return (cpi->cpi_extd[0x1e].cp_eax);
2429 }
2430
2431 return (CPI_APIC_ID(cpi));
2432 }
2433
2434 /*
2435 * For AMD processors, attempt to calculate the number of chips and cores that
2436 * exist. The way that we do this varies based on the generation, because the
2437 * generations themselves have changed dramatically.
2438 *
2439 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2440 * However, with the advent of family 17h (Zen) it actually tells us the number
2441 * of threads, so we need to look at leaf 0x8000001e if available to determine
2442 * its value. Otherwise, for all prior families, the number of enabled cores is
2443 * the same as threads.
2444 *
2445 * If we do not have leaf 0x80000008, then we assume that this processor does
2446 * not have anything. AMD's older CPUID specification says there's no reason to
2447 * fall back to leaf 1.
2448 *
2449 * In some virtualization cases we will not have leaf 8000001e or it will be
2450 * zero. When that happens we assume the number of threads is one.
2451 */
2452 static void
2453 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2454 {
2455 uint_t nthreads, nthread_per_core;
2456
2457 nthreads = nthread_per_core = 1;
2458
2459 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2460 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2461 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2462 nthreads = CPI_CPU_COUNT(cpi);
2463 }
2464
2465 /*
2466 * For us to have threads, and know about it, we have to be at least at
2467 * family 17h and have the cpuid bit that says we have extended
2468 * topology.
2469 */
2470 if (cpi->cpi_family >= 0x17 &&
2471 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2472 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2473 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2474 }
2475
2476 *ncpus = nthreads;
2477 *ncores = nthreads / nthread_per_core;
2478 }
2479
2480 /*
2481 * Seed the initial values for the cores and threads for an Intel based
2482 * processor. These values will be overwritten if we detect that the processor
2483 * supports CPUID leaf 0xb.
2484 */
2485 static void
2486 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2487 {
2488 /*
2489 * Only seed the number of physical cores from the first level leaf 4
2490  * information. The number of threads there indicates how many share the
2491 * L1 cache, which may or may not have anything to do with the number of
2492 * logical CPUs per core.
2493 */
2494 if (cpi->cpi_maxeax >= 4) {
2495 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2496 } else {
2497 *ncores = 1;
2498 }
2499
2500 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2501 *ncpus = CPI_CPU_COUNT(cpi);
2502 } else {
2503 *ncpus = *ncores;
2504 }
2505 }
2506
2507 static boolean_t
2508 cpuid_leafB_getids(cpu_t *cpu)
2509 {
2510 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2511 struct cpuid_regs regs;
2512 struct cpuid_regs *cp;
2513
2514 if (cpi->cpi_maxeax < 0xB)
2515 return (B_FALSE);
2516
2517 	cp = &regs;
2518 cp->cp_eax = 0xB;
2519 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2520
2521 (void) __cpuid_insn(cp);
2522
2523 /*
2524 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2525 * indicates that the extended topology enumeration leaf is
2526 * available.
2527 */
2528 if (cp->cp_ebx != 0) {
2529 uint32_t x2apic_id = 0;
2530 uint_t coreid_shift = 0;
2531 uint_t ncpu_per_core = 1;
2532 uint_t chipid_shift = 0;
2533 uint_t ncpu_per_chip = 1;
2534 uint_t i;
2535 uint_t level;
2536
2537 for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2538 cp->cp_eax = 0xB;
2539 cp->cp_ecx = i;
2540
2541 (void) __cpuid_insn(cp);
2542 level = CPI_CPU_LEVEL_TYPE(cp);
2543
2544 if (level == 1) {
2545 x2apic_id = cp->cp_edx;
2546 coreid_shift = BITX(cp->cp_eax, 4, 0);
2547 ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2548 } else if (level == 2) {
2549 x2apic_id = cp->cp_edx;
2550 chipid_shift = BITX(cp->cp_eax, 4, 0);
2551 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2552 }
2553 }
2554
2555 /*
2556 * cpi_apicid is taken care of in cpuid_gather_apicid.
2557 */
2558 cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2559 cpi->cpi_ncore_per_chip = ncpu_per_chip /
2560 ncpu_per_core;
2561 cpi->cpi_chipid = x2apic_id >> chipid_shift;
2562 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2563 cpi->cpi_coreid = x2apic_id >> coreid_shift;
2564 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2565 cpi->cpi_procnodeid = cpi->cpi_chipid;
2566 cpi->cpi_compunitid = cpi->cpi_coreid;
2567
2568 if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2569 cpi->cpi_nthread_bits = coreid_shift;
2570 cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2571 }
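/*
 * Illustrative example with made-up values: given coreid_shift = 1,
 * chipid_shift = 4, and an x2APIC ID of 0x13, the assignments above
 * produce cpi_chipid = 0x13 >> 4 = 1, cpi_clogid = 0x13 & 0xf = 0x3,
 * cpi_coreid = 0x13 >> 1 = 0x9, cpi_pkgcoreid = 0x3 >> 1 = 0x1,
 * cpi_nthread_bits = 1, and cpi_ncore_bits = 3.
 */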
2572
2573 return (B_TRUE);
2574 } else {
2575 return (B_FALSE);
2576 }
2577 }
2578
2579 static void
2580 cpuid_intel_getids(cpu_t *cpu, void *feature)
2581 {
2582 uint_t i;
2583 uint_t chipid_shift = 0;
2584 uint_t coreid_shift = 0;
2585 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2586
2587 /*
2588 * There are no compute units or processor nodes currently on Intel.
2589 * Always set these to one.
2590 */
2591 cpi->cpi_procnodes_per_pkg = 1;
2592 cpi->cpi_cores_per_compunit = 1;
2593
2594 /*
2595 * If cpuid Leaf B is present, use that to try and get this information.
2596 * It will be the most accurate for Intel CPUs.
2597 */
2598 if (cpuid_leafB_getids(cpu))
2599 return;
2600
2601 /*
2602 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2603 * and ncore_per_chip. These represent the largest power of two values
2604 * that we need to cover all of the IDs in the system. Therefore, we use
2605 * those values to seed the number of bits needed to cover information
2606 * in the case when leaf B is not available. These values will probably
2607 * be larger than required, but that's OK.
2608 */
2609 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2610 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2611
2612 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2613 chipid_shift++;
2614
2615 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2616 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2617
2618 if (is_x86_feature(feature, X86FSET_CMP)) {
2619 /*
2620 * Multi-core (and possibly multi-threaded)
2621 * processors.
2622 */
2623 uint_t ncpu_per_core = 0;
2624
2625 if (cpi->cpi_ncore_per_chip == 1)
2626 ncpu_per_core = cpi->cpi_ncpu_per_chip;
2627 else if (cpi->cpi_ncore_per_chip > 1)
2628 ncpu_per_core = cpi->cpi_ncpu_per_chip /
2629 cpi->cpi_ncore_per_chip;
2630 /*
2631 * 8bit APIC IDs on dual core Pentiums
2632 * look like this:
2633 *
2634 * +-----------------------+------+------+
2635 * | Physical Package ID | MC | HT |
2636 * +-----------------------+------+------+
2637 * <------- chipid -------->
2638 * <------- coreid --------------->
2639 * <--- clogid -->
2640 * <------>
2641 * pkgcoreid
2642 *
2643 * Where the number of bits necessary to
2644 * represent MC and HT fields together equals
2645 * the minimum number of bits necessary to
2646 * store the value of cpi->cpi_ncpu_per_chip.
2647 * Of those bits, the MC part uses the number
2648 * of bits necessary to store the value of
2649 * cpi->cpi_ncore_per_chip.
2650 */
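/*
 * As a hypothetical worked example of this layout: with
 * cpi_ncpu_per_chip = 4 and cpi_ncore_per_chip = 2, chipid_shift was
 * computed as 2 above, ncpu_per_core is 2, and coreid_shift below
 * becomes 1. An APIC ID of 0x7 then yields cpi_chipid = 1,
 * cpi_clogid = 3, cpi_coreid = 3, and cpi_pkgcoreid = 1.
 */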
2651 for (i = 1; i < ncpu_per_core; i <<= 1)
2652 coreid_shift++;
2653 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2654 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2655 } else if (is_x86_feature(feature, X86FSET_HTT)) {
2656 /*
2657 * Single-core multi-threaded processors.
2658 */
2659 cpi->cpi_coreid = cpi->cpi_chipid;
2660 cpi->cpi_pkgcoreid = 0;
2661 } else {
2662 /*
2663 * Single-core single-thread processors.
2664 */
2665 cpi->cpi_coreid = cpu->cpu_id;
2666 cpi->cpi_pkgcoreid = 0;
2667 }
2668 cpi->cpi_procnodeid = cpi->cpi_chipid;
2669 cpi->cpi_compunitid = cpi->cpi_coreid;
2670 }
2671
2672 /*
2673 * Historically, AMD has had CMP chips with only a single thread per core.
2674 * However, starting in family 17h (Zen), this has changed and they now have
2675 * multiple threads. Our internal core id needs to be a unique value.
2676 *
2677 * To determine the core id of an AMD system, if we're from a family before 17h,
2678 * then we just use the cpu id, as that gives us a good value that will be
2679 * unique for each core. If instead, we're on family 17h or later, then we need
2680 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2681 * how many threads are in the system. Based on that, we'll shift the APIC ID.
2682 * We can't use the normal core id in that leaf as it's only unique within the
2683 * socket, which is perfect for cpi_pkgcoreid, but not us.
2684 */
2685 static id_t
2686 cpuid_amd_get_coreid(cpu_t *cpu)
2687 {
2688 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2689
2690 if (cpi->cpi_family >= 0x17 &&
2691 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2692 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2693 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2694 if (nthreads > 1) {
2695 VERIFY3U(nthreads, ==, 2);
2696 return (cpi->cpi_apicid >> 1);
2697 }
2698 }
2699
2700 return (cpu->cpu_id);
2701 }
2702
2703 /*
2704 * Constructing IDs on AMD is a more challenging task. This is notable
2705 * because of the following two facts:
2706 *
2707 * 1. Before family 0x17 (Zen), there was no support for SMT and there was
2708 * also no way to get an actual unique core id from the system. As such, we
2709 * synthesize this case by using cpu->cpu_id. This scheme does not,
2710 * however, guarantee that sibling cores of a chip will have sequential
2711 * coreids starting at a multiple of the number of cores per chip - that is
2712 * usually the case, but if the APIC IDs have been set up in a different
2713 * order then we need to perform a few more gymnastics for the pkgcoreid.
2714 *
2715 * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2716 * called compute units. These compute units share the L1I cache, L2 cache,
2717 * and the FPU. To deal with this, a new topology leaf was added in
2718 * 0x8000001e. However, parts of this leaf have different meanings
2719 * once we get to family 0x17.
2720 */
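/*
 * As a concrete (hypothetical) illustration of the cases below: a family 0x17
 * part with two threads per core and leaf 0x80000008 %ecx[15:12] = 4 has
 * coreidsz = 4, so an APIC ID of 0x12 yields cpi_clogid = 0x12 & 0xf = 2,
 * cpi_pkgcoreid = 2 >> 1 = 1, and cpi_coreid = 0x12 >> 1 = 9. A pre-17h part
 * instead uses cpu->cpu_id for cpi_coreid and the unshifted clogid for
 * cpi_pkgcoreid.
 */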
2721
2722 static void
2723 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2724 {
2725 int i, first_half, coreidsz;
2726 uint32_t nb_caps_reg;
2727 uint_t node2_1;
2728 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2729 struct cpuid_regs *cp;
2730
2731 /*
2732 * Calculate the core id (this comes from hardware in family 0x17 if it
2733 * hasn't been stripped by virtualization). We always set the compute
2734 * unit id to the same value. Also, initialize the default number of
2735 * cores per compute unit and nodes per package. This will be
2736 * overwritten when we know information about a particular family.
2737 */
2738 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2739 cpi->cpi_compunitid = cpi->cpi_coreid;
2740 cpi->cpi_cores_per_compunit = 1;
2741 cpi->cpi_procnodes_per_pkg = 1;
2742
2743 /*
2744 * To construct the logical ID, we need to determine how many APIC IDs
2745 * are dedicated to the cores and threads. This is provided for us in
2746 * 0x80000008. However, if it's not present (say due to virtualization),
2747 * then we assume it's one. This should be present on all 64-bit AMD
2748 * processors. It was added in family 0xf (Hammer).
2749 */
2750 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2751 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2752
2753 /*
2754 * In AMD parlance chip is really a node while illumos
2755 * uses chip as equivalent to socket/package.
2756 */
2757 if (coreidsz == 0) {
2758 /* Use legacy method */
2759 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2760 coreidsz++;
2761 if (coreidsz == 0)
2762 coreidsz = 1;
2763 }
2764 } else {
2765 /* Assume single-core part */
2766 coreidsz = 1;
2767 }
2768 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2769
2770 /*
2771 * The package core ID varies depending on the family. While it may be
2772 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2773 * this value is the core id in the given node. For non-virtualized
2774 * family 17h, we need to take the logical core id and shift off the
2775 * threads like we do when getting the core id. Otherwise, we can use
2776 * the clogid as is. When family 17h is virtualized, the clogid should
2777 * be sufficient: if we don't have valid data in the leaf, then we won't
2778 * think we have SMT, in which case cpi_clogid is all we need.
2780 */
2781 if (cpi->cpi_family >= 0x17 &&
2782 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2783 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2784 cpi->cpi_extd[0x1e].cp_ebx != 0) {
2785 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2786 if (nthreads > 1) {
2787 VERIFY3U(nthreads, ==, 2);
2788 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2789 } else {
2790 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2791 }
2792 } else {
2793 cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2794 }
2795
2796 /*
2797 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2798 * (bulldozer) or newer, then we can derive all of this from leaf
2799 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2800 */
2801 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2802 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2803 cp = &cpi->cpi_extd[0x1e];
2804
2805 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2806 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2807
2808 /*
2809 * For Bulldozer-era CPUs, recalculate the compute unit
2810 * information.
2811 */
2812 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2813 cpi->cpi_cores_per_compunit =
2814 BITX(cp->cp_ebx, 15, 8) + 1;
2815 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2816 (cpi->cpi_ncore_per_chip /
2817 cpi->cpi_cores_per_compunit) *
2818 (cpi->cpi_procnodeid /
2819 cpi->cpi_procnodes_per_pkg);
2820 }
2821 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2822 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2823 } else if (cpi->cpi_family == 0x10) {
2824 /*
2825 * See if we are a multi-node processor.
2826 * All processors in the system have the same number of nodes
2827 */
2828 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8);
2829 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2830 /* Single-node */
2831 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2832 coreidsz);
2833 } else {
2834
2835 /*
2836 * Multi-node revision D (2 nodes per package
2837 * are supported)
2838 */
2839 cpi->cpi_procnodes_per_pkg = 2;
2840
2841 first_half = (cpi->cpi_pkgcoreid <=
2842 (cpi->cpi_ncore_per_chip/2 - 1));
2843
2844 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2845 /* We are BSP */
2846 cpi->cpi_procnodeid = (first_half ? 0 : 1);
2847 } else {
2848
2849 /* We are AP */
2850 /* NodeId[2:1] bits to use for reading F3xe8 */
2851 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2852
2853 nb_caps_reg =
2854 pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2855
2856 /*
2857 * Check IntNodeNum bit (31:30, but bit 31 is
2858 * always 0 on dual-node processors)
2859 */
2860 if (BITX(nb_caps_reg, 30, 30) == 0)
2861 cpi->cpi_procnodeid = node2_1 +
2862 !first_half;
2863 else
2864 cpi->cpi_procnodeid = node2_1 +
2865 first_half;
2866 }
2867 }
2868 } else {
2869 cpi->cpi_procnodeid = 0;
2870 }
2871
2872 cpi->cpi_chipid =
2873 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2874
2875 cpi->cpi_ncore_bits = coreidsz;
2876 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2877 cpi->cpi_ncore_per_chip);
2878 }
2879
2880 static void
2881 spec_uarch_flush_noop(void)
2882 {
2883 }
2884
2885 /*
2886 * When microcode is present that mitigates MDS, this wrmsr will also flush the
2887 * MDS-related micro-architectural state that would normally be flushed by
2888 * calling x86_md_clear().
2889 */
2890 static void
2891 spec_uarch_flush_msr(void)
2892 {
2893 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2894 }
2895
2896 /*
2897 * This function pointer refers to a function that will flush certain
2898 * micro-architectural state on the processor. This flush is used to mitigate
2899 * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2900 * It can point to one of three functions:
2901 *
2902 * - A noop, which is used either because we are vulnerable but do not have
2903 * microcode available to help deal with it, or because we aren't
2904 * vulnerable at all.
2905 *
2906 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2907 * mitigate MDS is present, also perform the equivalent of the MDS flush;
2908 * however, it only flushes the MDS related micro-architectural state on the
2909 * current hyperthread, it does not do anything for the twin.
2910 *
2911 * - x86_md_clear which will flush the MDS related state. This is done when we
2912 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2913 * (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2914 * can clear it (RFDS_CLEAR is set).
2915 */
2916 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
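/*
 * For example, a CPU that enumerates RDCL_NO and MD_CLEAR but not MDS_NO ends
 * up with spec_uarch_flush set to x86_md_clear, while one lacking RDCL_NO but
 * offering FLUSH_CMD (and not L1D_VM_NO) gets spec_uarch_flush_msr; see
 * cpuid_update_l1d_flush() below for the actual selection logic.
 */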
2917
2918 static void
2919 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2920 {
2921 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2922
2923 /* Non-Intel doesn't concern us here. */
2924 if (cpi->cpi_vendor != X86_VENDOR_Intel)
2925 return;
2926
2927 /*
2928 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2929 * has been fixed in hardware, it doesn't cover everything related to
2930 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2931 * need to mitigate this.
2932 *
2933 * We must ALSO check the case where RFDS_NO is not set but RFDS_CLEAR is,
2934 * since the same VERW sequence is used to mitigate RFDS.
2935 */
2936
2937 if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2938 is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2939 (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2940 is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2941 const uint8_t nop = NOP_INSTR;
2942 uint8_t *md = (uint8_t *)x86_md_clear;
2943
2944 *md = nop;
2945 }
2946
2947 membar_producer();
2948 }
2949
2950 static void
2951 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2952 {
2953 boolean_t need_l1d, need_mds, need_rfds;
2954 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2955
2956 /*
2957 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2958 * in hardware, then there's nothing left for us to do for enabling
2959 * the flush. We can also go ahead and say that SMT exclusion is
2960 * unnecessary.
2961 */
2962 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2963 (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2964 is_x86_feature(featureset, X86FSET_MDS_NO) &&
2965 is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2966 extern int smt_exclusion;
2967 smt_exclusion = 0;
2968 spec_uarch_flush = spec_uarch_flush_noop;
2969 membar_producer();
2970 return;
2971 }
2972
2973 /*
2974 * The locations where we need to perform an L1D flush are required both
2975 * for mitigating L1TF and MDS. When verw support is present in
2976 * microcode, then the L1D flush will take care of doing that as well.
2977 * However, if we have a system where RDCL_NO is present, but we don't
2978 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2979 * L1D flush.
2980 */
2981 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2982 is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2983 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2984 need_l1d = B_TRUE;
2985 } else {
2986 need_l1d = B_FALSE;
2987 }
2988
2989 if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2990 is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2991 need_mds = B_TRUE;
2992 } else {
2993 need_mds = B_FALSE;
2994 }
2995
2996 if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2997 is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2998 need_rfds = B_TRUE;
2999 } else {
3000 need_rfds = B_FALSE;
3001 }
3002
3003 if (need_l1d) {
3004 /*
3005 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
3006 * together. If the following VERIFY trips, we need to add
3007 * further fixes here.
3008 */
3009 VERIFY(!need_rfds);
3010 spec_uarch_flush = spec_uarch_flush_msr;
3011 } else if (need_mds || need_rfds) {
3012 spec_uarch_flush = x86_md_clear;
3013 } else {
3014 /*
3015 * We have no hardware mitigations available to us.
3016 */
3017 spec_uarch_flush = spec_uarch_flush_noop;
3018 }
3019 membar_producer();
3020 }
3021
3022 /*
3023 * Branch History Injection (BHI) mitigations.
3024 *
3025 * Intel has provided a software sequence that will scrub the BHB. Like RSB
3026 * (below) we can scribble a return at the beginning to avoid it if the CPU
3027 * is modern enough. We can also scribble a return if the CPU is old enough
3028 * to not have an RSB (pre-eIBRS).
3029 */
3030 typedef enum {
3031 X86_BHI_TOO_OLD_OR_DISABLED, /* Pre-eIBRS or disabled */
3032 X86_BHI_NEW_ENOUGH, /* AMD, or Intel with BHI_NO set */
3033 X86_BHI_DIS_S, /* BHI_NO == 0, but BHI_DIS_S avail. */
3034 /* NOTE: BHI_DIS_S above will still need the software sequence. */
3035 X86_BHI_SOFTWARE_SEQUENCE, /* Use software sequence */
3036 } x86_native_bhi_mitigation_t;
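/*
 * For example, a pre-eIBRS Intel CPU (no BHI_CTRL) that is using retpolines
 * ends up as X86_BHI_TOO_OLD_OR_DISABLED with a return scribbled over
 * x86_bhb_clear(), while an Intel CPU that enumerates BHI_CTRL but not BHI_NO
 * gets BHI_DIS_S enabled and still keeps the software sequence in place; see
 * cpuid_learn_and_patch_bhi() below.
 */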
3037
3038 x86_native_bhi_mitigation_t x86_bhi_mitigation = X86_BHI_SOFTWARE_SEQUENCE;
3039
3040 static void
3041 cpuid_enable_bhi_dis_s(void)
3042 {
3043 uint64_t val;
3044
3045 val = rdmsr(MSR_IA32_SPEC_CTRL);
3046 val |= IA32_SPEC_CTRL_BHI_DIS_S;
3047 wrmsr(MSR_IA32_SPEC_CTRL, val);
3048 }
3049
3050 /*
3051 * This function scribbles RET into the first instruction of x86_bhb_clear()
3052 * if SPECTREV2 mitigations are disabled, the CPU is too old, the CPU is new
3053 * enough to have the fix (which includes non-Intel CPUs), or the CPU has an
3054 * explicit disable-Branch-History control.
3055 */
3056 static x86_native_bhi_mitigation_t
3057 cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit, cpu_t *cpu,
3058 uchar_t *featureset)
3059 {
3060 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3061 const uint8_t ret = RET_INSTR;
3062 uint8_t *bhb_clear = (uint8_t *)x86_bhb_clear;
3063
3064 ASSERT0(cpu->cpu_id);
3065
3066 /* First check for explicitly disabled... */
3067 if (v2mit == X86_SPECTREV2_DISABLED) {
3068 *bhb_clear = ret;
3069 return (X86_BHI_TOO_OLD_OR_DISABLED);
3070 }
3071
3072 /*
3073 * Then check for BHI_NO, which means the CPU doesn't have this bug,
3074 * or if it's non-Intel, in which case this mitigation mechanism
3075 * doesn't apply.
3076 */
3077 if (cpi->cpi_vendor != X86_VENDOR_Intel ||
3078 is_x86_feature(featureset, X86FSET_BHI_NO)) {
3079 *bhb_clear = ret;
3080 return (X86_BHI_NEW_ENOUGH);
3081 }
3082
3083 /*
3084 * Now check for the BHI_CTRL MSR, and then set it if available.
3085 * We will still need to use the software sequence, however.
3086 */
3087 if (is_x86_feature(featureset, X86FSET_BHI_CTRL)) {
3088 cpuid_enable_bhi_dis_s();
3089 return (X86_BHI_DIS_S);
3090 }
3091
3092 /*
3093 * Finally, check if we are too old to bother with RSB:
3094 */
3095 if (v2mit == X86_SPECTREV2_RETPOLINE) {
3096 *bhb_clear = ret;
3097 return (X86_BHI_TOO_OLD_OR_DISABLED);
3098 }
3099
3100 ASSERT(*bhb_clear != ret);
3101 return (X86_BHI_SOFTWARE_SEQUENCE);
3102 }
3103
3104 /*
3105 * We default to enabling Return Stack Buffer (RSB) mitigations.
3106 *
3107 * We used to skip RSB mitigations with Intel eIBRS, but developments around
3108 * post-barrier RSB (PBRSB) guessing suggests we should enable Intel RSB
3109 * mitigations always unless explicitly bypassed, or unless hardware indicates
3110 * the bug has been fixed.
3111 *
3112 * The current decisions for using, or ignoring, an RSB software stuffing
3113 * sequence are expressed by the following table:
3114 *
3115 * +-------+------------+-----------------+--------+
3116 * | eIBRS | PBRSB_NO | context switch | vmexit |
3117 * +-------+------------+-----------------+--------+
3118 * | Yes | No | stuff | stuff |
3119 * | Yes | Yes | ignore | ignore |
3120 * | No | No | stuff | ignore |
3121 * +-------+------------+-----------------+--------+
3122 *
3123 * Note that if an Intel CPU has no eIBRS, it will never enumerate PBRSB_NO,
3124 * because machines with no eIBRS do not have a problem with PBRSB overflow.
3125 * See the Intel document cited below for details.
3126 *
3127 * Also note that AMD AUTO_IBRS has no PBRSB problem, so it is not included in
3128 * the table above, and that there is no situation where vmexit stuffing is
3129 * needed, but context-switch stuffing isn't.
3130 */
3131
3132 /* BEGIN CSTYLED */
3133 /*
3134 * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/post-barrier-return-stack-buffer-predictions.html
3135 */
3136 /* END CSTYLED */
3137
3138 /*
3139 * AMD indicates that when Automatic IBRS is enabled we do not need to implement
3140 * return stack buffer clearing for VMEXIT as it takes care of it. The manual
3141 * also states that as long as SMEP is enabled and we maintain at least one
3142 * page between the kernel and user space (we have much more of a red zone),
3143 * then we do not need to clear the RSB. We constrain this to only when
3144 * Automatic IBRS is present.
3145 */
3146 static void
3147 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
3148 {
3149 const uint8_t ret = RET_INSTR;
3150 uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3151 uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
3152
3153 switch (mit) {
3154 case X86_SPECTREV2_AUTO_IBRS:
3155 case X86_SPECTREV2_DISABLED:
3156 /* Don't bother with any RSB stuffing! */
3157 *stuff = ret;
3158 *vmx_stuff = ret;
3159 break;
3160 case X86_SPECTREV2_RETPOLINE:
3161 /*
3162 * The Intel document on Post-Barrier RSB says that processors
3163 * without eIBRS do not have PBRSB problems upon VMEXIT.
3164 */
3165 VERIFY(!intel_pbrsb_no);
3166 VERIFY3U(*stuff, !=, ret);
3167 *vmx_stuff = ret;
3168 break;
3169 default:
3170 /*
3171 * eIBRS is all that's left. If CPU claims PBRSB is fixed,
3172 * don't use the RSB mitigation in either case. Otherwise
3173 * both vmexit and context-switching require the software
3174 * mitigation.
3175 */
3176 if (intel_pbrsb_no) {
3177 /* CPU claims PBRSB problems are fixed. */
3178 *stuff = ret;
3179 *vmx_stuff = ret;
3180 }
3181 VERIFY3U(*stuff, ==, *vmx_stuff);
3182 break;
3183 }
3184 }
3185
3186 static void
3187 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3188 {
3189 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3190 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3191 "_r14", "_r15" };
3192 const uint_t nthunks = ARRAY_SIZE(thunks);
3193 const char *type;
3194 uint_t i;
3195
3196 if (mit == x86_spectrev2_mitigation)
3197 return;
3198
3199 switch (mit) {
3200 case X86_SPECTREV2_RETPOLINE:
3201 type = "gen";
3202 break;
3203 case X86_SPECTREV2_AUTO_IBRS:
3204 case X86_SPECTREV2_ENHANCED_IBRS:
3205 case X86_SPECTREV2_DISABLED:
3206 type = "jmp";
3207 break;
3208 default:
3209 panic("asked to update retpoline state with unknown state!");
3210 }
3211
3212 for (i = 0; i < nthunks; i++) {
3213 uintptr_t source, dest;
3214 int ssize, dsize;
3215 char sourcebuf[64], destbuf[64];
3216
3217 (void) snprintf(destbuf, sizeof (destbuf),
3218 "__x86_indirect_thunk%s", thunks[i]);
3219 (void) snprintf(sourcebuf, sizeof (sourcebuf),
3220 "__x86_indirect_thunk_%s%s", type, thunks[i]);
3221
3222 source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3223 dest = kobj_getelfsym(destbuf, NULL, &dsize);
3224 VERIFY3U(source, !=, 0);
3225 VERIFY3U(dest, !=, 0);
3226 VERIFY3S(dsize, >=, ssize);
3227 bcopy((void *)source, (void *)dest, ssize);
3228 }
3229 }
3230
3231 static void
3232 cpuid_enable_enhanced_ibrs(void)
3233 {
3234 uint64_t val;
3235
3236 val = rdmsr(MSR_IA32_SPEC_CTRL);
3237 val |= IA32_SPEC_CTRL_IBRS;
3238 wrmsr(MSR_IA32_SPEC_CTRL, val);
3239 }
3240
3241 static void
3242 cpuid_enable_auto_ibrs(void)
3243 {
3244 uint64_t val;
3245
3246 val = rdmsr(MSR_AMD_EFER);
3247 val |= AMD_EFER_AIBRSE;
3248 wrmsr(MSR_AMD_EFER, val);
3249 }
3250
3251 /*
3252 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3253 * we can disable TSX, we do so.
3254 *
3255 * This determination is done only on the boot CPU, potentially after loading
3256 * updated microcode.
3257 */
3258 static void
3259 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3260 {
3261 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3262
3263 VERIFY(cpu->cpu_id == 0);
3264
3265 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3266 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3267 return;
3268 }
3269
3270 if (x86_disable_taa) {
3271 x86_taa_mitigation = X86_TAA_DISABLED;
3272 return;
3273 }
3274
3275 /*
3276 * If we do not have the ability to disable TSX, then our only
3277 * mitigation options are in hardware (TAA_NO), or by using our existing
3278 * MDS mitigation as described above. The latter relies upon us having
3279 * configured MDS mitigations correctly! This includes disabling SMT if
3280 * we want cross-CPU-thread protection.
3281 */
3282 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3283 /*
3284 * It's not clear whether any parts will enumerate TAA_NO
3285 * *without* TSX_CTRL, but let's mark it as such if we see this.
3286 */
3287 if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3288 x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3289 return;
3290 }
3291
3292 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3293 !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3294 x86_taa_mitigation = X86_TAA_MD_CLEAR;
3295 } else {
3296 x86_taa_mitigation = X86_TAA_NOTHING;
3297 }
3298 return;
3299 }
3300
3301 /*
3302 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3303 * enough in boot.
3304 *
3305 * Otherwise, we'll fall back to causing transactions to abort as our
3306 * mitigation. TSX-using code will always take the fallback path.
3307 */
3308 if (cpi->cpi_pass < 4) {
3309 x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3310 } else {
3311 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3312 }
3313 }
3314
3315 /*
3316 * As mentioned, we should only touch the MSR when we've got a suitable
3317 * microcode loaded on this CPU.
3318 */
3319 static void
3320 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3321 {
3322 uint64_t val;
3323
3324 switch (taa) {
3325 case X86_TAA_TSX_DISABLE:
3326 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3327 return;
3328 val = rdmsr(MSR_IA32_TSX_CTRL);
3329 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3330 wrmsr(MSR_IA32_TSX_CTRL, val);
3331 break;
3332 case X86_TAA_TSX_FORCE_ABORT:
3333 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3334 return;
3335 val = rdmsr(MSR_IA32_TSX_CTRL);
3336 val |= IA32_TSX_CTRL_RTM_DISABLE;
3337 wrmsr(MSR_IA32_TSX_CTRL, val);
3338 break;
3339 case X86_TAA_HW_MITIGATED:
3340 case X86_TAA_MD_CLEAR:
3341 case X86_TAA_DISABLED:
3342 case X86_TAA_NOTHING:
3343 break;
3344 }
3345 }
3346
3347 static void
3348 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3349 {
3350 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3351 x86_spectrev2_mitigation_t v2mit;
3352
3353 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3354 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3355 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3356 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3357 add_x86_feature(featureset, X86FSET_IBPB);
3358 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3359 add_x86_feature(featureset, X86FSET_IBRS);
3360 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3361 add_x86_feature(featureset, X86FSET_STIBP);
3362 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3363 add_x86_feature(featureset, X86FSET_STIBP_ALL);
3364 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3365 add_x86_feature(featureset, X86FSET_SSBD);
3366 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3367 add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3368 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3369 add_x86_feature(featureset, X86FSET_SSB_NO);
3370
3371 /*
3372 * Rather than Enhanced IBRS, AMD has a different feature that
3373 * is a bit in EFER that can be enabled and will basically do
3374 * the right thing while executing in the kernel.
3375 */
3376 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3377 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3378 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3379 (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3380 add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3381 }
3382
3383 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3384 cpi->cpi_maxeax >= 7) {
3385 struct cpuid_regs *ecp;
3386 ecp = &cpi->cpi_std[7];
3387
3388 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3389 add_x86_feature(featureset, X86FSET_MD_CLEAR);
3390 }
3391
3392 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3393 add_x86_feature(featureset, X86FSET_IBRS);
3394 add_x86_feature(featureset, X86FSET_IBPB);
3395 }
3396
3397 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3398 add_x86_feature(featureset, X86FSET_STIBP);
3399 }
3400
3401 /*
3402 * Some prediction controls are enumerated by subleaf 2 of
3403 * leaf 7.
3404 */
3405 if (CPI_FEATURES_7_2_EDX(cpi) & CPUID_INTC_EDX_7_2_BHI_CTRL) {
3406 add_x86_feature(featureset, X86FSET_BHI_CTRL);
3407 }
3408
3409 /*
3410 * Don't read the arch caps MSR on xpv where we lack the
3411 * on_trap().
3412 */
3413 #ifndef __xpv
3414 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3415 on_trap_data_t otd;
3416
3417 /*
3418 * Be paranoid and assume we'll get a #GP.
3419 */
3420 if (!on_trap(&otd, OT_DATA_ACCESS)) {
3421 uint64_t reg;
3422
3423 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3424 if (reg & IA32_ARCH_CAP_RDCL_NO) {
3425 add_x86_feature(featureset,
3426 X86FSET_RDCL_NO);
3427 }
3428 if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3429 add_x86_feature(featureset,
3430 X86FSET_IBRS_ALL);
3431 }
3432 if (reg & IA32_ARCH_CAP_RSBA) {
3433 add_x86_feature(featureset,
3434 X86FSET_RSBA);
3435 }
3436 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3437 add_x86_feature(featureset,
3438 X86FSET_L1D_VM_NO);
3439 }
3440 if (reg & IA32_ARCH_CAP_SSB_NO) {
3441 add_x86_feature(featureset,
3442 X86FSET_SSB_NO);
3443 }
3444 if (reg & IA32_ARCH_CAP_MDS_NO) {
3445 add_x86_feature(featureset,
3446 X86FSET_MDS_NO);
3447 }
3448 if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3449 add_x86_feature(featureset,
3450 X86FSET_TSX_CTRL);
3451 }
3452 if (reg & IA32_ARCH_CAP_TAA_NO) {
3453 add_x86_feature(featureset,
3454 X86FSET_TAA_NO);
3455 }
3456 if (reg & IA32_ARCH_CAP_RFDS_NO) {
3457 add_x86_feature(featureset,
3458 X86FSET_RFDS_NO);
3459 }
3460 if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3461 add_x86_feature(featureset,
3462 X86FSET_RFDS_CLEAR);
3463 }
3464 if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3465 add_x86_feature(featureset,
3466 X86FSET_PBRSB_NO);
3467 }
3468 if (reg & IA32_ARCH_CAP_BHI_NO) {
3469 add_x86_feature(featureset,
3470 X86FSET_BHI_NO);
3471 }
3472 }
3473 no_trap();
3474 }
3475 #endif /* !__xpv */
3476
3477 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3478 add_x86_feature(featureset, X86FSET_SSBD);
3479
3480 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3481 add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3482 }
3483
3484 /*
3485 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3486 * will have already run this function and determined what we need to
3487 * do. This gives us a hook for per-HW thread mitigations such as
3488 * enhanced IBRS, or disabling TSX.
3489 */
3490 if (cpu->cpu_id != 0) {
3491 switch (x86_spectrev2_mitigation) {
3492 case X86_SPECTREV2_ENHANCED_IBRS:
3493 cpuid_enable_enhanced_ibrs();
3494 break;
3495 case X86_SPECTREV2_AUTO_IBRS:
3496 cpuid_enable_auto_ibrs();
3497 break;
3498 default:
3499 break;
3500 }
3501
3502 /* If we're committed to BHI_DIS_S, set it for this core. */
3503 if (x86_bhi_mitigation == X86_BHI_DIS_S)
3504 cpuid_enable_bhi_dis_s();
3505
3506 cpuid_apply_tsx(x86_taa_mitigation, featureset);
3507 return;
3508 }
3509
3510 /*
3511 * Go through and initialize various security mechanisms that we should
3512 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3513 * TAA.
3514 */
3515
3516 /*
3517 * By default we've come in with retpolines enabled. Check whether we
3518 * should disable them or enable enhanced or automatic IBRS.
3519 *
3520 * Note, we do not allow the use of AMD optimized retpolines as it was
3521 * disclosed by AMD in March 2022 that they were still
3522 * vulnerable. Prior to that point, we used them.
3523 */
3524 if (x86_disable_spectrev2 != 0) {
3525 v2mit = X86_SPECTREV2_DISABLED;
3526 } else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3527 cpuid_enable_auto_ibrs();
3528 v2mit = X86_SPECTREV2_AUTO_IBRS;
3529 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3530 cpuid_enable_enhanced_ibrs();
3531 v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3532 } else {
3533 v2mit = X86_SPECTREV2_RETPOLINE;
3534 }
3535
3536 cpuid_patch_retpolines(v2mit);
3537 cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3538 x86_bhi_mitigation = cpuid_learn_and_patch_bhi(v2mit, cpu, featureset);
3539 x86_spectrev2_mitigation = v2mit;
3540 membar_producer();
3541
3542 /*
3543 * We need to determine what changes are required for mitigating L1TF
3544 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3545 * is required.
3546 *
3547 * If any of these are present, then we need to flush u-arch state at
3548 * various points. For MDS, we need to do so whenever we change to a
3549 * lesser privilege level or we are halting the CPU. For L1TF we need to
3550 * flush the L1D cache at VM entry. When we have microcode that handles
3551 * MDS, the L1D flush also clears the other u-arch state that the
3552 * md_clear does.
3553 */
3554
3555 /*
3556 * Update whether or not we need to be taking explicit action against
3557 * MDS or RFDS.
3558 */
3559 cpuid_update_md_clear(cpu, featureset);
3560
3561 /*
3562 * Determine whether SMT exclusion is required and whether or not we
3563 * need to perform an l1d flush.
3564 */
3565 cpuid_update_l1d_flush(cpu, featureset);
3566
3567 /*
3568 * Determine what our mitigation strategy should be for TAA and then
3569 * also apply TAA mitigations.
3570 */
3571 cpuid_update_tsx(cpu, featureset);
3572 cpuid_apply_tsx(x86_taa_mitigation, featureset);
3573 }
3574
3575 /*
3576 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3577 */
3578 void
3579 setup_xfem(void)
3580 {
3581 uint64_t flags = XFEATURE_LEGACY_FP;
3582
3583 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3584
3585 if (is_x86_feature(x86_featureset, X86FSET_SSE))
3586 flags |= XFEATURE_SSE;
3587
3588 if (is_x86_feature(x86_featureset, X86FSET_AVX))
3589 flags |= XFEATURE_AVX;
3590
3591 if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3592 flags |= XFEATURE_AVX512;
3593
3594 set_xcr(XFEATURE_ENABLED_MASK, flags);
3595
3596 xsave_bv_all = flags;
3597 }
3598
3599 static void
3600 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3601 {
3602 struct cpuid_info *cpi;
3603
3604 cpi = cpu->cpu_m.mcpu_cpi;
3605
3606 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3607 cpi->cpi_vendor == X86_VENDOR_HYGON) {
3608 cpuid_gather_amd_topology_leaves(cpu);
3609 }
3610
3611 cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3612
3613 /*
3614 * Before we can calculate the IDs that we should assign to this
3615 * processor, we need to understand how many cores and threads it has.
3616 */
3617 switch (cpi->cpi_vendor) {
3618 case X86_VENDOR_Intel:
3619 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3620 &cpi->cpi_ncore_per_chip);
3621 break;
3622 case X86_VENDOR_AMD:
3623 case X86_VENDOR_HYGON:
3624 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3625 &cpi->cpi_ncore_per_chip);
3626 break;
3627 default:
3628 /*
3629 * If we have some other x86 compatible chip, it's not clear how
3630 * it would behave. The most common case is virtualization
3631 * today, though there are also 64-bit VIA chips. Assume that
3632 * all we can get is the basic Leaf 1 HTT information.
3633 */
3634 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3635 cpi->cpi_ncore_per_chip = 1;
3636 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3637 }
3638 break;
3639 }
3640
3641 /*
3642 * Based on the calculated number of threads and cores, potentially
3643 * assign the HTT and CMT features.
3644 */
3645 if (cpi->cpi_ncore_per_chip > 1) {
3646 add_x86_feature(featureset, X86FSET_CMP);
3647 }
3648
3649 if (cpi->cpi_ncpu_per_chip > 1 &&
3650 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3651 add_x86_feature(featureset, X86FSET_HTT);
3652 }
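/*
 * For example (hypothetical counts): ncore_per_chip = 4 with
 * ncpu_per_chip = 8 sets both CMP and HTT; 4 and 4 sets only CMP;
 * 1 and 2 sets only HTT; 1 and 1 sets neither, and we fall into the
 * single core, single-threaded case below.
 */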
3653
3654 /*
3655 * Now that this has been set up, we need to go through and calculate all of
3656 * the rest of the parameters that exist. If we think the CPU doesn't
3657 * have either SMT (HTT) or CMP, then we basically go through and fake
3658 * up information in some way. The most likely case for this is
3659 * virtualization where we have a lot of partial topology information.
3660 */
3661 if (!is_x86_feature(featureset, X86FSET_HTT) &&
3662 !is_x86_feature(featureset, X86FSET_CMP)) {
3663 /*
3664 * This is a single core, single-threaded processor.
3665 */
3666 cpi->cpi_procnodes_per_pkg = 1;
3667 cpi->cpi_cores_per_compunit = 1;
3668 cpi->cpi_compunitid = 0;
3669 cpi->cpi_chipid = -1;
3670 cpi->cpi_clogid = 0;
3671 cpi->cpi_coreid = cpu->cpu_id;
3672 cpi->cpi_pkgcoreid = 0;
3673 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3674 cpi->cpi_vendor == X86_VENDOR_HYGON) {
3675 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3676 } else {
3677 cpi->cpi_procnodeid = cpi->cpi_chipid;
3678 }
3679 } else {
3680 switch (cpi->cpi_vendor) {
3681 case X86_VENDOR_Intel:
3682 cpuid_intel_getids(cpu, featureset);
3683 break;
3684 case X86_VENDOR_AMD:
3685 case X86_VENDOR_HYGON:
3686 cpuid_amd_getids(cpu, featureset);
3687 break;
3688 default:
3689 /*
3690 * In this case, it's hard to say what we should do.
3691 * We're going to model them to the OS as single core
3692 * threads. We don't have a good identifier for them, so
3693 * we're just going to use the cpu id all on a single
3694 * chip.
3695 *
3696 * This case has historically been different from the
3697 * case above where we don't have HTT or CMP. While they
3698 * could be combined, we've opted to keep it separate to
3699 * minimize the risk of topology changes in weird cases.
3700 */
3701 cpi->cpi_procnodes_per_pkg = 1;
3702 cpi->cpi_cores_per_compunit = 1;
3703 cpi->cpi_chipid = 0;
3704 cpi->cpi_coreid = cpu->cpu_id;
3705 cpi->cpi_clogid = cpu->cpu_id;
3706 cpi->cpi_pkgcoreid = cpu->cpu_id;
3707 cpi->cpi_procnodeid = cpi->cpi_chipid;
3708 cpi->cpi_compunitid = cpi->cpi_coreid;
3709 break;
3710 }
3711 }
3712 }
3713
3714 /*
3715 * Gather relevant CPU features from leaf 6 which covers thermal information. We
3716 * always gather leaf 6 if it's supported; however, we only look for features on
3717 * Intel systems as AMD does not currently define any of the features we look
3718 * for below.
3719 */
3720 static void
3721 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3722 {
3723 struct cpuid_regs *cp;
3724 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3725
3726 if (cpi->cpi_maxeax < 6) {
3727 return;
3728 }
3729
3730 cp = &cpi->cpi_std[6];
3731 cp->cp_eax = 6;
3732 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3733 (void) __cpuid_insn(cp);
3734 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3735
3736 if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3737 return;
3738 }
3739
3740 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3741 add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3742 }
3743
3744 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3745 add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3746 }
3747 }
3748
3749 /*
3750 * This is used when we discover that we have AVX support in cpuid. This
3751 * proceeds to scan for the rest of the AVX derived features.
3752 */
3753 static void
3754 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3755 {
3756 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3757
3758 /*
3759 * If we don't have AVX, don't bother with most of this.
3760 */
3761 if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3762 return;
3763
3764 add_x86_feature(featureset, X86FSET_AVX);
3765
3766 /*
3767 * Intel says we can't check these without also
3768 * checking AVX.
3769 */
3770 if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3771 add_x86_feature(featureset, X86FSET_F16C);
3772
3773 if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3774 add_x86_feature(featureset, X86FSET_FMA);
3775
3776 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3777 add_x86_feature(featureset, X86FSET_BMI1);
3778
3779 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3780 add_x86_feature(featureset, X86FSET_BMI2);
3781
3782 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3783 add_x86_feature(featureset, X86FSET_AVX2);
3784
3785 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3786 add_x86_feature(featureset, X86FSET_VAES);
3787
3788 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3789 add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3790
3791 /*
3792 * The rest of the AVX features require AVX512. Do not check them unless
3793 * it is present.
3794 */
3795 if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3796 return;
3797 add_x86_feature(featureset, X86FSET_AVX512F);
3798
3799 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3800 add_x86_feature(featureset, X86FSET_AVX512DQ);
3801
3802 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3803 add_x86_feature(featureset, X86FSET_AVX512FMA);
3804
3805 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3806 add_x86_feature(featureset, X86FSET_AVX512PF);
3807
3808 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3809 add_x86_feature(featureset, X86FSET_AVX512ER);
3810
3811 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3812 add_x86_feature(featureset, X86FSET_AVX512CD);
3813
3814 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3815 add_x86_feature(featureset, X86FSET_AVX512BW);
3816
3817 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3818 add_x86_feature(featureset, X86FSET_AVX512VL);
3819
3820 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3821 add_x86_feature(featureset, X86FSET_AVX512VBMI);
3822
3823 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3824 add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3825
3826 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3827 add_x86_feature(featureset, X86FSET_AVX512VNNI);
3828
3829 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3830 add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3831
3832 if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3833 add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3834
3835 if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3836 add_x86_feature(featureset, X86FSET_AVX512NNIW);
3837
3838 if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3839 add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3840
3841 /*
3842 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3843 * we don't need to.
3844 */
3845 if (cpi->cpi_std[7].cp_eax < 1)
3846 return;
3847
3848 if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3849 add_x86_feature(featureset, X86FSET_AVX512_BF16);
3850 }
3851
3852 /*
3853 * PPIN is the protected processor inventory number. On AMD this is an actual
3854 * feature bit. However, on Intel systems we need to read the platform
3855 * information MSR if we're on a specific model.
3856 */
3857 #if !defined(__xpv)
3858 static void
3859 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3860 {
3861 on_trap_data_t otd;
3862 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3863
3864 switch (cpi->cpi_vendor) {
3865 case X86_VENDOR_AMD:
3866 /*
3867 * This leaf will have already been gathered in the topology
3868 * functions.
3869 */
3870 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3871 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3872 add_x86_feature(featureset, X86FSET_PPIN);
3873 }
3874 }
3875 break;
3876 case X86_VENDOR_Intel:
3877 if (cpi->cpi_family != 6)
3878 break;
3879 switch (cpi->cpi_model) {
3880 case INTC_MODEL_IVYBRIDGE_XEON:
3881 case INTC_MODEL_HASWELL_XEON:
3882 case INTC_MODEL_BROADWELL_XEON:
3883 case INTC_MODEL_BROADWELL_XEON_D:
3884 case INTC_MODEL_SKYLAKE_XEON:
3885 case INTC_MODEL_ICELAKE_XEON:
3886 if (!on_trap(&otd, OT_DATA_ACCESS)) {
3887 uint64_t value;
3888
3889 value = rdmsr(MSR_PLATFORM_INFO);
3890 if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3891 add_x86_feature(featureset,
3892 X86FSET_PPIN);
3893 }
3894 }
3895 no_trap();
3896 break;
3897 default:
3898 break;
3899 }
3900 break;
3901 default:
3902 break;
3903 }
3904 }
3905 #endif /* ! __xpv */
3906
3907 static void
3908 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3909 {
3910 uchar_t *featureset = (uchar_t *)arg;
3911
3912 /*
3913 * We don't run on any processor that doesn't have cpuid, and could not
3914 * possibly have arrived here.
3915 */
3916 add_x86_feature(featureset, X86FSET_CPUID);
3917 }
3918
3919 static void
3920 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3921 {
3922 struct cpuid_info *cpi;
3923 struct cpuid_regs *cp;
3924
3925 /*
3926 * We require that virtual/native detection be complete and that PCI
3927 * config space access has been set up; at present there is no reliable
3928 * way to determine the latter.
3929 */
3930 #if !defined(__xpv)
3931 ASSERT3S(platform_type, !=, -1);
3932 #endif /* !__xpv */
3933
3934 cpi = cpu->cpu_m.mcpu_cpi;
3935 ASSERT(cpi != NULL);
3936
3937 cp = &cpi->cpi_std[0];
3938 cp->cp_eax = 0;
3939 cpi->cpi_maxeax = __cpuid_insn(cp);
3940 {
3941 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3942 *iptr++ = cp->cp_ebx;
3943 *iptr++ = cp->cp_edx;
3944 *iptr++ = cp->cp_ecx;
3945 *(char *)&cpi->cpi_vendorstr[12] = '\0';
3946 }
3947
3948 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3949 x86_vendor = cpi->cpi_vendor; /* for compatibility */
3950
3951 /*
3952 * Limit the range in case of weird hardware
3953 */
3954 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3955 cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3956 if (cpi->cpi_maxeax < 1)
3957 return;
3958
3959 cp = &cpi->cpi_std[1];
3960 cp->cp_eax = 1;
3961 (void) __cpuid_insn(cp);
3962
3963 /*
3964 * Extract identifying constants for easy access.
3965 */
3966 cpi->cpi_model = CPI_MODEL(cpi);
3967 cpi->cpi_family = CPI_FAMILY(cpi);
3968
3969 if (cpi->cpi_family == 0xf)
3970 cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3971
3972 /*
3973 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3974 * Intel, and presumably everyone else, uses model == 0xf, as
3975 * one would expect (max value means possible overflow). Sigh.
3976 */
3977
3978 switch (cpi->cpi_vendor) {
3979 case X86_VENDOR_Intel:
3980 if (IS_EXTENDED_MODEL_INTEL(cpi))
3981 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3982 break;
3983 case X86_VENDOR_AMD:
3984 if (CPI_FAMILY(cpi) == 0xf)
3985 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3986 break;
3987 case X86_VENDOR_HYGON:
3988 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3989 break;
3990 default:
3991 if (cpi->cpi_model == 0xf)
3992 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3993 break;
3994 }
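/*
 * For instance, an Intel part with base family 6, base model 0xa, and
 * extended model 0x3 ends up with cpi_model = 0xa + (0x3 << 4) = 0x3a,
 * while AMD only folds in the extended model when the base family field
 * is 0xf.
 */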
3995
3996 cpi->cpi_step = CPI_STEP(cpi);
3997 cpi->cpi_brandid = CPI_BRANDID(cpi);
3998
3999 /*
4000 * Synthesize chip "revision" and socket type
4001 */
4002 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4003 cpi->cpi_model, cpi->cpi_step);
4004 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4005 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4006 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4007 cpi->cpi_model, cpi->cpi_step);
4008 cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
4009 cpi->cpi_model, cpi->cpi_step);
4010 }
4011
4012 static void
4013 cpuid_pass_basic(cpu_t *cpu, void *arg)
4014 {
4015 uchar_t *featureset = (uchar_t *)arg;
4016 uint32_t mask_ecx, mask_edx;
4017 struct cpuid_info *cpi;
4018 struct cpuid_regs *cp;
4019 int xcpuid;
4020 #if !defined(__xpv)
4021 extern int idle_cpu_prefer_mwait;
4022 #endif
4023
4024 cpi = cpu->cpu_m.mcpu_cpi;
4025 ASSERT(cpi != NULL);
4026
4027 if (cpi->cpi_maxeax < 1)
4028 return;
4029
4030 /*
4031 * This was filled during the identification pass.
4032 */
4033 cp = &cpi->cpi_std[1];
4034
4035 /*
4036 * *default* assumptions:
4037 * - believe %edx feature word
4038 * - ignore %ecx feature word
4039 * - 32-bit virtual and physical addressing
4040 */
4041 mask_edx = 0xffffffff;
4042 mask_ecx = 0;
4043
4044 cpi->cpi_pabits = cpi->cpi_vabits = 32;
4045
4046 switch (cpi->cpi_vendor) {
4047 case X86_VENDOR_Intel:
4048 if (cpi->cpi_family == 5)
4049 x86_type = X86_TYPE_P5;
4050 else if (IS_LEGACY_P6(cpi)) {
4051 x86_type = X86_TYPE_P6;
4052 pentiumpro_bug4046376 = 1;
4053 /*
4054 * Clear the SEP bit when it was set erroneously
4055 */
4056 if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
4057 cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
4058 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
4059 x86_type = X86_TYPE_P4;
4060 /*
4061 * We don't currently depend on any of the %ecx
4062 * features until Prescott, so we'll only check
4063 * this from P4 onwards. We might want to revisit
4064 * that idea later.
4065 */
4066 mask_ecx = 0xffffffff;
4067 } else if (cpi->cpi_family > 0xf)
4068 mask_ecx = 0xffffffff;
4069 /*
4070 * We don't support MONITOR/MWAIT if leaf 5 is not available
4071 * to obtain the monitor linesize.
4072 */
4073 if (cpi->cpi_maxeax < 5)
4074 mask_ecx &= ~CPUID_INTC_ECX_MON;
4075 break;
4076 case X86_VENDOR_IntelClone:
4077 default:
4078 break;
4079 case X86_VENDOR_AMD:
4080 #if defined(OPTERON_ERRATUM_108)
4081 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
4082 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
4083 cpi->cpi_model = 0xc;
4084 } else
4085 #endif
4086 if (cpi->cpi_family == 5) {
4087 /*
4088 * AMD K5 and K6
4089 *
4090 * These CPUs have an incomplete implementation
4091 * of MCA/MCE which we mask away.
4092 */
4093 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
4094
4095 /*
4096 * Model 0 uses the wrong (APIC) bit
4097 * to indicate PGE. Fix it here.
4098 */
4099 if (cpi->cpi_model == 0) {
4100 if (cp->cp_edx & 0x200) {
4101 cp->cp_edx &= ~0x200;
4102 cp->cp_edx |= CPUID_INTC_EDX_PGE;
4103 }
4104 }
4105
4106 /*
4107 * Early models had problems w/ MMX; disable.
4108 */
4109 if (cpi->cpi_model < 6)
4110 mask_edx &= ~CPUID_INTC_EDX_MMX;
4111 }
4112
4113 /*
4114 * For newer families, SSE3 and CX16, at least, are valid;
4115 * enable all
4116 */
4117 if (cpi->cpi_family >= 0xf)
4118 mask_ecx = 0xffffffff;
4119 /*
4120 * We don't support MONITOR/MWAIT if leaf 5 is not available
4121 * to obtain the monitor linesize.
4122 */
4123 if (cpi->cpi_maxeax < 5)
4124 mask_ecx &= ~CPUID_INTC_ECX_MON;
4125
4126 #if !defined(__xpv)
4127 /*
4128 * AMD has not historically used MWAIT in the CPU's idle loop.
4129 * Pre-family-10h Opterons do not have the MWAIT instruction. We
4130 * know for certain that in at least family 17h, per AMD, mwait
4131 * is preferred. Families in-between are less certain.
4132 */
4133 if (cpi->cpi_family < 0x17) {
4134 idle_cpu_prefer_mwait = 0;
4135 }
4136 #endif
4137
4138 break;
4139 case X86_VENDOR_HYGON:
4140 /* Enable all for Hygon Dhyana CPU */
4141 mask_ecx = 0xffffffff;
4142 break;
4143 case X86_VENDOR_TM:
4144 /*
4145 * workaround the NT workaround in CMS 4.1
4146 */
4147 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
4148 (cpi->cpi_step == 2 || cpi->cpi_step == 3))
4149 cp->cp_edx |= CPUID_INTC_EDX_CX8;
4150 break;
4151 case X86_VENDOR_Centaur:
4152 /*
4153 * workaround the NT workarounds again
4154 */
4155 if (cpi->cpi_family == 6)
4156 cp->cp_edx |= CPUID_INTC_EDX_CX8;
4157 break;
4158 case X86_VENDOR_Cyrix:
4159 /*
4160 * We rely heavily on the probing in locore
4161 * to actually figure out what parts, if any,
4162 * of the Cyrix cpuid instruction to believe.
4163 */
4164 switch (x86_type) {
4165 case X86_TYPE_CYRIX_486:
4166 mask_edx = 0;
4167 break;
4168 case X86_TYPE_CYRIX_6x86:
4169 mask_edx = 0;
4170 break;
4171 case X86_TYPE_CYRIX_6x86L:
4172 mask_edx =
4173 CPUID_INTC_EDX_DE |
4174 CPUID_INTC_EDX_CX8;
4175 break;
4176 case X86_TYPE_CYRIX_6x86MX:
4177 mask_edx =
4178 CPUID_INTC_EDX_DE |
4179 CPUID_INTC_EDX_MSR |
4180 CPUID_INTC_EDX_CX8 |
4181 CPUID_INTC_EDX_PGE |
4182 CPUID_INTC_EDX_CMOV |
4183 CPUID_INTC_EDX_MMX;
4184 break;
4185 case X86_TYPE_CYRIX_GXm:
4186 mask_edx =
4187 CPUID_INTC_EDX_MSR |
4188 CPUID_INTC_EDX_CX8 |
4189 CPUID_INTC_EDX_CMOV |
4190 CPUID_INTC_EDX_MMX;
4191 break;
4192 case X86_TYPE_CYRIX_MediaGX:
4193 break;
4194 case X86_TYPE_CYRIX_MII:
4195 case X86_TYPE_VIA_CYRIX_III:
4196 mask_edx =
4197 CPUID_INTC_EDX_DE |
4198 CPUID_INTC_EDX_TSC |
4199 CPUID_INTC_EDX_MSR |
4200 CPUID_INTC_EDX_CX8 |
4201 CPUID_INTC_EDX_PGE |
4202 CPUID_INTC_EDX_CMOV |
4203 CPUID_INTC_EDX_MMX;
4204 break;
4205 default:
4206 break;
4207 }
4208 break;
4209 }
4210
4211 #if defined(__xpv)
4212 /*
4213 * Do not support MONITOR/MWAIT under a hypervisor
4214 */
4215 mask_ecx &= ~CPUID_INTC_ECX_MON;
4216 /*
4217 * Do not support XSAVE under a hypervisor for now
4218 */
4219 xsave_force_disable = B_TRUE;
4220
4221 #endif /* __xpv */
4222
4223 if (xsave_force_disable) {
4224 mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4225 mask_ecx &= ~CPUID_INTC_ECX_AVX;
4226 mask_ecx &= ~CPUID_INTC_ECX_F16C;
4227 mask_ecx &= ~CPUID_INTC_ECX_FMA;
4228 }
4229
4230 /*
4231 * Now we've figured out the masks that determine
4232 * which bits we choose to believe, apply the masks
4233 * to the feature words, then map the kernel's view
4234 * of these feature words into its feature word.
4235 */
4236 cp->cp_edx &= mask_edx;
4237 cp->cp_ecx &= mask_ecx;
4238
4239 /*
4240 * apply any platform restrictions (we don't call this
4241 * immediately after __cpuid_insn here, because we need the
4242 * workarounds applied above first)
4243 */
4244 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4245
4246 /*
4247 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4248 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4249 * 7 has sub-leaves determined by ecx.
4250 */
4251 if (cpi->cpi_maxeax >= 7) {
4252 struct cpuid_regs *ecp;
4253 ecp = &cpi->cpi_std[7];
4254 ecp->cp_eax = 7;
4255 ecp->cp_ecx = 0;
4256 (void) __cpuid_insn(ecp);
4257
4258 /*
4259 * If XSAVE has been disabled, just ignore all of the
4260 * extended-save-area dependent flags here. Removing most of the
4261 * leaf 7, sub-leaf 0 flags ensures that we don't end up looking
4262 * at additional xsave dependent leaves right now.
4264 */
4265 if (xsave_force_disable) {
4266 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4267 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4268 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4269 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4270 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4271 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4272 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4273 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4274 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4275 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4276 }
4277
4278 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4279 add_x86_feature(featureset, X86FSET_SMEP);
4280
4281 /*
4282 * We check disable_smap here in addition to in startup_smap()
4283 * to ensure CPUs that aren't the boot CPU don't accidentally
4284 * include it in the feature set and thus generate a mismatched
4285 * x86 feature set across CPUs.
4286 */
4287 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4288 disable_smap == 0)
4289 add_x86_feature(featureset, X86FSET_SMAP);
4290
4291 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4292 add_x86_feature(featureset, X86FSET_RDSEED);
4293
4294 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4295 add_x86_feature(featureset, X86FSET_ADX);
4296
4297 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4298 add_x86_feature(featureset, X86FSET_FSGSBASE);
4299
4300 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4301 add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4302
4303 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4304 add_x86_feature(featureset, X86FSET_INVPCID);
4305
4306 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4307 add_x86_feature(featureset, X86FSET_UMIP);
4308 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4309 add_x86_feature(featureset, X86FSET_PKU);
4310 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4311 add_x86_feature(featureset, X86FSET_OSPKE);
4312 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4313 add_x86_feature(featureset, X86FSET_GFNI);
4314
4315 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4316 add_x86_feature(featureset, X86FSET_CLWB);
4317
4318 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4319 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4320 add_x86_feature(featureset, X86FSET_MPX);
4321 }
4322
4323 /*
4324 * If we have subleaf 1 or 2 available, grab and store
4325 * that. This is used for more AVX and related features.
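	 * Leaf 7, sub-leaf 0 reports the maximum supported sub-leaf in
	 * %eax, which is what the checks against ecp->cp_eax below rely on.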
4326 */
4327 if (ecp->cp_eax >= 1) {
4328 struct cpuid_regs *c71;
4329 c71 = &cpi->cpi_sub7[0];
4330 c71->cp_eax = 7;
4331 c71->cp_ecx = 1;
4332 (void) __cpuid_insn(c71);
4333 }
4334
4335 /* Subleaf 2 has certain security indicators in it. */
4336 if (ecp->cp_eax >= 2) {
4337 struct cpuid_regs *c72;
4338 c72 = &cpi->cpi_sub7[1];
4339 c72->cp_eax = 7;
4340 c72->cp_ecx = 2;
4341 (void) __cpuid_insn(c72);
4342 }
4343 }
4344
4345 /*
4346 * fold in overrides from the "eeprom" mechanism
4347 */
4348 cp->cp_edx |= cpuid_feature_edx_include;
4349 cp->cp_edx &= ~cpuid_feature_edx_exclude;
4350
4351 cp->cp_ecx |= cpuid_feature_ecx_include;
4352 cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4353
4354 if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4355 add_x86_feature(featureset, X86FSET_LARGEPAGE);
4356 }
4357 if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4358 add_x86_feature(featureset, X86FSET_TSC);
4359 }
4360 if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4361 add_x86_feature(featureset, X86FSET_MSR);
4362 }
4363 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4364 add_x86_feature(featureset, X86FSET_MTRR);
4365 }
4366 if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4367 add_x86_feature(featureset, X86FSET_PGE);
4368 }
4369 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4370 add_x86_feature(featureset, X86FSET_CMOV);
4371 }
4372 if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4373 add_x86_feature(featureset, X86FSET_MMX);
4374 }
4375 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4376 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4377 add_x86_feature(featureset, X86FSET_MCA);
4378 }
4379 if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4380 add_x86_feature(featureset, X86FSET_PAE);
4381 }
4382 if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4383 add_x86_feature(featureset, X86FSET_CX8);
4384 }
4385 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4386 add_x86_feature(featureset, X86FSET_CX16);
4387 }
4388 if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4389 add_x86_feature(featureset, X86FSET_PAT);
4390 }
4391 if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4392 add_x86_feature(featureset, X86FSET_SEP);
4393 }
4394 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4395 /*
4396 * In our implementation, fxsave/fxrstor
4397 * are prerequisites before we'll even
4398 * try and do SSE things.
4399 */
4400 if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4401 add_x86_feature(featureset, X86FSET_SSE);
4402 }
4403 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4404 add_x86_feature(featureset, X86FSET_SSE2);
4405 }
4406 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4407 add_x86_feature(featureset, X86FSET_SSE3);
4408 }
4409 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4410 add_x86_feature(featureset, X86FSET_SSSE3);
4411 }
4412 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4413 add_x86_feature(featureset, X86FSET_SSE4_1);
4414 }
4415 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4416 add_x86_feature(featureset, X86FSET_SSE4_2);
4417 }
4418 if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4419 add_x86_feature(featureset, X86FSET_AES);
4420 }
4421 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4422 add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4423 }
4424
4425 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4426 add_x86_feature(featureset, X86FSET_SHA);
4427
4428 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4429 add_x86_feature(featureset, X86FSET_XSAVE);
4430
4431 /* We only test AVX & AVX512 when there is XSAVE */
4432 cpuid_basic_avx(cpu, featureset);
4433 }
4434 }
4435
4436 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4437 add_x86_feature(featureset, X86FSET_PCID);
4438 }
4439
4440 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4441 add_x86_feature(featureset, X86FSET_X2APIC);
4442 }
4443 if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4444 add_x86_feature(featureset, X86FSET_DE);
4445 }
4446 #if !defined(__xpv)
4447 if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4448
4449 /*
4450		 * We require the CLFLUSH instruction for the erratum workaround
4451 * to use MONITOR/MWAIT.
4452 */
4453 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4454 cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4455 add_x86_feature(featureset, X86FSET_MWAIT);
4456 } else {
4457 extern int idle_cpu_assert_cflush_monitor;
4458
4459 /*
4460 * All processors we are aware of which have
4461 * MONITOR/MWAIT also have CLFLUSH.
4462 */
4463 if (idle_cpu_assert_cflush_monitor) {
4464 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4465 (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4466 }
4467 }
4468 }
4469 #endif /* __xpv */
4470
4471 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4472 add_x86_feature(featureset, X86FSET_VMX);
4473 }
4474
4475 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4476 add_x86_feature(featureset, X86FSET_RDRAND);
4477
4478 /*
4479	 * Only need this the first time; the rest of the cpus would follow
4480	 * suit. We only capture this for the boot cpu.
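	 * CPUID.1 %ebx bits 15:8 report the CLFLUSH line size in units of
	 * 8 bytes, hence the multiplication by 8 below (a raw value of 8
	 * corresponds to the common 64-byte line).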
4481 */
4482 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4483 add_x86_feature(featureset, X86FSET_CLFSH);
4484 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4485 }
4486 if (is_x86_feature(featureset, X86FSET_PAE))
4487 cpi->cpi_pabits = 36;
4488
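	/*
	 * Leaf 0xD, sub-leaf 1 %eax enumerates the XSAVE management
	 * instructions themselves (XSAVEOPT, XSAVEC, XSAVES).
	 */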
4489 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4490 struct cpuid_regs r, *ecp;
4491
4492 ecp = &r;
4493 ecp->cp_eax = 0xD;
4494 ecp->cp_ecx = 1;
4495 ecp->cp_edx = ecp->cp_ebx = 0;
4496 (void) __cpuid_insn(ecp);
4497
4498 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4499 add_x86_feature(featureset, X86FSET_XSAVEOPT);
4500 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4501 add_x86_feature(featureset, X86FSET_XSAVEC);
4502 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4503 add_x86_feature(featureset, X86FSET_XSAVES);
4504
4505 /*
4506 * Zen 2 family processors suffer from erratum 1386 that causes
4507 * xsaves to not function correctly in some circumstances. There
4508 * are no supervisor states in Zen 2 and earlier. Practically
4509 * speaking this has no impact for us as we currently do not
4510 * leverage compressed xsave formats. To safeguard against
4511 * issues in the future where we may opt to using it, we remove
4512 * it from the feature set now. While Matisse has a microcode
4513		 * update available with a fix, not all Zen 2 CPUs do, so it's
4514 * simpler for the moment to unconditionally remove it.
4515 */
4516 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4517 uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4518 remove_x86_feature(featureset, X86FSET_XSAVES);
4519 }
4520 }
4521
4522 /*
4523 * Work on the "extended" feature information, doing
4524 * some basic initialization to be used in the extended pass.
4525 */
4526 xcpuid = 0;
4527 switch (cpi->cpi_vendor) {
4528 case X86_VENDOR_Intel:
4529 /*
4530 * On KVM we know we will have proper support for extended
4531 * cpuid.
4532 */
4533 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4534 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4535 (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4536 xcpuid++;
4537 break;
4538 case X86_VENDOR_AMD:
4539 if (cpi->cpi_family > 5 ||
4540 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4541 xcpuid++;
4542 break;
4543 case X86_VENDOR_Cyrix:
4544 /*
4545 * Only these Cyrix CPUs are -known- to support
4546 * extended cpuid operations.
4547 */
4548 if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4549 x86_type == X86_TYPE_CYRIX_GXm)
4550 xcpuid++;
4551 break;
4552 case X86_VENDOR_HYGON:
4553 case X86_VENDOR_Centaur:
4554 case X86_VENDOR_TM:
4555 default:
4556 xcpuid++;
4557 break;
4558 }
4559
4560 if (xcpuid) {
4561 cp = &cpi->cpi_extd[0];
4562 cp->cp_eax = CPUID_LEAF_EXT_0;
4563 cpi->cpi_xmaxeax = __cpuid_insn(cp);
4564 }
4565
4566 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4567
4568 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4569 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4570
4571 switch (cpi->cpi_vendor) {
4572 case X86_VENDOR_Intel:
4573 case X86_VENDOR_AMD:
4574 case X86_VENDOR_HYGON:
4575 if (cpi->cpi_xmaxeax < 0x80000001)
4576 break;
4577 cp = &cpi->cpi_extd[1];
4578 cp->cp_eax = 0x80000001;
4579 (void) __cpuid_insn(cp);
4580
4581 if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4582 cpi->cpi_family == 5 &&
4583 cpi->cpi_model == 6 &&
4584 cpi->cpi_step == 6) {
4585 /*
4586 * K6 model 6 uses bit 10 to indicate SYSC
4587 * Later models use bit 11. Fix it here.
4588 */
4589 if (cp->cp_edx & 0x400) {
4590 cp->cp_edx &= ~0x400;
4591 cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4592 }
4593 }
4594
4595 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4596
4597 /*
4598 * Compute the additions to the kernel's feature word.
4599 */
4600 if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4601 add_x86_feature(featureset, X86FSET_NX);
4602 }
4603
4604 /*
4605		 * Regardless of whether or not we boot 64-bit,
4606 * we should have a way to identify whether
4607 * the CPU is capable of running 64-bit.
4608 */
4609 if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4610 add_x86_feature(featureset, X86FSET_64);
4611 }
4612
4613 /* 1 GB large page - enable only for 64 bit kernel */
4614 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4615 add_x86_feature(featureset, X86FSET_1GPG);
4616 }
4617
4618 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4619 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4620 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4621 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4622 add_x86_feature(featureset, X86FSET_SSE4A);
4623 }
4624
4625 /*
4626 * It's really tricky to support syscall/sysret in
4627 * the i386 kernel; we rely on sysenter/sysexit
4628 * instead. In the amd64 kernel, things are -way-
4629 * better.
4630 */
4631 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4632 add_x86_feature(featureset, X86FSET_ASYSC);
4633 }
4634
4635 /*
4636 * While we're thinking about system calls, note
4637 * that AMD processors don't support sysenter
4638 * in long mode at all, so don't try to program them.
4639 */
4640 if (x86_vendor == X86_VENDOR_AMD ||
4641 x86_vendor == X86_VENDOR_HYGON) {
4642 remove_x86_feature(featureset, X86FSET_SEP);
4643 }
4644
4645 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4646 add_x86_feature(featureset, X86FSET_TSCP);
4647 }
4648
4649 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4650 add_x86_feature(featureset, X86FSET_SVM);
4651 }
4652
4653 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4654 add_x86_feature(featureset, X86FSET_TOPOEXT);
4655 }
4656
4657 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4658 add_x86_feature(featureset, X86FSET_AMD_PCEC);
4659 }
4660
4661 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4662 add_x86_feature(featureset, X86FSET_XOP);
4663 }
4664
4665 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4666 add_x86_feature(featureset, X86FSET_FMA4);
4667 }
4668
4669 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4670 add_x86_feature(featureset, X86FSET_TBM);
4671 }
4672
4673 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4674 add_x86_feature(featureset, X86FSET_MONITORX);
4675 }
4676 break;
4677 default:
4678 break;
4679 }
4680
4681 /*
4682 * Get CPUID data about processor cores and hyperthreads.
4683 */
4684 switch (cpi->cpi_vendor) {
4685 case X86_VENDOR_Intel:
4686 if (cpi->cpi_maxeax >= 4) {
4687 cp = &cpi->cpi_std[4];
4688 cp->cp_eax = 4;
4689 cp->cp_ecx = 0;
4690 (void) __cpuid_insn(cp);
4691 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4692 }
4693 /*FALLTHROUGH*/
4694 case X86_VENDOR_AMD:
4695 case X86_VENDOR_HYGON:
4696 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4697 break;
4698 cp = &cpi->cpi_extd[8];
4699 cp->cp_eax = CPUID_LEAF_EXT_8;
4700 (void) __cpuid_insn(cp);
4701 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4702 cp);
4703
4704 /*
4705 * AMD uses ebx for some extended functions.
4706 */
4707 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4708 cpi->cpi_vendor == X86_VENDOR_HYGON) {
4709 /*
4710 * While we're here, check for the AMD "Error
4711 * Pointer Zero/Restore" feature. This can be
4712			 * used to set up the FP save handlers
4713 * appropriately.
4714 */
4715 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4716 cpi->cpi_fp_amd_save = 0;
4717 } else {
4718 cpi->cpi_fp_amd_save = 1;
4719 }
4720
4721 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4722 add_x86_feature(featureset,
4723 X86FSET_CLZERO);
4724 }
4725 }
4726
4727 /*
4728 * Virtual and physical address limits from
4729 * cpuid override previously guessed values.
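		 * Leaf 0x80000008 %eax[7:0] is the physical address width and
		 * %eax[15:8] is the linear (virtual) address width, in bits.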
4730 */
4731 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4732 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4733 break;
4734 default:
4735 break;
4736 }
4737
4738 /*
4739 * Get CPUID data about TSC Invariance in Deep C-State.
4740 */
4741 switch (cpi->cpi_vendor) {
4742 case X86_VENDOR_Intel:
4743 case X86_VENDOR_AMD:
4744 case X86_VENDOR_HYGON:
4745 if (cpi->cpi_maxeax >= 7) {
4746 cp = &cpi->cpi_extd[7];
4747 cp->cp_eax = 0x80000007;
4748 cp->cp_ecx = 0;
4749 (void) __cpuid_insn(cp);
4750 }
4751 break;
4752 default:
4753 break;
4754 }
4755 }
4756
4757 /*
4758 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4759 * run and thus gathered some of its dependent leaves.
4760 */
4761 cpuid_basic_topology(cpu, featureset);
4762 cpuid_basic_thermal(cpu, featureset);
4763 #if !defined(__xpv)
4764 cpuid_basic_ppin(cpu, featureset);
4765 #endif
4766
4767 if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4768 cpi->cpi_vendor == X86_VENDOR_HYGON) {
4769 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4770 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4771 /* Special handling for AMD FP not necessary. */
4772 cpi->cpi_fp_amd_save = 0;
4773 } else {
4774 cpi->cpi_fp_amd_save = 1;
4775 }
4776 }
4777
4778 /*
4779 * Check (and potentially set) if lfence is serializing.
4780 * This is useful for accurate rdtsc measurements and AMD retpolines.
4781 */
4782 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4783 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4784 is_x86_feature(featureset, X86FSET_SSE2)) {
4785 /*
4786 * The AMD white paper Software Techniques For Managing
4787 * Speculation on AMD Processors details circumstances for when
4788 * lfence instructions are serializing.
4789 *
4790 * On family 0xf and 0x11, it is inherently so. On family 0x10
4791 * and later (excluding 0x11), a bit in the DE_CFG MSR
4792 * determines the lfence behavior. Per that whitepaper, AMD has
4793 * committed to supporting that MSR on all later CPUs.
4794 */
4795 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4796 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4797 } else if (cpi->cpi_family >= 0x10) {
4798 #if !defined(__xpv)
4799 uint64_t val;
4800
4801 /*
4802 * Be careful when attempting to enable the bit, and
4803 * verify that it was actually set in case we are
4804 * running in a hypervisor which is less than faithful
4805 * about its emulation of this feature.
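		 * The MSR read-modify-write below is wrapped in on_trap() so
		 * that a #GP from a hypervisor that does not implement DE_CFG
		 * is survived rather than taken as a fatal trap.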
4806 */
4807 on_trap_data_t otd;
4808 if (!on_trap(&otd, OT_DATA_ACCESS)) {
4809 val = rdmsr(MSR_AMD_DE_CFG);
4810 val |= AMD_DE_CFG_LFENCE_DISPATCH;
4811 wrmsr(MSR_AMD_DE_CFG, val);
4812 val = rdmsr(MSR_AMD_DE_CFG);
4813 } else {
4814 val = 0;
4815 }
4816 no_trap();
4817
4818 if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4819 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4820 }
4821 #endif
4822 }
4823 } else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4824 is_x86_feature(featureset, X86FSET_SSE2)) {
4825 /*
4826 * Documentation and other OSes indicate that lfence is always
4827 * serializing on Intel CPUs.
4828 */
4829 add_x86_feature(featureset, X86FSET_LFENCE_SER);
4830 }
4831
4832
4833 /*
4834 * Check the processor leaves that are used for security features. Grab
4835 * any additional processor-specific leaves that we may not have yet.
4836 */
4837 switch (cpi->cpi_vendor) {
4838 case X86_VENDOR_AMD:
4839 case X86_VENDOR_HYGON:
4840 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4841 cp = &cpi->cpi_extd[0x21];
4842 cp->cp_eax = CPUID_LEAF_EXT_21;
4843 cp->cp_ecx = 0;
4844 (void) __cpuid_insn(cp);
4845 }
4846 break;
4847 default:
4848 break;
4849 }
4850
4851 cpuid_scan_security(cpu, featureset);
4852 }
4853
4854 /*
4855 * Make copies of the cpuid table entries we depend on, in
4856 * part for ease of parsing now, in part so that we have only
4857 * one place to correct any of it, in part for ease of
4858 * later export to userland, and in part so we can look at
4859 * this stuff in a crash dump.
4860 */
4861
4862 static void
4863 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4864 {
4865 uint_t n, nmax;
4866 int i;
4867 struct cpuid_regs *cp;
4868 uint8_t *dp;
4869 uint32_t *iptr;
4870 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4871
4872 if (cpi->cpi_maxeax < 1)
4873 return;
4874
4875 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4876 nmax = NMAX_CPI_STD;
4877 /*
4878 * (We already handled n == 0 and n == 1 in the basic pass)
4879 */
4880 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4881 /*
4882 * leaves 6 and 7 were handled in the basic pass
4883 */
4884 if (n == 6 || n == 7)
4885 continue;
4886
4887 cp->cp_eax = n;
4888
4889 /*
4890 * CPUID function 4 expects %ecx to be initialized
4891 * with an index which indicates which cache to return
4892 * information about. The OS is expected to call function 4
4893 * with %ecx set to 0, 1, 2, ... until it returns with
4894 * EAX[4:0] set to 0, which indicates there are no more
4895 * caches.
4896 *
4897 * Here, populate cpi_std[4] with the information returned by
4898 * function 4 when %ecx == 0, and do the rest in a later pass
4899 * when dynamic memory allocation becomes available.
4900 *
4901 * Note: we need to explicitly initialize %ecx here, since
4902 * function 4 may have been previously invoked.
4903 */
4904 if (n == 4)
4905 cp->cp_ecx = 0;
4906
4907 (void) __cpuid_insn(cp);
4908 platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4909 switch (n) {
4910 case 2:
4911 /*
4912 * "the lower 8 bits of the %eax register
4913 * contain a value that identifies the number
4914 * of times the cpuid [instruction] has to be
4915 * executed to obtain a complete image of the
4916 * processor's caching systems."
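			 * In practice that count has been 1 on every processor
			 * we have seen, so a single invocation yields up to 15
			 * one-byte cache descriptors spread across %eax, %ebx,
			 * %ecx and %edx.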
4917 *
4918 * How *do* they make this stuff up?
4919 */
4920 cpi->cpi_ncache = sizeof (*cp) *
4921 BITX(cp->cp_eax, 7, 0);
4922 if (cpi->cpi_ncache == 0)
4923 break;
4924 cpi->cpi_ncache--; /* skip count byte */
4925
4926 /*
4927 * Well, for now, rather than attempt to implement
4928 * this slightly dubious algorithm, we just look
4929 * at the first 15 ..
4930 */
4931 if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4932 cpi->cpi_ncache = sizeof (*cp) - 1;
4933
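			/*
			 * A register only holds valid descriptor bytes when
			 * its bit 31 is clear; the low byte of %eax is the
			 * iteration count and is skipped.
			 */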
4934 dp = cpi->cpi_cacheinfo;
4935 if (BITX(cp->cp_eax, 31, 31) == 0) {
4936 uint8_t *p = (void *)&cp->cp_eax;
4937 for (i = 1; i < 4; i++)
4938 if (p[i] != 0)
4939 *dp++ = p[i];
4940 }
4941 if (BITX(cp->cp_ebx, 31, 31) == 0) {
4942 uint8_t *p = (void *)&cp->cp_ebx;
4943 for (i = 0; i < 4; i++)
4944 if (p[i] != 0)
4945 *dp++ = p[i];
4946 }
4947 if (BITX(cp->cp_ecx, 31, 31) == 0) {
4948 uint8_t *p = (void *)&cp->cp_ecx;
4949 for (i = 0; i < 4; i++)
4950 if (p[i] != 0)
4951 *dp++ = p[i];
4952 }
4953 if (BITX(cp->cp_edx, 31, 31) == 0) {
4954 uint8_t *p = (void *)&cp->cp_edx;
4955 for (i = 0; i < 4; i++)
4956 if (p[i] != 0)
4957 *dp++ = p[i];
4958 }
4959 break;
4960
4961 case 3: /* Processor serial number, if PSN supported */
4962 break;
4963
4964 case 4: /* Deterministic cache parameters */
4965 break;
4966
4967 case 5: /* Monitor/Mwait parameters */
4968 {
4969 size_t mwait_size;
4970
4971 /*
4972 * check cpi_mwait.support which was set in
4973 * cpuid_pass_basic()
4974 */
4975 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4976 break;
4977
4978 /*
4979			 * Protect ourselves from an insane mwait line size.
4980 * Workaround for incomplete hardware emulator(s).
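			 * Leaf 5 %ebx reports the largest monitor-line size; a
			 * sane value is a power of two of at least four bytes
			 * (typically 64 on real hardware).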
4981 */
4982 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4983 if (mwait_size < sizeof (uint32_t) ||
4984 !ISP2(mwait_size)) {
4985 #if DEBUG
4986 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4987 "size %ld", cpu->cpu_id, (long)mwait_size);
4988 #endif
4989 break;
4990 }
4991
4992 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4993 cpi->cpi_mwait.mon_max = mwait_size;
4994 if (MWAIT_EXTENSION(cpi)) {
4995 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4996 if (MWAIT_INT_ENABLE(cpi))
4997 cpi->cpi_mwait.support |=
4998 MWAIT_ECX_INT_ENABLE;
4999 }
5000 break;
5001 }
5002 default:
5003 break;
5004 }
5005 }
5006
5007 /*
5008 * XSAVE enumeration
5009 */
5010 if (cpi->cpi_maxeax >= 0xD) {
5011 struct cpuid_regs regs;
5012 boolean_t cpuid_d_valid = B_TRUE;
5013
5014		cp = &regs;
5015 cp->cp_eax = 0xD;
5016 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
5017
5018 (void) __cpuid_insn(cp);
5019
5020 /*
5021 * Sanity checks for debug
5022 */
5023 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
5024 (cp->cp_eax & XFEATURE_SSE) == 0) {
5025 cpuid_d_valid = B_FALSE;
5026 }
5027
5028 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
5029 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
5030 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
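		/*
		 * Sub-leaf 0 %ecx is the save area size needed were every
		 * feature the hardware supports enabled; %ebx (not recorded
		 * here) covers only the features currently enabled in XCR0.
		 */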
5031
5032 /*
5033 * If the hw supports AVX, get the size and offset in the save
5034 * area for the ymm state.
5035 */
5036 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
5037 cp->cp_eax = 0xD;
5038 cp->cp_ecx = 2;
5039 cp->cp_edx = cp->cp_ebx = 0;
5040
5041 (void) __cpuid_insn(cp);
5042
5043 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
5044 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
5045 cpuid_d_valid = B_FALSE;
5046 }
5047
5048 cpi->cpi_xsave.ymm_size = cp->cp_eax;
5049 cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
5050 }
5051
5052 /*
5053 * If the hw supports MPX, get the size and offset in the
5054 * save area for BNDREGS and BNDCSR.
5055 */
5056 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
5057 cp->cp_eax = 0xD;
5058 cp->cp_ecx = 3;
5059 cp->cp_edx = cp->cp_ebx = 0;
5060
5061 (void) __cpuid_insn(cp);
5062
5063 cpi->cpi_xsave.bndregs_size = cp->cp_eax;
5064 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
5065
5066 cp->cp_eax = 0xD;
5067 cp->cp_ecx = 4;
5068 cp->cp_edx = cp->cp_ebx = 0;
5069
5070 (void) __cpuid_insn(cp);
5071
5072 cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
5073 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
5074 }
5075
5076 /*
5077 * If the hw supports AVX512, get the size and offset in the
5078 * save area for the opmask registers and zmm state.
5079 */
5080 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
5081 cp->cp_eax = 0xD;
5082 cp->cp_ecx = 5;
5083 cp->cp_edx = cp->cp_ebx = 0;
5084
5085 (void) __cpuid_insn(cp);
5086
5087 cpi->cpi_xsave.opmask_size = cp->cp_eax;
5088 cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
5089
5090 cp->cp_eax = 0xD;
5091 cp->cp_ecx = 6;
5092 cp->cp_edx = cp->cp_ebx = 0;
5093
5094 (void) __cpuid_insn(cp);
5095
5096 cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
5097 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
5098
5099 cp->cp_eax = 0xD;
5100 cp->cp_ecx = 7;
5101 cp->cp_edx = cp->cp_ebx = 0;
5102
5103 (void) __cpuid_insn(cp);
5104
5105 cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
5106 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
5107 }
5108
5109 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
5110 xsave_state_size = 0;
5111 } else if (cpuid_d_valid) {
5112 xsave_state_size = cpi->cpi_xsave.xsav_max_size;
5113 } else {
5114 /* Broken CPUID 0xD, probably in HVM */
5115 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
5116 "value: hw_low = %d, hw_high = %d, xsave_size = %d"
5117 ", ymm_size = %d, ymm_offset = %d\n",
5118 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
5119 cpi->cpi_xsave.xsav_hw_features_high,
5120 (int)cpi->cpi_xsave.xsav_max_size,
5121 (int)cpi->cpi_xsave.ymm_size,
5122 (int)cpi->cpi_xsave.ymm_offset);
5123
5124 if (xsave_state_size != 0) {
5125 /*
5126 * This must be a non-boot CPU. We cannot
5127 * continue, because boot cpu has already
5128 * enabled XSAVE.
5129 */
5130 ASSERT(cpu->cpu_id != 0);
5131 cmn_err(CE_PANIC, "cpu%d: we have already "
5132 "enabled XSAVE on boot cpu, cannot "
5133 "continue.", cpu->cpu_id);
5134 } else {
5135 /*
5136 * If we reached here on the boot CPU, it's also
5137 * almost certain that we'll reach here on the
5138 * non-boot CPUs. When we're here on a boot CPU
5139 * we should disable the feature, on a non-boot
5140 * CPU we need to confirm that we have.
5141 */
5142 if (cpu->cpu_id == 0) {
5143 remove_x86_feature(x86_featureset,
5144 X86FSET_XSAVE);
5145 remove_x86_feature(x86_featureset,
5146 X86FSET_AVX);
5147 remove_x86_feature(x86_featureset,
5148 X86FSET_F16C);
5149 remove_x86_feature(x86_featureset,
5150 X86FSET_BMI1);
5151 remove_x86_feature(x86_featureset,
5152 X86FSET_BMI2);
5153 remove_x86_feature(x86_featureset,
5154 X86FSET_FMA);
5155 remove_x86_feature(x86_featureset,
5156 X86FSET_AVX2);
5157 remove_x86_feature(x86_featureset,
5158 X86FSET_MPX);
5159 remove_x86_feature(x86_featureset,
5160 X86FSET_AVX512F);
5161 remove_x86_feature(x86_featureset,
5162 X86FSET_AVX512DQ);
5163 remove_x86_feature(x86_featureset,
5164 X86FSET_AVX512PF);
5165 remove_x86_feature(x86_featureset,
5166 X86FSET_AVX512ER);
5167 remove_x86_feature(x86_featureset,
5168 X86FSET_AVX512CD);
5169 remove_x86_feature(x86_featureset,
5170 X86FSET_AVX512BW);
5171 remove_x86_feature(x86_featureset,
5172 X86FSET_AVX512VL);
5173 remove_x86_feature(x86_featureset,
5174 X86FSET_AVX512FMA);
5175 remove_x86_feature(x86_featureset,
5176 X86FSET_AVX512VBMI);
5177 remove_x86_feature(x86_featureset,
5178 X86FSET_AVX512VNNI);
5179 remove_x86_feature(x86_featureset,
5180 X86FSET_AVX512VPOPCDQ);
5181 remove_x86_feature(x86_featureset,
5182 X86FSET_AVX512NNIW);
5183 remove_x86_feature(x86_featureset,
5184 X86FSET_AVX512FMAPS);
5185 remove_x86_feature(x86_featureset,
5186 X86FSET_VAES);
5187 remove_x86_feature(x86_featureset,
5188 X86FSET_VPCLMULQDQ);
5189 remove_x86_feature(x86_featureset,
5190 X86FSET_GFNI);
5191 remove_x86_feature(x86_featureset,
5192 X86FSET_AVX512_VP2INT);
5193 remove_x86_feature(x86_featureset,
5194 X86FSET_AVX512_BITALG);
5195 remove_x86_feature(x86_featureset,
5196 X86FSET_AVX512_VBMI2);
5197 remove_x86_feature(x86_featureset,
5198 X86FSET_AVX512_BF16);
5199
5200 xsave_force_disable = B_TRUE;
5201 } else {
5202 VERIFY(is_x86_feature(x86_featureset,
5203 X86FSET_XSAVE) == B_FALSE);
5204 }
5205 }
5206 }
5207 }
5208
5209
5210 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5211 return;
5212
5213 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5214 nmax = NMAX_CPI_EXTD;
5215 /*
5216 * Copy the extended properties, fixing them as we go. While we start at
5217 * 2 because we've already handled a few cases in the basic pass, the
5218 * rest we let ourselves just grab again (e.g. 0x8, 0x21).
5219 */
5220 iptr = (void *)cpi->cpi_brandstr;
5221 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5222 cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5223 (void) __cpuid_insn(cp);
5224 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5225 cp);
5226 switch (n) {
5227 case 2:
5228 case 3:
5229 case 4:
5230 /*
5231 * Extract the brand string
5232 */
5233 *iptr++ = cp->cp_eax;
5234 *iptr++ = cp->cp_ebx;
5235 *iptr++ = cp->cp_ecx;
5236 *iptr++ = cp->cp_edx;
5237 break;
5238 case 5:
5239 switch (cpi->cpi_vendor) {
5240 case X86_VENDOR_AMD:
5241 /*
5242 * The Athlon and Duron were the first
5243 * parts to report the sizes of the
5244 * TLB for large pages. Before then,
5245 * we don't trust the data.
5246 */
5247 if (cpi->cpi_family < 6 ||
5248 (cpi->cpi_family == 6 &&
5249 cpi->cpi_model < 1))
5250 cp->cp_eax = 0;
5251 break;
5252 default:
5253 break;
5254 }
5255 break;
5256 case 6:
5257 switch (cpi->cpi_vendor) {
5258 case X86_VENDOR_AMD:
5259 /*
5260 * The Athlon and Duron were the first
5261 * AMD parts with L2 TLB's.
5262 * Before then, don't trust the data.
5263 */
5264 if (cpi->cpi_family < 6 ||
5265 (cpi->cpi_family == 6 &&
5266 cpi->cpi_model < 1))
5267 cp->cp_eax = cp->cp_ebx = 0;
5268 /*
5269 * AMD Duron rev A0 reports L2
5270 * cache size incorrectly as 1K
5271 * when it is really 64K
5272 */
5273 if (cpi->cpi_family == 6 &&
5274 cpi->cpi_model == 3 &&
5275 cpi->cpi_step == 0) {
5276 cp->cp_ecx &= 0xffff;
5277 cp->cp_ecx |= 0x400000;
5278 }
5279 break;
5280 case X86_VENDOR_Cyrix: /* VIA C3 */
5281 /*
5282 * VIA C3 processors are a bit messed
5283 * up w.r.t. encoding cache sizes in %ecx
5284 */
5285 if (cpi->cpi_family != 6)
5286 break;
5287 /*
5288 * model 7 and 8 were incorrectly encoded
5289 *
5290 * xxx is model 8 really broken?
5291 */
5292 if (cpi->cpi_model == 7 ||
5293 cpi->cpi_model == 8)
5294 cp->cp_ecx =
5295 BITX(cp->cp_ecx, 31, 24) << 16 |
5296 BITX(cp->cp_ecx, 23, 16) << 12 |
5297 BITX(cp->cp_ecx, 15, 8) << 8 |
5298 BITX(cp->cp_ecx, 7, 0);
5299 /*
5300 * model 9 stepping 1 has wrong associativity
5301 */
5302 if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5303 cp->cp_ecx |= 8 << 12;
5304 break;
5305 case X86_VENDOR_Intel:
5306 /*
5307 * Extended L2 Cache features function.
5308 * First appeared on Prescott.
5309 */
5310 default:
5311 break;
5312 }
5313 break;
5314 default:
5315 break;
5316 }
5317 }
5318 }
5319
5320 static const char *
5321 intel_cpubrand(const struct cpuid_info *cpi)
5322 {
5323 int i;
5324
5325 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5326
5327 switch (cpi->cpi_family) {
5328 case 5:
5329 return ("Intel Pentium(r)");
5330 case 6:
5331 switch (cpi->cpi_model) {
5332 uint_t celeron, xeon;
5333 const struct cpuid_regs *cp;
5334 case 0:
5335 case 1:
5336 case 2:
5337 return ("Intel Pentium(r) Pro");
5338 case 3:
5339 case 4:
5340 return ("Intel Pentium(r) II");
5341 case 6:
5342 return ("Intel Celeron(r)");
5343 case 5:
5344 case 7:
5345 celeron = xeon = 0;
5346 cp = &cpi->cpi_std[2]; /* cache info */
5347
5348 for (i = 1; i < 4; i++) {
5349 uint_t tmp;
5350
5351 tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5352 if (tmp == 0x40)
5353 celeron++;
5354 if (tmp >= 0x44 && tmp <= 0x45)
5355 xeon++;
5356 }
5357
5358 for (i = 0; i < 2; i++) {
5359 uint_t tmp;
5360
5361 tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5362 if (tmp == 0x40)
5363 celeron++;
5364 else if (tmp >= 0x44 && tmp <= 0x45)
5365 xeon++;
5366 }
5367
5368 for (i = 0; i < 4; i++) {
5369 uint_t tmp;
5370
5371 tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5372 if (tmp == 0x40)
5373 celeron++;
5374 else if (tmp >= 0x44 && tmp <= 0x45)
5375 xeon++;
5376 }
5377
5378 for (i = 0; i < 4; i++) {
5379 uint_t tmp;
5380
5381 tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5382 if (tmp == 0x40)
5383 celeron++;
5384 else if (tmp >= 0x44 && tmp <= 0x45)
5385 xeon++;
5386 }
5387
5388 if (celeron)
5389 return ("Intel Celeron(r)");
5390 if (xeon)
5391 return (cpi->cpi_model == 5 ?
5392 "Intel Pentium(r) II Xeon(tm)" :
5393 "Intel Pentium(r) III Xeon(tm)");
5394 return (cpi->cpi_model == 5 ?
5395 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5396 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5397 default:
5398 break;
5399 }
5400 default:
5401 break;
5402 }
5403
5404 /* BrandID is present if the field is nonzero */
5405 if (cpi->cpi_brandid != 0) {
5406 static const struct {
5407 uint_t bt_bid;
5408 const char *bt_str;
5409 } brand_tbl[] = {
5410 { 0x1, "Intel(r) Celeron(r)" },
5411 { 0x2, "Intel(r) Pentium(r) III" },
5412 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" },
5413 { 0x4, "Intel(r) Pentium(r) III" },
5414 { 0x6, "Mobile Intel(r) Pentium(r) III" },
5415 { 0x7, "Mobile Intel(r) Celeron(r)" },
5416 { 0x8, "Intel(r) Pentium(r) 4" },
5417 { 0x9, "Intel(r) Pentium(r) 4" },
5418 { 0xa, "Intel(r) Celeron(r)" },
5419 { 0xb, "Intel(r) Xeon(tm)" },
5420 { 0xc, "Intel(r) Xeon(tm) MP" },
5421 { 0xe, "Mobile Intel(r) Pentium(r) 4" },
5422 { 0xf, "Mobile Intel(r) Celeron(r)" },
5423 { 0x11, "Mobile Genuine Intel(r)" },
5424 { 0x12, "Intel(r) Celeron(r) M" },
5425 { 0x13, "Mobile Intel(r) Celeron(r)" },
5426 { 0x14, "Intel(r) Celeron(r)" },
5427 { 0x15, "Mobile Genuine Intel(r)" },
5428 { 0x16, "Intel(r) Pentium(r) M" },
5429 { 0x17, "Mobile Intel(r) Celeron(r)" }
5430 };
5431 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5432 uint_t sgn;
5433
5434 sgn = (cpi->cpi_family << 8) |
5435 (cpi->cpi_model << 4) | cpi->cpi_step;
5436
5437 for (i = 0; i < btblmax; i++)
5438 if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5439 break;
5440 if (i < btblmax) {
5441 if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5442 return ("Intel(r) Celeron(r)");
5443 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5444 return ("Intel(r) Xeon(tm) MP");
5445 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5446 return ("Intel(r) Xeon(tm)");
5447 return (brand_tbl[i].bt_str);
5448 }
5449 }
5450
5451 return (NULL);
5452 }
5453
5454 static const char *
5455 amd_cpubrand(const struct cpuid_info *cpi)
5456 {
5457 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5458
5459 switch (cpi->cpi_family) {
5460 case 5:
5461 switch (cpi->cpi_model) {
5462 case 0:
5463 case 1:
5464 case 2:
5465 case 3:
5466 case 4:
5467 case 5:
5468 return ("AMD-K5(r)");
5469 case 6:
5470 case 7:
5471 return ("AMD-K6(r)");
5472 case 8:
5473 return ("AMD-K6(r)-2");
5474 case 9:
5475 return ("AMD-K6(r)-III");
5476 default:
5477 return ("AMD (family 5)");
5478 }
5479 case 6:
5480 switch (cpi->cpi_model) {
5481 case 1:
5482 return ("AMD-K7(tm)");
5483 case 0:
5484 case 2:
5485 case 4:
5486 return ("AMD Athlon(tm)");
5487 case 3:
5488 case 7:
5489 return ("AMD Duron(tm)");
5490 case 6:
5491 case 8:
5492 case 10:
5493 /*
5494 * Use the L2 cache size to distinguish
5495 */
5496 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5497 "AMD Athlon(tm)" : "AMD Duron(tm)");
5498 default:
5499 return ("AMD (family 6)");
5500 }
5501 default:
5502 break;
5503 }
5504
5505 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5506 cpi->cpi_brandid != 0) {
5507 switch (BITX(cpi->cpi_brandid, 7, 5)) {
5508 case 3:
5509 return ("AMD Opteron(tm) UP 1xx");
5510 case 4:
5511 return ("AMD Opteron(tm) DP 2xx");
5512 case 5:
5513 return ("AMD Opteron(tm) MP 8xx");
5514 default:
5515 return ("AMD Opteron(tm)");
5516 }
5517 }
5518
5519 return (NULL);
5520 }
5521
5522 static const char *
5523 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5524 {
5525 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5526
5527 switch (type) {
5528 case X86_TYPE_CYRIX_6x86:
5529 return ("Cyrix 6x86");
5530 case X86_TYPE_CYRIX_6x86L:
5531 return ("Cyrix 6x86L");
5532 case X86_TYPE_CYRIX_6x86MX:
5533 return ("Cyrix 6x86MX");
5534 case X86_TYPE_CYRIX_GXm:
5535 return ("Cyrix GXm");
5536 case X86_TYPE_CYRIX_MediaGX:
5537 return ("Cyrix MediaGX");
5538 case X86_TYPE_CYRIX_MII:
5539 return ("Cyrix M2");
5540 case X86_TYPE_VIA_CYRIX_III:
5541 return ("VIA Cyrix M3");
5542 default:
5543 /*
5544 * Have another wild guess ..
5545 */
5546 if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5547 return ("Cyrix 5x86");
5548 else if (cpi->cpi_family == 5) {
5549 switch (cpi->cpi_model) {
5550 case 2:
5551 return ("Cyrix 6x86"); /* Cyrix M1 */
5552 case 4:
5553 return ("Cyrix MediaGX");
5554 default:
5555 break;
5556 }
5557 } else if (cpi->cpi_family == 6) {
5558 switch (cpi->cpi_model) {
5559 case 0:
5560 return ("Cyrix 6x86MX"); /* Cyrix M2? */
5561 case 5:
5562 case 6:
5563 case 7:
5564 case 8:
5565 case 9:
5566 return ("VIA C3");
5567 default:
5568 break;
5569 }
5570 }
5571 break;
5572 }
5573 return (NULL);
5574 }
5575
5576 /*
5577 * This only gets called in the case that the CPU extended
5578  * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
5579 * aren't available, or contain null bytes for some reason.
5580 */
5581 static void
5582 fabricate_brandstr(struct cpuid_info *cpi)
5583 {
5584 const char *brand = NULL;
5585
5586 switch (cpi->cpi_vendor) {
5587 case X86_VENDOR_Intel:
5588 brand = intel_cpubrand(cpi);
5589 break;
5590 case X86_VENDOR_AMD:
5591 brand = amd_cpubrand(cpi);
5592 break;
5593 case X86_VENDOR_Cyrix:
5594 brand = cyrix_cpubrand(cpi, x86_type);
5595 break;
5596 case X86_VENDOR_NexGen:
5597 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5598 brand = "NexGen Nx586";
5599 break;
5600 case X86_VENDOR_Centaur:
5601 if (cpi->cpi_family == 5)
5602 switch (cpi->cpi_model) {
5603 case 4:
5604 brand = "Centaur C6";
5605 break;
5606 case 8:
5607 brand = "Centaur C2";
5608 break;
5609 case 9:
5610 brand = "Centaur C3";
5611 break;
5612 default:
5613 break;
5614 }
5615 break;
5616 case X86_VENDOR_Rise:
5617 if (cpi->cpi_family == 5 &&
5618 (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5619 brand = "Rise mP6";
5620 break;
5621 case X86_VENDOR_SiS:
5622 if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5623 brand = "SiS 55x";
5624 break;
5625 case X86_VENDOR_TM:
5626 if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5627 brand = "Transmeta Crusoe TM3x00 or TM5x00";
5628 break;
5629 case X86_VENDOR_NSC:
5630 case X86_VENDOR_UMC:
5631 default:
5632 break;
5633 }
5634 if (brand) {
5635 (void) strcpy((char *)cpi->cpi_brandstr, brand);
5636 return;
5637 }
5638
5639 /*
5640 * If all else fails ...
5641 */
5642 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5643 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5644 cpi->cpi_model, cpi->cpi_step);
5645 }
5646
5647 /*
5648 * This routine is called just after kernel memory allocation
5649 * becomes available on cpu0, and as part of mp_startup() on
5650 * the other cpus.
5651 *
5652 * Fixup the brand string, and collect any information from cpuid
5653 * that requires dynamically allocated storage to represent.
5654 */
5655
5656 static void
5657 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5658 {
5659 int i, max, shft, level, size;
5660 struct cpuid_regs regs;
5661 struct cpuid_regs *cp;
5662 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5663
5664 /*
5665 * Deterministic cache parameters
5666 *
5667 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5668 * values that are present are currently defined to be the same. This
5669 * means we can use the same logic to parse it as long as we use the
5670 * appropriate leaf to get the data. If you're updating this, make sure
5671 * you're careful about which vendor supports which aspect.
5672 *
5673 * Take this opportunity to detect the number of threads sharing the
5674 * last level cache, and construct a corresponding cache id. The
5675 * respective cpuid_info members are initialized to the default case of
5676 * "no last level cache sharing".
5677 */
5678 cpi->cpi_ncpu_shr_last_cache = 1;
5679 cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5680
5681 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5682 ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5683 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5684 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5685 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5686 uint32_t leaf;
5687
5688 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5689 leaf = 4;
5690 } else {
5691 leaf = CPUID_LEAF_EXT_1d;
5692 }
5693
5694 /*
5695 * Find the # of elements (size) returned by the leaf and along
5696 * the way detect last level cache sharing details.
5697 */
5698		bzero(&regs, sizeof (regs));
5699		cp = &regs;
5700 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5701 cp->cp_eax = leaf;
5702 cp->cp_ecx = i;
5703
5704 (void) __cpuid_insn(cp);
5705
5706 if (CPI_CACHE_TYPE(cp) == 0)
5707 break;
5708 level = CPI_CACHE_LVL(cp);
5709 if (level > max) {
5710 max = level;
5711 cpi->cpi_ncpu_shr_last_cache =
5712 CPI_NTHR_SHR_CACHE(cp) + 1;
5713 }
5714 }
5715 cpi->cpi_cache_leaf_size = size = i;
5716
5717 /*
5718 * Allocate the cpi_cache_leaves array. The first element
5719 * references the regs for the corresponding leaf with %ecx set
5720 * to 0. This was gathered in cpuid_pass_extended().
5721 */
5722 if (size > 0) {
5723 cpi->cpi_cache_leaves =
5724 kmem_alloc(size * sizeof (cp), KM_SLEEP);
5725 if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5726 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5727 } else {
5728 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5729 }
5730
5731 /*
5732 * Allocate storage to hold the additional regs
5733 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5734 *
5735 * The regs for the leaf, %ecx == 0 has already
5736 * been allocated as indicated above.
5737 */
5738 for (i = 1; i < size; i++) {
5739 cp = cpi->cpi_cache_leaves[i] =
5740 kmem_zalloc(sizeof (regs), KM_SLEEP);
5741 cp->cp_eax = leaf;
5742 cp->cp_ecx = i;
5743
5744 (void) __cpuid_insn(cp);
5745 }
5746 }
5747 /*
5748 * Determine the number of bits needed to represent
5749 * the number of CPUs sharing the last level cache.
5750 *
5751 * Shift off that number of bits from the APIC id to
5752 * derive the cache id.
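		 * For example, with 8 CPUs sharing the last level cache the
		 * loop below computes a shift of 3, so APIC ids 0-7 all map
		 * to cache id 0.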
5753 */
5754 shft = 0;
5755 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5756 shft++;
5757 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5758 }
5759
5760 /*
5761 * Now fixup the brand string
5762 */
5763 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5764 fabricate_brandstr(cpi);
5765 } else {
5766
5767 /*
5768 * If we successfully extracted a brand string from the cpuid
5769 * instruction, clean it up by removing leading spaces and
5770 * similar junk.
5771 */
5772 if (cpi->cpi_brandstr[0]) {
5773 size_t maxlen = sizeof (cpi->cpi_brandstr);
5774 char *src, *dst;
5775
5776 dst = src = (char *)cpi->cpi_brandstr;
5777 src[maxlen - 1] = '\0';
5778 /*
5779 * strip leading spaces
5780 */
5781 while (*src == ' ')
5782 src++;
5783 /*
5784 * Remove any 'Genuine' or "Authentic" prefixes
5785 */
5786 if (strncmp(src, "Genuine ", 8) == 0)
5787 src += 8;
5788 if (strncmp(src, "Authentic ", 10) == 0)
5789 src += 10;
5790
5791 /*
5792 * Now do an in-place copy.
5793 * Map (R) to (r) and (TM) to (tm).
5794 * The era of teletypes is long gone, and there's
5795 * -really- no need to shout.
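			 * e.g. "Intel(R) Xeon(R) CPU" becomes
			 * "Intel(r) Xeon(r) CPU".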
5796 */
5797 while (*src != '\0') {
5798 if (src[0] == '(') {
5799 if (strncmp(src + 1, "R)", 2) == 0) {
5800 (void) strncpy(dst, "(r)", 3);
5801 src += 3;
5802 dst += 3;
5803 continue;
5804 }
5805 if (strncmp(src + 1, "TM)", 3) == 0) {
5806 (void) strncpy(dst, "(tm)", 4);
5807 src += 4;
5808 dst += 4;
5809 continue;
5810 }
5811 }
5812 *dst++ = *src++;
5813 }
5814 *dst = '\0';
5815
5816 /*
5817 * Finally, remove any trailing spaces
5818 */
5819 while (--dst > cpi->cpi_brandstr)
5820 if (*dst == ' ')
5821 *dst = '\0';
5822 else
5823 break;
5824 } else
5825 fabricate_brandstr(cpi);
5826 }
5827 }
5828
5829 typedef struct {
5830 uint32_t avm_av;
5831 uint32_t avm_feat;
5832 } av_feat_map_t;
5833
5834 /*
5835 * These arrays are used to map features that we should add based on x86
5836 * features that are present. As a large number depend on kernel features,
5837 * rather than rechecking and clearing CPUID everywhere, we simply map these.
5838 * There is an array of these for each hwcap word. Some features aren't tracked
5839 * in the kernel x86 featureset and that's ok. They will not show up in here.
5840 */
5841 static const av_feat_map_t x86fset_to_av1[] = {
5842 { AV_386_CX8, X86FSET_CX8 },
5843 { AV_386_SEP, X86FSET_SEP },
5844 { AV_386_AMD_SYSC, X86FSET_ASYSC },
5845 { AV_386_CMOV, X86FSET_CMOV },
5846 { AV_386_FXSR, X86FSET_SSE },
5847 { AV_386_SSE, X86FSET_SSE },
5848 { AV_386_SSE2, X86FSET_SSE2 },
5849 { AV_386_SSE3, X86FSET_SSE3 },
5850 { AV_386_CX16, X86FSET_CX16 },
5851 { AV_386_TSCP, X86FSET_TSCP },
5852 { AV_386_AMD_SSE4A, X86FSET_SSE4A },
5853 { AV_386_SSSE3, X86FSET_SSSE3 },
5854 { AV_386_SSE4_1, X86FSET_SSE4_1 },
5855 { AV_386_SSE4_2, X86FSET_SSE4_2 },
5856 { AV_386_AES, X86FSET_AES },
5857 { AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5858 { AV_386_XSAVE, X86FSET_XSAVE },
5859 { AV_386_AVX, X86FSET_AVX },
5860 { AV_386_VMX, X86FSET_VMX },
5861 { AV_386_AMD_SVM, X86FSET_SVM }
5862 };
5863
5864 static const av_feat_map_t x86fset_to_av2[] = {
5865 { AV_386_2_F16C, X86FSET_F16C },
5866 { AV_386_2_RDRAND, X86FSET_RDRAND },
5867 { AV_386_2_BMI1, X86FSET_BMI1 },
5868 { AV_386_2_BMI2, X86FSET_BMI2 },
5869 { AV_386_2_FMA, X86FSET_FMA },
5870 { AV_386_2_AVX2, X86FSET_AVX2 },
5871 { AV_386_2_ADX, X86FSET_ADX },
5872 { AV_386_2_RDSEED, X86FSET_RDSEED },
5873 { AV_386_2_AVX512F, X86FSET_AVX512F },
5874 { AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5875 { AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5876 { AV_386_2_AVX512PF, X86FSET_AVX512PF },
5877 { AV_386_2_AVX512ER, X86FSET_AVX512ER },
5878 { AV_386_2_AVX512CD, X86FSET_AVX512CD },
5879 { AV_386_2_AVX512BW, X86FSET_AVX512BW },
5880 { AV_386_2_AVX512VL, X86FSET_AVX512VL },
5881 { AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5882 { AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5883 { AV_386_2_SHA, X86FSET_SHA },
5884 { AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5885 { AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5886 { AV_386_2_CLWB, X86FSET_CLWB },
5887 { AV_386_2_MONITORX, X86FSET_MONITORX },
5888 { AV_386_2_CLZERO, X86FSET_CLZERO },
5889 { AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5890 { AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5891 { AV_386_2_VAES, X86FSET_VAES },
5892 { AV_386_2_GFNI, X86FSET_GFNI },
5893 { AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5894 { AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5895 };
5896
5897 static const av_feat_map_t x86fset_to_av3[] = {
5898 { AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5899 { AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5900 };
5901
5902 /*
5903 * This routine is called out of bind_hwcap() much later in the life
5904 * of the kernel (post_startup()). The job of this routine is to resolve
5905 * the hardware feature support and kernel support for those features into
5906 * what we're actually going to tell applications via the aux vector.
5907 *
5908 * Most of the aux vector is derived from the x86_featureset array vector where
5909 * a given feature indicates that an aux vector should be plumbed through. This
5910 * allows the kernel to use one tracking mechanism for these based on whether or
5911 * not it has the required hardware support (most often xsave). Most newer
5912 * features are added there in case we need them in the kernel. Otherwise,
5913 * features are evaluated based on looking at the cpuid features that remain. If
5914 * you find yourself wanting to clear out cpuid features for some reason, they
5915 * should instead be driven by the feature set so we have a consistent view.
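 * For example, X86FSET_AVX in x86_featureset maps directly to AV_386_AVX in
 * the first hwcap word via x86fset_to_av1[] above.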
5916 */
5917
5918 static void
5919 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5920 {
5921 uint_t *hwcap_out = (uint_t *)arg;
5922 struct cpuid_info *cpi;
5923 uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5924
5925 cpi = cpu->cpu_m.mcpu_cpi;
5926
5927 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5928 if (is_x86_feature(x86_featureset,
5929 x86fset_to_av1[i].avm_feat)) {
5930 hwcap_flags |= x86fset_to_av1[i].avm_av;
5931 }
5932 }
5933
5934 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5935 if (is_x86_feature(x86_featureset,
5936 x86fset_to_av2[i].avm_feat)) {
5937 hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5938 }
5939 }
5940
5941 for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5942 if (is_x86_feature(x86_featureset,
5943 x86fset_to_av3[i].avm_feat)) {
5944 hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5945 }
5946 }
5947
5948 /*
5949 * From here on out we're working through features that don't have
5950 * corresponding kernel feature flags for various reasons that are
5951 * mostly just due to the historical implementation.
5952 */
5953 if (cpi->cpi_maxeax >= 1) {
5954 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5955 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5956
5957 *edx = CPI_FEATURES_EDX(cpi);
5958 *ecx = CPI_FEATURES_ECX(cpi);
5959
5960 /*
5961 * [no explicit support required beyond x87 fp context]
5962 */
5963 if (!fpu_exists)
5964 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5965
5966 /*
5967 * Now map the supported feature vector to things that we
5968 * think userland will care about.
5969 */
5970 if (*ecx & CPUID_INTC_ECX_MOVBE)
5971 hwcap_flags |= AV_386_MOVBE;
5972
5973 if (*ecx & CPUID_INTC_ECX_POPCNT)
5974 hwcap_flags |= AV_386_POPCNT;
5975 if (*edx & CPUID_INTC_EDX_FPU)
5976 hwcap_flags |= AV_386_FPU;
5977 if (*edx & CPUID_INTC_EDX_MMX)
5978 hwcap_flags |= AV_386_MMX;
5979 if (*edx & CPUID_INTC_EDX_TSC)
5980 hwcap_flags |= AV_386_TSC;
5981 }
5982
5983 /*
5984 * Check a few miscellaneous features.
5985 */
5986 if (cpi->cpi_xmaxeax < 0x80000001)
5987 goto resolve_done;
5988
5989 switch (cpi->cpi_vendor) {
5990 uint32_t *edx, *ecx;
5991
5992 case X86_VENDOR_Intel:
5993 /*
5994		 * Seems like Intel duplicated what was necessary
5995 * here to make the initial crop of 64-bit OS's work.
5996 * Hopefully, those are the only "extended" bits
5997 * they'll add.
5998 */
5999 /*FALLTHROUGH*/
6000
6001 case X86_VENDOR_AMD:
6002 case X86_VENDOR_HYGON:
6003 edx = &cpi->cpi_support[AMD_EDX_FEATURES];
6004 ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
6005
6006 *edx = CPI_FEATURES_XTD_EDX(cpi);
6007 *ecx = CPI_FEATURES_XTD_ECX(cpi);
6008
6009 /*
6010 * [no explicit support required beyond
6011 * x87 fp context and exception handlers]
6012 */
6013 if (!fpu_exists)
6014 *edx &= ~(CPUID_AMD_EDX_MMXamd |
6015 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
6016
6017 /*
6018 * Now map the supported feature vector to
6019 * things that we think userland will care about.
6020 */
6021 if (*edx & CPUID_AMD_EDX_MMXamd)
6022 hwcap_flags |= AV_386_AMD_MMX;
6023 if (*edx & CPUID_AMD_EDX_3DNow)
6024 hwcap_flags |= AV_386_AMD_3DNow;
6025 if (*edx & CPUID_AMD_EDX_3DNowx)
6026 hwcap_flags |= AV_386_AMD_3DNowx;
6027
6028 switch (cpi->cpi_vendor) {
6029 case X86_VENDOR_AMD:
6030 case X86_VENDOR_HYGON:
6031 if (*ecx & CPUID_AMD_ECX_AHF64)
6032 hwcap_flags |= AV_386_AHF;
6033 if (*ecx & CPUID_AMD_ECX_LZCNT)
6034 hwcap_flags |= AV_386_AMD_LZCNT;
6035 break;
6036
6037 case X86_VENDOR_Intel:
6038 if (*ecx & CPUID_AMD_ECX_LZCNT)
6039 hwcap_flags |= AV_386_AMD_LZCNT;
6040 /*
6041 * Aarrgh.
6042 * Intel uses a different bit in the same word.
6043 */
6044 if (*ecx & CPUID_INTC_ECX_AHF64)
6045 hwcap_flags |= AV_386_AHF;
6046 break;
6047 default:
6048 break;
6049 }
6050 break;
6051
6052 default:
6053 break;
6054 }
6055
6056 resolve_done:
6057 if (hwcap_out != NULL) {
6058 hwcap_out[0] = hwcap_flags;
6059 hwcap_out[1] = hwcap_flags_2;
6060 hwcap_out[2] = hwcap_flags_3;
6061 }
6062 }
6063
6064
6065 /*
6066 * Simulate the cpuid instruction using the data we previously
6067 * captured about this CPU. We try our best to return the truth
6068 * about the hardware, independently of kernel support.
6069 */
6070 uint32_t
6071 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
6072 {
6073 struct cpuid_info *cpi;
6074 struct cpuid_regs *xcp;
6075
6076 if (cpu == NULL)
6077 cpu = CPU;
6078 cpi = cpu->cpu_m.mcpu_cpi;
6079
6080 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6081
6082 /*
6083 * CPUID data is cached in two separate places: cpi_std for standard
6084	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
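	 * For example, a request for leaf 0x80000008 is satisfied from
	 * cpi_extd[8].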
6085 */
6086 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
6087 xcp = &cpi->cpi_std[cp->cp_eax];
6088 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
6089 cp->cp_eax <= cpi->cpi_xmaxeax &&
6090 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
6091 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
6092 } else {
6093 /*
6094 * The caller is asking for data from an input parameter which
6095 * the kernel has not cached. In this case we go fetch from
6096 * the hardware and return the data directly to the user.
6097 */
6098 return (__cpuid_insn(cp));
6099 }
6100
6101 cp->cp_eax = xcp->cp_eax;
6102 cp->cp_ebx = xcp->cp_ebx;
6103 cp->cp_ecx = xcp->cp_ecx;
6104 cp->cp_edx = xcp->cp_edx;
6105 return (cp->cp_eax);
6106 }
6107
6108 boolean_t
6109 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
6110 {
6111 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
6112 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
6113 }
6114
6115 int
6116 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
6117 {
6118 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6119
6120 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
6121 }
6122
6123 int
6124 cpuid_is_cmt(cpu_t *cpu)
6125 {
6126 if (cpu == NULL)
6127 cpu = CPU;
6128
6129 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6130
6131 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
6132 }
6133
6134 /*
6135 * AMD and Intel both implement the 64-bit variant of the syscall
6136 * instruction (syscallq), so if there's -any- support for syscall,
6137 * cpuid currently says "yes, we support this".
6138 *
6139 * However, Intel decided to -not- implement the 32-bit variant of the
6140 * syscall instruction, so we provide a predicate to allow our caller
6141 * to test that subtlety here.
6142 *
6143 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor,
6144 * even in the case where the hardware would in fact support it.
6145 */
6146 /*ARGSUSED*/
6147 int
6148 cpuid_syscall32_insn(cpu_t *cpu)
6149 {
6150 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
6151
6152 #if !defined(__xpv)
6153 if (cpu == NULL)
6154 cpu = CPU;
6155
6156 /*CSTYLED*/
6157 {
6158 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6159
6160 if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
6161 cpi->cpi_vendor == X86_VENDOR_HYGON) &&
6162 cpi->cpi_xmaxeax >= 0x80000001 &&
6163 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
6164 return (1);
6165 }
6166 #endif
6167 return (0);
6168 }
6169
6170 int
6171 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
6172 {
6173 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6174
6175 static const char fmt[] =
6176 "x86 (%s %X family %d model %d step %d clock %d MHz)";
6177 static const char fmt_ht[] =
6178 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6179
6180 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6181
6182 if (cpuid_is_cmt(cpu))
6183 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6184 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6185 cpi->cpi_family, cpi->cpi_model,
6186 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6187 return (snprintf(s, n, fmt,
6188 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6189 cpi->cpi_family, cpi->cpi_model,
6190 cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6191 }
6192
6193 const char *
6194 cpuid_getvendorstr(cpu_t *cpu)
6195 {
6196 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6197 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6198 }
6199
6200 uint_t
6201 cpuid_getvendor(cpu_t *cpu)
6202 {
6203 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6204 return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6205 }
6206
6207 uint_t
6208 cpuid_getfamily(cpu_t *cpu)
6209 {
6210 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6211 return (cpu->cpu_m.mcpu_cpi->cpi_family);
6212 }
6213
6214 uint_t
6215 cpuid_getmodel(cpu_t *cpu)
6216 {
6217 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6218 return (cpu->cpu_m.mcpu_cpi->cpi_model);
6219 }
6220
6221 uint_t
6222 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6223 {
6224 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6225 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6226 }
6227
6228 uint_t
6229 cpuid_get_ncore_per_chip(cpu_t *cpu)
6230 {
6231 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6232 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6233 }
6234
6235 uint_t
6236 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6237 {
6238 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6239 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6240 }
6241
6242 id_t
6243 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6244 {
6245 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6246 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6247 }
6248
6249 uint_t
6250 cpuid_getstep(cpu_t *cpu)
6251 {
6252 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6253 return (cpu->cpu_m.mcpu_cpi->cpi_step);
6254 }
6255
6256 uint_t
6257 cpuid_getsig(struct cpu *cpu)
6258 {
6259 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6260 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6261 }
6262
6263 x86_chiprev_t
6264 cpuid_getchiprev(struct cpu *cpu)
6265 {
6266 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6267 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6268 }
6269
6270 const char *
6271 cpuid_getchiprevstr(struct cpu *cpu)
6272 {
6273 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6274 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6275 }
6276
6277 uint32_t
6278 cpuid_getsockettype(struct cpu *cpu)
6279 {
6280 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6281 return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6282 }
6283
6284 const char *
6285 cpuid_getsocketstr(cpu_t *cpu)
6286 {
6287 static const char *socketstr = NULL;
6288 struct cpuid_info *cpi;
6289
6290 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6291 cpi = cpu->cpu_m.mcpu_cpi;
6292
6293 /* Assume that socket types are the same across the system */
6294 if (socketstr == NULL)
6295 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6296 cpi->cpi_model, cpi->cpi_step);
6297
6298
6299 return (socketstr);
6300 }
6301
6302 x86_uarchrev_t
6303 cpuid_getuarchrev(cpu_t *cpu)
6304 {
6305 return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6306 }
6307
6308 int
6309 cpuid_get_chipid(cpu_t *cpu)
6310 {
6311 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6312
6313 if (cpuid_is_cmt(cpu))
6314 return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6315 return (cpu->cpu_id);
6316 }
6317
6318 id_t
6319 cpuid_get_coreid(cpu_t *cpu)
6320 {
6321 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6322 return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6323 }
6324
6325 int
6326 cpuid_get_pkgcoreid(cpu_t *cpu)
6327 {
6328 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6329 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6330 }
6331
6332 int
6333 cpuid_get_clogid(cpu_t *cpu)
6334 {
6335 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6336 return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6337 }
6338
6339 int
6340 cpuid_get_cacheid(cpu_t *cpu)
6341 {
6342 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6343 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6344 }
6345
6346 uint_t
6347 cpuid_get_procnodeid(cpu_t *cpu)
6348 {
6349 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6350 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6351 }
6352
6353 uint_t
6354 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6355 {
6356 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6357 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6358 }
6359
6360 uint_t
6361 cpuid_get_compunitid(cpu_t *cpu)
6362 {
6363 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6364 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6365 }
6366
6367 uint_t
6368 cpuid_get_cores_per_compunit(cpu_t *cpu)
6369 {
6370 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6371 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6372 }
6373
6374 uint32_t
6375 cpuid_get_apicid(cpu_t *cpu)
6376 {
6377 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6378 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6379 return (UINT32_MAX);
6380 } else {
6381 return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6382 }
6383 }
6384
6385 void
6386 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6387 {
6388 struct cpuid_info *cpi;
6389
6390 if (cpu == NULL)
6391 cpu = CPU;
6392 cpi = cpu->cpu_m.mcpu_cpi;
6393
6394 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6395
6396 if (pabits)
6397 *pabits = cpi->cpi_pabits;
6398 if (vabits)
6399 *vabits = cpi->cpi_vabits;
6400 }
6401
6402 size_t
6403 cpuid_get_xsave_size(void)
6404 {
6405 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6406 sizeof (struct xsave_state)));
6407 }
6408
6409 /*
6410 * Export information about known offsets to the kernel. We only care about
6411 * things we have actually enabled support for in %xcr0.
6412 */
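/*
 * Illustrative (hypothetical) usage sketch, not taken from an actual caller:
 * code that wants the geometry of the AVX (YMM) save area could do
 *
 *	size_t ymm_size, ymm_off;
 *	cpuid_get_xsave_info(XFEATURE_AVX, &ymm_size, &ymm_off);
 *
 * Either pointer may be NULL when only one of the two values is needed,
 * since the function substitutes a local scratch variable in that case.
 */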
6413 void
6414 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6415 {
6416 size_t size, off;
6417
6418 VERIFY3U(bit & xsave_bv_all, !=, 0);
6419
6420 if (sizep == NULL)
6421 sizep = &size;
6422 if (offp == NULL)
6423 offp = &off;
6424
6425 switch (bit) {
6426 case XFEATURE_LEGACY_FP:
6427 case XFEATURE_SSE:
6428 *sizep = sizeof (struct fxsave_state);
6429 *offp = 0;
6430 break;
6431 case XFEATURE_AVX:
6432 *sizep = cpuid_info0.cpi_xsave.ymm_size;
6433 *offp = cpuid_info0.cpi_xsave.ymm_offset;
6434 break;
6435 case XFEATURE_AVX512_OPMASK:
6436 *sizep = cpuid_info0.cpi_xsave.opmask_size;
6437 *offp = cpuid_info0.cpi_xsave.opmask_offset;
6438 break;
6439 case XFEATURE_AVX512_ZMM:
6440 *sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6441 *offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6442 break;
6443 case XFEATURE_AVX512_HI_ZMM:
6444 *sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6445 *offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6446 break;
6447 default:
6448 panic("asked for unsupported xsave feature: 0x%lx", bit);
6449 }
6450 }
6451
6452 /*
6453 * Return true if the CPUs on this system require 'pointer clearing' for the
6454 * floating point error pointer exception handling. In the past, this has been
6455 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6456 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6457 * feature bit and is reflected in the cpi_fp_amd_save member.
6458 */
6459 boolean_t
6460 cpuid_need_fp_excp_handling(void)
6461 {
6462 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6463 cpuid_info0.cpi_fp_amd_save != 0);
6464 }
6465
6466 /*
6467 * Returns the number of data TLB entries for a corresponding
6468 * pagesize. If it can't be computed, or isn't known, the
6469 * routine returns zero. If you ask about an architecturally
6470 * impossible pagesize, the routine will panic (so that the
6471 * hat implementor knows that things are inconsistent.)
6472 */
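/*
 * Worked example (hypothetical register value) of the 4K-page decode done
 * below against leaf 0x80000006 %ebx: with cp_ebx == 0x10401020 the top 16
 * bits are non-zero, so the TLB is split rather than unified and the D-TLB
 * entry count comes from bits 27:16, i.e. 0x040 == 64 entries. Had the top
 * 16 bits been zero, the unified entry count would instead come from the
 * low 16 bits.
 */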
6473 uint_t
6474 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6475 {
6476 struct cpuid_info *cpi;
6477 uint_t dtlb_nent = 0;
6478
6479 if (cpu == NULL)
6480 cpu = CPU;
6481 cpi = cpu->cpu_m.mcpu_cpi;
6482
6483 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6484
6485 /*
6486 * Check the L2 TLB info
6487 */
6488 if (cpi->cpi_xmaxeax >= 0x80000006) {
6489 struct cpuid_regs *cp = &cpi->cpi_extd[6];
6490
6491 switch (pagesize) {
6492
6493 case 4 * 1024:
6494 /*
6495 * All zero in the top 16 bits of the register
6496 * indicates a unified TLB. Size is in low 16 bits.
6497 */
6498 if ((cp->cp_ebx & 0xffff0000) == 0)
6499 dtlb_nent = cp->cp_ebx & 0x0000ffff;
6500 else
6501 dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6502 break;
6503
6504 case 2 * 1024 * 1024:
6505 if ((cp->cp_eax & 0xffff0000) == 0)
6506 dtlb_nent = cp->cp_eax & 0x0000ffff;
6507 else
6508 dtlb_nent = BITX(cp->cp_eax, 27, 16);
6509 break;
6510
6511 default:
6512 panic("unknown L2 pagesize");
6513 /*NOTREACHED*/
6514 }
6515 }
6516
6517 if (dtlb_nent != 0)
6518 return (dtlb_nent);
6519
6520 /*
6521 * No L2 TLB support for this size, try L1.
6522 */
6523 if (cpi->cpi_xmaxeax >= 0x80000005) {
6524 struct cpuid_regs *cp = &cpi->cpi_extd[5];
6525
6526 switch (pagesize) {
6527 case 4 * 1024:
6528 dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6529 break;
6530 case 2 * 1024 * 1024:
6531 dtlb_nent = BITX(cp->cp_eax, 23, 16);
6532 break;
6533 default:
6534 panic("unknown L1 d-TLB pagesize");
6535 /*NOTREACHED*/
6536 }
6537 }
6538
6539 return (dtlb_nent);
6540 }
6541
6542 /*
6543 * Return 0 if the erratum is not present or not applicable, positive
6544 * if it is, and negative if the status of the erratum is unknown.
6545 *
6546 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6547 * Processors" #25759, Rev 3.57, August 2005
6548 */
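/*
 * Illustrative caller pattern (a sketch only; real callers live elsewhere in
 * the kernel):
 *
 *	if (cpuid_opteron_erratum(CPU, 95) > 0) {
 *		... apply the documented workaround ...
 *	}
 *
 * A negative return means the erratum number is not known to this table and
 * the caller must decide how conservative to be.
 */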
6549 int
6550 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6551 {
6552 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6553 uint_t eax;
6554
6555 /*
6556 * Bail out if this CPU isn't an AMD CPU, or if it's
6557 * a legacy (32-bit) AMD CPU.
6558 */
6559 if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6560 cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6561 cpi->cpi_family == 6) {
6562 return (0);
6563 }
6564
6565 eax = cpi->cpi_std[1].cp_eax;
6566
6567 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50)
6568 #define SH_B3(eax) (eax == 0xf51)
6569 #define B(eax) (SH_B0(eax) || SH_B3(eax))
6570
6571 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58)
6572
6573 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6574 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6575 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2)
6576 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6577
6578 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6579 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0)
6580 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0)
6581 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6582
6583 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6584 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */
6585 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0)
6586 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71)
6587 #define BH_E4(eax) (eax == 0x20fb1)
6588 #define SH_E5(eax) (eax == 0x20f42)
6589 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2)
6590 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32)
6591 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6592 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6593 DH_E6(eax) || JH_E6(eax))
6594
6595 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6596 #define DR_B0(eax) (eax == 0x100f20)
6597 #define DR_B1(eax) (eax == 0x100f21)
6598 #define DR_BA(eax) (eax == 0x100f2a)
6599 #define DR_B2(eax) (eax == 0x100f22)
6600 #define DR_B3(eax) (eax == 0x100f23)
6601 #define RB_C0(eax) (eax == 0x100f40)
6602
6603 switch (erratum) {
6604 case 1:
6605 return (cpi->cpi_family < 0x10);
6606 case 51: /* what does the asterisk mean? */
6607 return (B(eax) || SH_C0(eax) || CG(eax));
6608 case 52:
6609 return (B(eax));
6610 case 57:
6611 return (cpi->cpi_family <= 0x11);
6612 case 58:
6613 return (B(eax));
6614 case 60:
6615 return (cpi->cpi_family <= 0x11);
6616 case 61:
6617 case 62:
6618 case 63:
6619 case 64:
6620 case 65:
6621 case 66:
6622 case 68:
6623 case 69:
6624 case 70:
6625 case 71:
6626 return (B(eax));
6627 case 72:
6628 return (SH_B0(eax));
6629 case 74:
6630 return (B(eax));
6631 case 75:
6632 return (cpi->cpi_family < 0x10);
6633 case 76:
6634 return (B(eax));
6635 case 77:
6636 return (cpi->cpi_family <= 0x11);
6637 case 78:
6638 return (B(eax) || SH_C0(eax));
6639 case 79:
6640 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6641 case 80:
6642 case 81:
6643 case 82:
6644 return (B(eax));
6645 case 83:
6646 return (B(eax) || SH_C0(eax) || CG(eax));
6647 case 85:
6648 return (cpi->cpi_family < 0x10);
6649 case 86:
6650 return (SH_C0(eax) || CG(eax));
6651 case 88:
6652 return (B(eax) || SH_C0(eax));
6653 case 89:
6654 return (cpi->cpi_family < 0x10);
6655 case 90:
6656 return (B(eax) || SH_C0(eax) || CG(eax));
6657 case 91:
6658 case 92:
6659 return (B(eax) || SH_C0(eax));
6660 case 93:
6661 return (SH_C0(eax));
6662 case 94:
6663 return (B(eax) || SH_C0(eax) || CG(eax));
6664 case 95:
6665 return (B(eax) || SH_C0(eax));
6666 case 96:
6667 return (B(eax) || SH_C0(eax) || CG(eax));
6668 case 97:
6669 case 98:
6670 return (SH_C0(eax) || CG(eax));
6671 case 99:
6672 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6673 case 100:
6674 return (B(eax) || SH_C0(eax));
6675 case 101:
6676 case 103:
6677 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6678 case 104:
6679 return (SH_C0(eax) || CG(eax) || D0(eax));
6680 case 105:
6681 case 106:
6682 case 107:
6683 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6684 case 108:
6685 return (DH_CG(eax));
6686 case 109:
6687 return (SH_C0(eax) || CG(eax) || D0(eax));
6688 case 110:
6689 return (D0(eax) || EX(eax));
6690 case 111:
6691 return (CG(eax));
6692 case 112:
6693 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6694 case 113:
6695 return (eax == 0x20fc0);
6696 case 114:
6697 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6698 case 115:
6699 return (SH_E0(eax) || JH_E1(eax));
6700 case 116:
6701 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6702 case 117:
6703 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6704 case 118:
6705 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6706 JH_E6(eax));
6707 case 121:
6708 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6709 case 122:
6710 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6711 case 123:
6712 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6713 case 131:
6714 return (cpi->cpi_family < 0x10);
6715 case 6336786:
6716
6717 /*
6718 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6719 * if this is a K8 family or newer processor. We're testing for
6720 * this 'erratum' to determine whether or not we have a constant
6721 * TSC.
6722 *
6723 * Our current fix for this is to disable the C1-Clock ramping.
6724 * However, this doesn't work on newer processor families nor
6725 * does it work when virtualized as those devices don't exist.
6726 */
6727 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6728 return (0);
6729 }
6730
6731 if (CPI_FAMILY(cpi) == 0xf) {
6732 struct cpuid_regs regs;
6733 regs.cp_eax = 0x80000007;
6734 (void) __cpuid_insn(&regs);
6735 return (!(regs.cp_edx & 0x100));
6736 }
6737 return (0);
6738 case 147:
6739 /*
6740 * This erratum (K8 #147) is not present on family 10 and newer.
6741 */
6742 if (cpi->cpi_family >= 0x10) {
6743 return (0);
6744 }
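/*
 * The expression below sums the base and extended family fields into
 * bits 15:8 and packs the model (extended model in the high nibble,
 * base model in the low nibble) into bits 7:0, yielding a 0xFMm-style
 * value. Parts whose combined value is below 0xf40 (family 0xf with a
 * model byte below 0x40) report the erratum as present.
 */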
6745 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6746 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6747
6748 case 6671130:
6749 /*
6750 * check for processors (pre-Shanghai) that do not provide
6751 * optimal management of 1gb ptes in their tlb.
6752 */
6753 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6754
6755 case 298:
6756 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6757 DR_B2(eax) || RB_C0(eax));
6758
6759 case 721:
6760 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6761
6762 default:
6763 return (-1);
6764
6765 }
6766 }
6767
6768 /*
6769 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6770 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6771 */
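/*
 * Illustrative decode of the OSVW status lookup done below: each status MSR
 * carries OSVW_ID_CNT_PER_MSR bits, so an osvwid of 0 is bit 0 of
 * MSR_AMD_OSVW_STATUS itself, while a hypothetical osvwid equal to
 * OSVW_ID_CNT_PER_MSR would be bit 0 of the next status MSR.
 */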
6772 int
6773 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6774 {
6775 struct cpuid_info *cpi;
6776 uint_t osvwid;
6777 static int osvwfeature = -1;
6778 uint64_t osvwlength;
6779
6780
6781 cpi = cpu->cpu_m.mcpu_cpi;
6782
6783 /* confirm OSVW supported */
6784 if (osvwfeature == -1) {
6785 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6786 } else {
6787 /* assert that osvw feature setting is consistent on all cpus */
6788 ASSERT(osvwfeature ==
6789 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6790 }
6791 if (!osvwfeature)
6792 return (-1);
6793
6794 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6795
6796 switch (erratum) {
6797 case 298: /* osvwid is 0 */
6798 osvwid = 0;
6799 if (osvwlength <= (uint64_t)osvwid) {
6800 /* osvwid 0 is unknown */
6801 return (-1);
6802 }
6803
6804 /*
6805 * Check the OSVW STATUS MSR to determine the state
6806 * of the erratum where:
6807 * 0 - fixed by HW
6808 * 1 - BIOS has applied the workaround when BIOS
6809 * workaround is available. (Or for other errata,
6810 * OS workaround is required.)
6811 * For a value of 1, caller will confirm that the
6812 * erratum 298 workaround has indeed been applied by BIOS.
6813 *
6814 * A 1 may be set in cpus that have a HW fix
6815 * in a mixed cpu system. Regarding erratum 298:
6816 * In a multiprocessor platform, the workaround above
6817 * should be applied to all processors regardless of
6818 * silicon revision when an affected processor is
6819 * present.
6820 */
6821
6822 return (rdmsr(MSR_AMD_OSVW_STATUS +
6823 (osvwid / OSVW_ID_CNT_PER_MSR)) &
6824 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6825
6826 default:
6827 return (-1);
6828 }
6829 }
6830
6831 static const char assoc_str[] = "associativity";
6832 static const char line_str[] = "line-size";
6833 static const char size_str[] = "size";
6834
6835 static void
6836 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6837 uint32_t val)
6838 {
6839 char buf[128];
6840
6841 /*
6842 * ndi_prop_update_int() is used because it is desirable for
6843 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6844 */
6845 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6846 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6847 }
6848
6849 /*
6850 * Intel-style cache/tlb description
6851 *
6852 * Standard cpuid level 2 gives a randomly ordered
6853 * selection of tags that index into a table that describes
6854 * cache and tlb properties.
6855 */
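/*
 * As an illustration of the table below: a leaf 2 descriptor byte of 0x2c
 * maps to an 8-way, 64-byte-line, 32KB l1-dcache, while a descriptor of
 * 0x03 maps to a 4-way, 64-entry dtlb-4K. Descriptor 0x49 is special-cased
 * in intel_walk_cacheinfo() because its meaning depends on the processor,
 * and leaf 4 is used to disambiguate it when available.
 */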
6856
6857 static const char l1_icache_str[] = "l1-icache";
6858 static const char l1_dcache_str[] = "l1-dcache";
6859 static const char l2_cache_str[] = "l2-cache";
6860 static const char l3_cache_str[] = "l3-cache";
6861 static const char itlb4k_str[] = "itlb-4K";
6862 static const char dtlb4k_str[] = "dtlb-4K";
6863 static const char itlb2M_str[] = "itlb-2M";
6864 static const char itlb4M_str[] = "itlb-4M";
6865 static const char dtlb4M_str[] = "dtlb-4M";
6866 static const char dtlb24_str[] = "dtlb0-2M-4M";
6867 static const char itlb424_str[] = "itlb-4K-2M-4M";
6868 static const char itlb24_str[] = "itlb-2M-4M";
6869 static const char dtlb44_str[] = "dtlb-4K-4M";
6870 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6871 static const char sl2_cache_str[] = "sectored-l2-cache";
6872 static const char itrace_str[] = "itrace-cache";
6873 static const char sl3_cache_str[] = "sectored-l3-cache";
6874 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6875
6876 static const struct cachetab {
6877 uint8_t ct_code;
6878 uint8_t ct_assoc;
6879 uint16_t ct_line_size;
6880 size_t ct_size;
6881 const char *ct_label;
6882 } intel_ctab[] = {
6883 /*
6884 * maintain descending order!
6885 *
6886 * Codes ignored - Reason
6887 * ----------------------
6888 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6889 * f0H/f1H - Currently we do not interpret prefetch size by design
6890 */
6891 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6892 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6893 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6894 { 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6895 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6896 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6897 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6898 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6899 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6900 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6901 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6902 { 0xd0, 4, 64, 512*1024, l3_cache_str},
6903 { 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6904 { 0xc0, 4, 0, 8, dtlb44_str },
6905 { 0xba, 4, 0, 64, dtlb4k_str },
6906 { 0xb4, 4, 0, 256, dtlb4k_str },
6907 { 0xb3, 4, 0, 128, dtlb4k_str },
6908 { 0xb2, 4, 0, 64, itlb4k_str },
6909 { 0xb0, 4, 0, 128, itlb4k_str },
6910 { 0x87, 8, 64, 1024*1024, l2_cache_str},
6911 { 0x86, 4, 64, 512*1024, l2_cache_str},
6912 { 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6913 { 0x84, 8, 32, 1024*1024, l2_cache_str},
6914 { 0x83, 8, 32, 512*1024, l2_cache_str},
6915 { 0x82, 8, 32, 256*1024, l2_cache_str},
6916 { 0x80, 8, 64, 512*1024, l2_cache_str},
6917 { 0x7f, 2, 64, 512*1024, l2_cache_str},
6918 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6919 { 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6920 { 0x7b, 8, 64, 512*1024, sl2_cache_str},
6921 { 0x7a, 8, 64, 256*1024, sl2_cache_str},
6922 { 0x79, 8, 64, 128*1024, sl2_cache_str},
6923 { 0x78, 8, 64, 1024*1024, l2_cache_str},
6924 { 0x73, 8, 0, 64*1024, itrace_str},
6925 { 0x72, 8, 0, 32*1024, itrace_str},
6926 { 0x71, 8, 0, 16*1024, itrace_str},
6927 { 0x70, 8, 0, 12*1024, itrace_str},
6928 { 0x68, 4, 64, 32*1024, sl1_dcache_str},
6929 { 0x67, 4, 64, 16*1024, sl1_dcache_str},
6930 { 0x66, 4, 64, 8*1024, sl1_dcache_str},
6931 { 0x60, 8, 64, 16*1024, sl1_dcache_str},
6932 { 0x5d, 0, 0, 256, dtlb44_str},
6933 { 0x5c, 0, 0, 128, dtlb44_str},
6934 { 0x5b, 0, 0, 64, dtlb44_str},
6935 { 0x5a, 4, 0, 32, dtlb24_str},
6936 { 0x59, 0, 0, 16, dtlb4k_str},
6937 { 0x57, 4, 0, 16, dtlb4k_str},
6938 { 0x56, 4, 0, 16, dtlb4M_str},
6939 { 0x55, 0, 0, 7, itlb24_str},
6940 { 0x52, 0, 0, 256, itlb424_str},
6941 { 0x51, 0, 0, 128, itlb424_str},
6942 { 0x50, 0, 0, 64, itlb424_str},
6943 { 0x4f, 0, 0, 32, itlb4k_str},
6944 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6945 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6946 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6947 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6948 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6949 { 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6950 { 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6951 { 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6952 { 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6953 { 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6954 { 0x44, 4, 32, 1024*1024, l2_cache_str},
6955 { 0x43, 4, 32, 512*1024, l2_cache_str},
6956 { 0x42, 4, 32, 256*1024, l2_cache_str},
6957 { 0x41, 4, 32, 128*1024, l2_cache_str},
6958 { 0x3e, 4, 64, 512*1024, sl2_cache_str},
6959 { 0x3d, 6, 64, 384*1024, sl2_cache_str},
6960 { 0x3c, 4, 64, 256*1024, sl2_cache_str},
6961 { 0x3b, 2, 64, 128*1024, sl2_cache_str},
6962 { 0x3a, 6, 64, 192*1024, sl2_cache_str},
6963 { 0x39, 4, 64, 128*1024, sl2_cache_str},
6964 { 0x30, 8, 64, 32*1024, l1_icache_str},
6965 { 0x2c, 8, 64, 32*1024, l1_dcache_str},
6966 { 0x29, 8, 64, 4096*1024, sl3_cache_str},
6967 { 0x25, 8, 64, 2048*1024, sl3_cache_str},
6968 { 0x23, 8, 64, 1024*1024, sl3_cache_str},
6969 { 0x22, 4, 64, 512*1024, sl3_cache_str},
6970 { 0x0e, 6, 64, 24*1024, l1_dcache_str},
6971 { 0x0d, 4, 32, 16*1024, l1_dcache_str},
6972 { 0x0c, 4, 32, 16*1024, l1_dcache_str},
6973 { 0x0b, 4, 0, 4, itlb4M_str},
6974 { 0x0a, 2, 32, 8*1024, l1_dcache_str},
6975 { 0x08, 4, 32, 16*1024, l1_icache_str},
6976 { 0x06, 4, 32, 8*1024, l1_icache_str},
6977 { 0x05, 4, 0, 32, dtlb4M_str},
6978 { 0x04, 4, 0, 8, dtlb4M_str},
6979 { 0x03, 4, 0, 64, dtlb4k_str},
6980 { 0x02, 4, 0, 2, itlb4M_str},
6981 { 0x01, 4, 0, 32, itlb4k_str},
6982 { 0 }
6983 };
6984
6985 static const struct cachetab cyrix_ctab[] = {
6986 { 0x70, 4, 0, 32, "tlb-4K" },
6987 { 0x80, 4, 16, 16*1024, "l1-cache" },
6988 { 0 }
6989 };
6990
6991 /*
6992 * Search a cache table for a matching entry
6993 */
6994 static const struct cachetab *
6995 find_cacheent(const struct cachetab *ct, uint_t code)
6996 {
6997 if (code != 0) {
6998 for (; ct->ct_code != 0; ct++)
6999 if (ct->ct_code <= code)
7000 break;
7001 if (ct->ct_code == code)
7002 return (ct);
7003 }
7004 return (NULL);
7005 }
7006
7007 /*
7008 * Populate cachetab entry with L2 or L3 cache-information using
7009 * cpuid function 4. This function is called from intel_walk_cacheinfo()
7010 * when descriptor 0x49 is encountered. It returns 0 if no such cache
7011 * information is found.
7012 */
7013 static int
7014 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
7015 {
7016 uint32_t level, i;
7017 int ret = 0;
7018
7019 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
7020 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
7021
7022 if (level == 2 || level == 3) {
7023 ct->ct_assoc =
7024 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
7025 ct->ct_line_size =
7026 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
7027 ct->ct_size = ct->ct_assoc *
7028 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
7029 ct->ct_line_size *
7030 (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
7031
7032 if (level == 2) {
7033 ct->ct_label = l2_cache_str;
7034 } else if (level == 3) {
7035 ct->ct_label = l3_cache_str;
7036 }
7037 ret = 1;
7038 }
7039 }
7040
7041 return (ret);
7042 }
7043
7044 /*
7045 * Walk the cacheinfo descriptor, applying 'func' to every valid element
7046 * The walk is terminated if the walker returns non-zero.
7047 */
7048 static void
7049 intel_walk_cacheinfo(struct cpuid_info *cpi,
7050 void *arg, int (*func)(void *, const struct cachetab *))
7051 {
7052 const struct cachetab *ct;
7053 struct cachetab des_49_ct, des_b1_ct;
7054 uint8_t *dp;
7055 int i;
7056
7057 if ((dp = cpi->cpi_cacheinfo) == NULL)
7058 return;
7059 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7060 /*
7061 * For overloaded descriptor 0x49 we use cpuid function 4
7062 * if supported by the current processor, to create
7063 * cache information.
7064 * For overloaded descriptor 0xb1 we use X86_PAE flag
7065 * to disambiguate the cache information.
7066 */
7067 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
7068 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
7069 ct = &des_49_ct;
7070 } else if (*dp == 0xb1) {
7071 des_b1_ct.ct_code = 0xb1;
7072 des_b1_ct.ct_assoc = 4;
7073 des_b1_ct.ct_line_size = 0;
7074 if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
7075 des_b1_ct.ct_size = 8;
7076 des_b1_ct.ct_label = itlb2M_str;
7077 } else {
7078 des_b1_ct.ct_size = 4;
7079 des_b1_ct.ct_label = itlb4M_str;
7080 }
7081 ct = &des_b1_ct;
7082 } else {
7083 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
7084 continue;
7085 }
7086 }
7087
7088 if (func(arg, ct) != 0) {
7089 break;
7090 }
7091 }
7092 }
7093
7094 /*
7095 * (Like the Intel one, except for Cyrix CPUs)
7096 */
7097 static void
7098 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
7099 void *arg, int (*func)(void *, const struct cachetab *))
7100 {
7101 const struct cachetab *ct;
7102 uint8_t *dp;
7103 int i;
7104
7105 if ((dp = cpi->cpi_cacheinfo) == NULL)
7106 return;
7107 for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7108 /*
7109 * Search Cyrix-specific descriptor table first ..
7110 */
7111 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
7112 if (func(arg, ct) != 0)
7113 break;
7114 continue;
7115 }
7116 /*
7117 * .. else fall back to the Intel one
7118 */
7119 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
7120 if (func(arg, ct) != 0)
7121 break;
7122 continue;
7123 }
7124 }
7125 }
7126
7127 /*
7128 * A cacheinfo walker that adds associativity, line-size, and size properties
7129 * to the devinfo node it is passed as an argument.
7130 */
7131 static int
7132 add_cacheent_props(void *arg, const struct cachetab *ct)
7133 {
7134 dev_info_t *devi = arg;
7135
7136 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
7137 if (ct->ct_line_size != 0)
7138 add_cache_prop(devi, ct->ct_label, line_str,
7139 ct->ct_line_size);
7140 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
7141 return (0);
7142 }
7143
7144
7145 static const char fully_assoc[] = "fully-associative?";
7146
7147 /*
7148 * AMD style cache/tlb description
7149 *
7150 * Extended functions 5 and 6 directly describe properties of
7151 * tlbs and various cache levels.
7152 */
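/*
 * Worked example (hypothetical value) of the extended-leaf-5 L1 d-cache
 * decode performed in amd_cache_info() below: with %ecx == 0x40020140 the
 * fields unpack as a size of 0x40 (64KB), an associativity of 0x02 (2-way),
 * one line per tag, and a 0x40 (64) byte line size, read from bits 31:24,
 * 23:16, 15:8 and 7:0 respectively.
 */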
7153 static void
7154 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7155 {
7156 switch (assoc) {
7157 case 0: /* reserved; ignore */
7158 break;
7159 default:
7160 add_cache_prop(devi, label, assoc_str, assoc);
7161 break;
7162 case 0xff:
7163 add_cache_prop(devi, label, fully_assoc, 1);
7164 break;
7165 }
7166 }
7167
7168 static void
7169 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7170 {
7171 if (size == 0)
7172 return;
7173 add_cache_prop(devi, label, size_str, size);
7174 add_amd_assoc(devi, label, assoc);
7175 }
7176
7177 static void
7178 add_amd_cache(dev_info_t *devi, const char *label,
7179 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7180 {
7181 if (size == 0 || line_size == 0)
7182 return;
7183 add_amd_assoc(devi, label, assoc);
7184 /*
7185 * Most AMD parts have a sectored cache. Multiple cache lines are
7186 * associated with each tag. A sector consists of all cache lines
7187 * associated with a tag. For example, the AMD K6-III has a sector
7188 * size of 2 cache lines per tag.
7189 */
7190 if (lines_per_tag != 0)
7191 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7192 add_cache_prop(devi, label, line_str, line_size);
7193 add_cache_prop(devi, label, size_str, size * 1024);
7194 }
7195
7196 static void
7197 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7198 {
7199 switch (assoc) {
7200 case 0: /* off */
7201 break;
7202 case 1:
7203 case 2:
7204 case 4:
7205 add_cache_prop(devi, label, assoc_str, assoc);
7206 break;
7207 case 6:
7208 add_cache_prop(devi, label, assoc_str, 8);
7209 break;
7210 case 8:
7211 add_cache_prop(devi, label, assoc_str, 16);
7212 break;
7213 case 0xf:
7214 add_cache_prop(devi, label, fully_assoc, 1);
7215 break;
7216 default: /* reserved; ignore */
7217 break;
7218 }
7219 }
7220
7221 static void
7222 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7223 {
7224 if (size == 0 || assoc == 0)
7225 return;
7226 add_amd_l2_assoc(devi, label, assoc);
7227 add_cache_prop(devi, label, size_str, size);
7228 }
7229
7230 static void
7231 add_amd_l2_cache(dev_info_t *devi, const char *label,
7232 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7233 {
7234 if (size == 0 || assoc == 0 || line_size == 0)
7235 return;
7236 add_amd_l2_assoc(devi, label, assoc);
7237 if (lines_per_tag != 0)
7238 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7239 add_cache_prop(devi, label, line_str, line_size);
7240 add_cache_prop(devi, label, size_str, size * 1024);
7241 }
7242
7243 static void
7244 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7245 {
7246 struct cpuid_regs *cp;
7247
7248 if (cpi->cpi_xmaxeax < 0x80000005)
7249 return;
7250 cp = &cpi->cpi_extd[5];
7251
7252 /*
7253 * 4M/2M L1 TLB configuration
7254 *
7255 * We report the size for 2M pages because AMD uses two
7256 * TLB entries for one 4M page.
7257 */
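/*
 * For example (a hypothetical count), a part reporting 8 large-page
 * D-TLB entries here can map eight 2M pages, but only four 4M pages,
 * since each 4M page consumes two of the 2M-sized entries.
 */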
7258 add_amd_tlb(devi, "dtlb-2M",
7259 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7260 add_amd_tlb(devi, "itlb-2M",
7261 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7262
7263 /*
7264 * 4K L1 TLB configuration
7265 */
7266
7267 switch (cpi->cpi_vendor) {
7268 uint_t nentries;
7269 case X86_VENDOR_TM:
7270 if (cpi->cpi_family >= 5) {
7271 /*
7272 * Crusoe processors have 256 TLB entries, but
7273 * cpuid data format constrains them to only
7274 * reporting 255 of them.
7275 */
7276 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7277 nentries = 256;
7278 /*
7279 * Crusoe processors also have a unified TLB
7280 */
7281 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7282 nentries);
7283 break;
7284 }
7285 /*FALLTHROUGH*/
7286 default:
7287 add_amd_tlb(devi, itlb4k_str,
7288 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7289 add_amd_tlb(devi, dtlb4k_str,
7290 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7291 break;
7292 }
7293
7294 /*
7295 * data L1 cache configuration
7296 */
7297
7298 add_amd_cache(devi, l1_dcache_str,
7299 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7300 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7301
7302 /*
7303 * code L1 cache configuration
7304 */
7305
7306 add_amd_cache(devi, l1_icache_str,
7307 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7308 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7309
7310 if (cpi->cpi_xmaxeax < 0x80000006)
7311 return;
7312 cp = &cpi->cpi_extd[6];
7313
7314 /* Check for a unified L2 TLB for large pages */
7315
7316 if (BITX(cp->cp_eax, 31, 16) == 0)
7317 add_amd_l2_tlb(devi, "l2-tlb-2M",
7318 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7319 else {
7320 add_amd_l2_tlb(devi, "l2-dtlb-2M",
7321 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7322 add_amd_l2_tlb(devi, "l2-itlb-2M",
7323 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7324 }
7325
7326 /* Check for a unified L2 TLB for 4K pages */
7327
7328 if (BITX(cp->cp_ebx, 31, 16) == 0) {
7329 add_amd_l2_tlb(devi, "l2-tlb-4K",
7330 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7331 } else {
7332 add_amd_l2_tlb(devi, "l2-dtlb-4K",
7333 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7334 add_amd_l2_tlb(devi, "l2-itlb-4K",
7335 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7336 }
7337
7338 add_amd_l2_cache(devi, l2_cache_str,
7339 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7340 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7341 }
7342
7343 /*
7344 * There are two basic ways that the x86 world describes its cache
7345 * and tlb architecture - Intel's way and AMD's way.
7346 *
7347 * Return which flavor of cache architecture we should use
7348 */
7349 static int
7350 x86_which_cacheinfo(struct cpuid_info *cpi)
7351 {
7352 switch (cpi->cpi_vendor) {
7353 case X86_VENDOR_Intel:
7354 if (cpi->cpi_maxeax >= 2)
7355 return (X86_VENDOR_Intel);
7356 break;
7357 case X86_VENDOR_AMD:
7358 /*
7359 * The K5 model 1 was the first part from AMD that reported
7360 * cache sizes via extended cpuid functions.
7361 */
7362 if (cpi->cpi_family > 5 ||
7363 (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7364 return (X86_VENDOR_AMD);
7365 break;
7366 case X86_VENDOR_HYGON:
7367 return (X86_VENDOR_AMD);
7368 case X86_VENDOR_TM:
7369 if (cpi->cpi_family >= 5)
7370 return (X86_VENDOR_AMD);
7371 /*FALLTHROUGH*/
7372 default:
7373 /*
7374 * If they have extended CPU data for 0x80000005
7375 * then we assume they have AMD-format cache
7376 * information.
7377 *
7378 * If not, and the vendor happens to be Cyrix,
7379 * then try our Cyrix-specific handler.
7380 *
7381 * If we're not Cyrix, then assume we're using Intel's
7382 * table-driven format instead.
7383 */
7384 if (cpi->cpi_xmaxeax >= 0x80000005)
7385 return (X86_VENDOR_AMD);
7386 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7387 return (X86_VENDOR_Cyrix);
7388 else if (cpi->cpi_maxeax >= 2)
7389 return (X86_VENDOR_Intel);
7390 break;
7391 }
7392 return (-1);
7393 }
7394
7395 void
7396 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7397 struct cpuid_info *cpi)
7398 {
7399 dev_info_t *cpu_devi;
7400 int create;
7401
7402 cpu_devi = (dev_info_t *)dip;
7403
7404 /* device_type */
7405 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7406 "device_type", "cpu");
7407
7408 /* reg */
7409 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7410 "reg", cpu_id);
7411
7412 /* cpu-mhz, and clock-frequency */
7413 if (cpu_freq > 0) {
7414 long long mul;
7415
7416 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7417 "cpu-mhz", cpu_freq);
7418 if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7419 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7420 "clock-frequency", (int)mul);
7421 }
7422
7423 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7424
7425 /* vendor-id */
7426 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7427 "vendor-id", cpi->cpi_vendorstr);
7428
7429 if (cpi->cpi_maxeax == 0) {
7430 return;
7431 }
7432
7433 /*
7434 * family, model, and step
7435 */
7436 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7437 "family", CPI_FAMILY(cpi));
7438 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7439 "cpu-model", CPI_MODEL(cpi));
7440 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7441 "stepping-id", CPI_STEP(cpi));
7442
7443 /* type */
7444 switch (cpi->cpi_vendor) {
7445 case X86_VENDOR_Intel:
7446 create = 1;
7447 break;
7448 default:
7449 create = 0;
7450 break;
7451 }
7452 if (create)
7453 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7454 "type", CPI_TYPE(cpi));
7455
7456 /* ext-family */
7457 switch (cpi->cpi_vendor) {
7458 case X86_VENDOR_Intel:
7459 case X86_VENDOR_AMD:
7460 create = cpi->cpi_family >= 0xf;
7461 break;
7462 case X86_VENDOR_HYGON:
7463 create = 1;
7464 break;
7465 default:
7466 create = 0;
7467 break;
7468 }
7469 if (create)
7470 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7471 "ext-family", CPI_FAMILY_XTD(cpi));
7472
7473 /* ext-model */
7474 switch (cpi->cpi_vendor) {
7475 case X86_VENDOR_Intel:
7476 create = IS_EXTENDED_MODEL_INTEL(cpi);
7477 break;
7478 case X86_VENDOR_AMD:
7479 create = CPI_FAMILY(cpi) == 0xf;
7480 break;
7481 case X86_VENDOR_HYGON:
7482 create = 1;
7483 break;
7484 default:
7485 create = 0;
7486 break;
7487 }
7488 if (create)
7489 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7490 "ext-model", CPI_MODEL_XTD(cpi));
7491
7492 /* generation */
7493 switch (cpi->cpi_vendor) {
7494 case X86_VENDOR_AMD:
7495 case X86_VENDOR_HYGON:
7496 /*
7497 * AMD K5 model 1 was the first part to support this
7498 */
7499 create = cpi->cpi_xmaxeax >= 0x80000001;
7500 break;
7501 default:
7502 create = 0;
7503 break;
7504 }
7505 if (create)
7506 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7507 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7508
7509 /* brand-id */
7510 switch (cpi->cpi_vendor) {
7511 case X86_VENDOR_Intel:
7512 /*
7513 * brand id first appeared on Pentium III Xeon model 8,
7514 * and Celeron model 8 processors and Opteron
7515 */
7516 create = cpi->cpi_family > 6 ||
7517 (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7518 break;
7519 case X86_VENDOR_AMD:
7520 create = cpi->cpi_family >= 0xf;
7521 break;
7522 case X86_VENDOR_HYGON:
7523 create = 1;
7524 break;
7525 default:
7526 create = 0;
7527 break;
7528 }
7529 if (create && cpi->cpi_brandid != 0) {
7530 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7531 "brand-id", cpi->cpi_brandid);
7532 }
7533
7534 /* chunks, and apic-id */
7535 switch (cpi->cpi_vendor) {
7536 /*
7537 * first available on Pentium IV and Opteron (K8)
7538 */
7539 case X86_VENDOR_Intel:
7540 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7541 break;
7542 case X86_VENDOR_AMD:
7543 create = cpi->cpi_family >= 0xf;
7544 break;
7545 case X86_VENDOR_HYGON:
7546 create = 1;
7547 break;
7548 default:
7549 create = 0;
7550 break;
7551 }
7552 if (create) {
7553 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7554 "chunks", CPI_CHUNKS(cpi));
7555 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7556 "apic-id", cpi->cpi_apicid);
7557 if (cpi->cpi_chipid >= 0) {
7558 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7559 "chip#", cpi->cpi_chipid);
7560 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7561 "clog#", cpi->cpi_clogid);
7562 }
7563 }
7564
7565 /* cpuid-features */
7566 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7567 "cpuid-features", CPI_FEATURES_EDX(cpi));
7568
7569
7570 /* cpuid-features-ecx */
7571 switch (cpi->cpi_vendor) {
7572 case X86_VENDOR_Intel:
7573 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7574 break;
7575 case X86_VENDOR_AMD:
7576 create = cpi->cpi_family >= 0xf;
7577 break;
7578 case X86_VENDOR_HYGON:
7579 create = 1;
7580 break;
7581 default:
7582 create = 0;
7583 break;
7584 }
7585 if (create)
7586 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7587 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7588
7589 /* ext-cpuid-features */
7590 switch (cpi->cpi_vendor) {
7591 case X86_VENDOR_Intel:
7592 case X86_VENDOR_AMD:
7593 case X86_VENDOR_HYGON:
7594 case X86_VENDOR_Cyrix:
7595 case X86_VENDOR_TM:
7596 case X86_VENDOR_Centaur:
7597 create = cpi->cpi_xmaxeax >= 0x80000001;
7598 break;
7599 default:
7600 create = 0;
7601 break;
7602 }
7603 if (create) {
7604 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7605 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7606 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7607 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7608 }
7609
7610 /*
7611 * Brand String first appeared in Intel Pentium IV, AMD K5
7612 * model 1, and Cyrix GXm. On earlier models we try and
7613 * simulate something similar .. so this string should always
7614 * say -something- about the processor, however lame.
7615 */
7616 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7617 "brand-string", cpi->cpi_brandstr);
7618
7619 /*
7620 * Finally, cache and tlb information
7621 */
7622 switch (x86_which_cacheinfo(cpi)) {
7623 case X86_VENDOR_Intel:
7624 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7625 break;
7626 case X86_VENDOR_Cyrix:
7627 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7628 break;
7629 case X86_VENDOR_AMD:
7630 amd_cache_info(cpi, cpu_devi);
7631 break;
7632 default:
7633 break;
7634 }
7635 }
7636
7637 struct l2info {
7638 int *l2i_csz;
7639 int *l2i_lsz;
7640 int *l2i_assoc;
7641 int l2i_ret;
7642 };
7643
7644 /*
7645 * A cacheinfo walker that fetches the size, line-size and associativity
7646 * of the L2 cache
7647 */
7648 static int
7649 intel_l2cinfo(void *arg, const struct cachetab *ct)
7650 {
7651 struct l2info *l2i = arg;
7652 int *ip;
7653
7654 if (ct->ct_label != l2_cache_str &&
7655 ct->ct_label != sl2_cache_str)
7656 return (0); /* not an L2 -- keep walking */
7657
7658 if ((ip = l2i->l2i_csz) != NULL)
7659 *ip = ct->ct_size;
7660 if ((ip = l2i->l2i_lsz) != NULL)
7661 *ip = ct->ct_line_size;
7662 if ((ip = l2i->l2i_assoc) != NULL)
7663 *ip = ct->ct_assoc;
7664 l2i->l2i_ret = ct->ct_size;
7665 return (1); /* was an L2 -- terminate walk */
7666 }
7667
7668 /*
7669 * AMD L2/L3 Cache and TLB Associativity Field Definition:
7670 *
7671 * Unlike the associativity for the L1 cache and tlb where the 8 bit
7672 * value is the associativity, the associativity for the L2 cache and
7673 * tlb is encoded in the following table. The 4 bit L2 value serves as
7674 * an index into the amd_afd[] array to determine the associativity.
7675 * -1 is undefined. 0 is fully associative.
7676 */
7677
7678 static int amd_afd[] =
7679 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
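/*
 * For example, an encoded associativity field of 0x6 indexes amd_afd[6] and
 * therefore reports an 8-way cache or TLB, while an encoding of 0xf maps to
 * 0, the fully associative case described above.
 */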
7680
7681 static void
7682 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7683 {
7684 struct cpuid_regs *cp;
7685 uint_t size, assoc;
7686 int i;
7687 int *ip;
7688
7689 if (cpi->cpi_xmaxeax < 0x80000006)
7690 return;
7691 cp = &cpi->cpi_extd[6];
7692
7693 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7694 (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7695 uint_t cachesz = size * 1024;
7696 assoc = amd_afd[i];
7697
7698 ASSERT(assoc != -1);
7699
7700 if ((ip = l2i->l2i_csz) != NULL)
7701 *ip = cachesz;
7702 if ((ip = l2i->l2i_lsz) != NULL)
7703 *ip = BITX(cp->cp_ecx, 7, 0);
7704 if ((ip = l2i->l2i_assoc) != NULL)
7705 *ip = assoc;
7706 l2i->l2i_ret = cachesz;
7707 }
7708 }
7709
7710 int
7711 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7712 {
7713 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7714 struct l2info __l2info, *l2i = &__l2info;
7715
7716 l2i->l2i_csz = csz;
7717 l2i->l2i_lsz = lsz;
7718 l2i->l2i_assoc = assoc;
7719 l2i->l2i_ret = -1;
7720
7721 switch (x86_which_cacheinfo(cpi)) {
7722 case X86_VENDOR_Intel:
7723 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7724 break;
7725 case X86_VENDOR_Cyrix:
7726 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7727 break;
7728 case X86_VENDOR_AMD:
7729 amd_l2cacheinfo(cpi, l2i);
7730 break;
7731 default:
7732 break;
7733 }
7734 return (l2i->l2i_ret);
7735 }
7736
7737 #if !defined(__xpv)
7738
7739 uint32_t *
7740 cpuid_mwait_alloc(cpu_t *cpu)
7741 {
7742 uint32_t *ret;
7743 size_t mwait_size;
7744
7745 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7746
7747 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7748 if (mwait_size == 0)
7749 return (NULL);
7750
7751 /*
7752 * kmem_alloc() returns cache line size aligned data for mwait_size
7753 * allocations. mwait_size is currently cache line sized. Neither
7754 * of these implementation details are guaranteed to be true in the
7755 * future.
7756 *
7757 * First try allocating mwait_size as kmem_alloc() currently returns
7758 * correctly aligned memory. If kmem_alloc() does not return
7759 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7760 *
7761 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7762 * decide to free this memory.
7763 */
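/*
 * For example (sizes are hypothetical): with a 64-byte mon_max, a
 * 64-byte-aligned allocation is used as-is; if the allocation came
 * back unaligned we would instead allocate 128 bytes and round the
 * returned pointer up to the next 64-byte boundary.
 */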
7764 ret = kmem_zalloc(mwait_size, KM_SLEEP);
7765 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7766 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7767 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7768 *ret = MWAIT_RUNNING;
7769 return (ret);
7770 } else {
7771 kmem_free(ret, mwait_size);
7772 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7773 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7774 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7775 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7776 *ret = MWAIT_RUNNING;
7777 return (ret);
7778 }
7779 }
7780
7781 void
7782 cpuid_mwait_free(cpu_t *cpu)
7783 {
7784 if (cpu->cpu_m.mcpu_cpi == NULL) {
7785 return;
7786 }
7787
7788 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7789 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7790 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7791 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7792 }
7793
7794 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7795 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7796 }
7797
7798 void
7799 patch_tsc_read(int flag)
7800 {
7801 size_t cnt;
7802
7803 switch (flag) {
7804 case TSC_NONE:
7805 cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7806 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7807 break;
7808 case TSC_RDTSC_LFENCE:
7809 cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7810 (void) memcpy((void *)tsc_read,
7811 (void *)&_tsc_lfence_start, cnt);
7812 break;
7813 case TSC_TSCP:
7814 cnt = &_tscp_end - &_tscp_start;
7815 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7816 break;
7817 default:
7818 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7819 cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7820 break;
7821 }
7822 tsc_type = flag;
7823 }
7824
7825 int
7826 cpuid_deep_cstates_supported(void)
7827 {
7828 struct cpuid_info *cpi;
7829 struct cpuid_regs regs;
7830
7831 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7832 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7833
7834 cpi = CPU->cpu_m.mcpu_cpi;
7835
7836 switch (cpi->cpi_vendor) {
7837 case X86_VENDOR_Intel:
7838 case X86_VENDOR_AMD:
7839 case X86_VENDOR_HYGON:
7840 if (cpi->cpi_xmaxeax < 0x80000007)
7841 return (0);
7842
7843 /*
7844 * Does TSC run at a constant rate in all C-states?
7845 */
7846 regs.cp_eax = 0x80000007;
7847 (void) __cpuid_insn(&regs);
7848 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7849
7850 default:
7851 return (0);
7852 }
7853 }
7854
7855 #endif /* !__xpv */
7856
7857 void
7858 post_startup_cpu_fixups(void)
7859 {
7860 #ifndef __xpv
7861 /*
7862 * Some AMD processors support C1E state. Entering this state will
7863 * cause the local APIC timer to stop, which we can't deal with at
7864 * this time.
7865 */
7866 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7867 on_trap_data_t otd;
7868 uint64_t reg;
7869
7870 if (!on_trap(&otd, OT_DATA_ACCESS)) {
7871 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7872 /* Disable C1E state if it is enabled by BIOS */
7873 if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7874 AMD_ACTONCMPHALT_MASK) {
7875 reg &= ~(AMD_ACTONCMPHALT_MASK <<
7876 AMD_ACTONCMPHALT_SHIFT);
7877 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7878 }
7879 }
7880 no_trap();
7881 }
7882 #endif /* !__xpv */
7883 }
7884
7885 void
7886 enable_pcid(void)
7887 {
7888 if (x86_use_pcid == -1)
7889 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7890
7891 if (x86_use_invpcid == -1) {
7892 x86_use_invpcid = is_x86_feature(x86_featureset,
7893 X86FSET_INVPCID);
7894 }
7895
7896 if (!x86_use_pcid)
7897 return;
7898
7899 /*
7900 * Intel says that on setting PCIDE, the CPU immediately starts using the PCID
7901 * bits; better make sure there's nothing there.
7902 */
7903 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7904
7905 setcr4(getcr4() | CR4_PCIDE);
7906 }
7907
7908 /*
7909 * Setup necessary registers to enable XSAVE feature on this processor.
7910 * This function needs to be called early enough, so that no xsave/xrstor
7911 * ops will execute on the processor before the MSRs are properly set up.
7912 *
7913 * Current implementation has the following assumption:
7914 * - cpuid_pass_basic() is done, so that X86 features are known.
7915 * - fpu_probe() is done, so that fp_save_mech is chosen.
7916 */
7917 void
7918 xsave_setup_msr(cpu_t *cpu)
7919 {
7920 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7921 ASSERT(fp_save_mech == FP_XSAVE);
7922 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7923
7924 /* Enable OSXSAVE in CR4. */
7925 setcr4(getcr4() | CR4_OSXSAVE);
7926 /*
7927 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7928 * correct value.
7929 */
7930 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7931 setup_xfem();
7932 }
7933
7934 /*
7935 * Starting with the Westmere processor the local
7936 * APIC timer will continue running in all C-states,
7937 * including the deepest C-states.
7938 */
7939 int
7940 cpuid_arat_supported(void)
7941 {
7942 struct cpuid_info *cpi;
7943 struct cpuid_regs regs;
7944
7945 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7946 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7947
7948 cpi = CPU->cpu_m.mcpu_cpi;
7949
7950 switch (cpi->cpi_vendor) {
7951 case X86_VENDOR_Intel:
7952 case X86_VENDOR_AMD:
7953 case X86_VENDOR_HYGON:
7954 /*
7955 * Always-running Local APIC Timer is
7956 * indicated by CPUID.6.EAX[2].
7957 */
7958 if (cpi->cpi_maxeax >= 6) {
7959 regs.cp_eax = 6;
7960 (void) cpuid_insn(NULL, &regs);
7961 return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7962 } else {
7963 return (0);
7964 }
7965 default:
7966 return (0);
7967 }
7968 }
7969
7970 /*
7971 * Check support for Intel ENERGY_PERF_BIAS feature
7972 */
7973 int
7974 cpuid_iepb_supported(struct cpu *cp)
7975 {
7976 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7977 struct cpuid_regs regs;
7978
7979 ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7980 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7981
7982 if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7983 return (0);
7984 }
7985
7986 /*
7987 * Intel ENERGY_PERF_BIAS MSR is indicated by
7988 * capability bit CPUID.6.ECX.3
7989 */
7990 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7991 return (0);
7992
7993 regs.cp_eax = 0x6;
7994 (void) cpuid_insn(NULL, &regs);
7995 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7996 }
7997
7998 /*
7999 * Check support for TSC deadline timer
8000 *
8001 * The TSC deadline timer provides a superior software programming
8002 * model over the local APIC timer that eliminates "time drifts".
8003 * Instead of specifying a relative time, software specifies an
8004 * absolute time as the target at which the processor should
8005 * generate a timer event.
8006 */
8007 int
8008 cpuid_deadline_tsc_supported(void)
8009 {
8010 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
8011 struct cpuid_regs regs;
8012
8013 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8014 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
8015
8016 switch (cpi->cpi_vendor) {
8017 case X86_VENDOR_Intel:
8018 if (cpi->cpi_maxeax >= 1) {
8019 regs.cp_eax = 1;
8020 (void) cpuid_insn(NULL, &regs);
8021 return (regs.cp_ecx & CPUID_DEADLINE_TSC);
8022 } else {
8023 return (0);
8024 }
8025 default:
8026 return (0);
8027 }
8028 }
8029
8030 #if !defined(__xpv)
8031 /*
8032 * Patch in versions of bcopy for high performance Intel Nhm processors
8033 * and later...
8034 */
8035 void
8036 patch_memops(uint_t vendor)
8037 {
8038 size_t cnt, i;
8039 caddr_t to, from;
8040
8041 if ((vendor == X86_VENDOR_Intel) &&
8042 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
8043 cnt = &bcopy_patch_end - &bcopy_patch_start;
8044 to = &bcopy_ck_size;
8045 from = &bcopy_patch_start;
8046 for (i = 0; i < cnt; i++) {
8047 *to++ = *from++;
8048 }
8049 }
8050 }
8051 #endif /* !__xpv */
8052
8053 /*
8054 * We're being asked to tell the system how many bits are required to represent
8055 * the various thread and strand IDs. While it's tempting to derive this based
8056 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
8057 * correct. Instead, this needs to be based on the number of bits that the APIC
8058 * allows for these different configurations. We only update these to a larger
8059 * value if we find one.
8060 */
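/*
 * For example (hypothetical values), if the APIC ID layout reserves 3 bits
 * for cores and 1 bit for strands on this CPU, a caller that passed in
 * counters of 2 and 1 would see *core_nbits raised to 3 while *strand_nbits
 * is left unchanged.
 */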
8061 void
8062 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
8063 {
8064 struct cpuid_info *cpi;
8065
8066 VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8067 cpi = cpu->cpu_m.mcpu_cpi;
8068
8069 if (cpi->cpi_ncore_bits > *core_nbits) {
8070 *core_nbits = cpi->cpi_ncore_bits;
8071 }
8072
8073 if (cpi->cpi_nthread_bits > *strand_nbits) {
8074 *strand_nbits = cpi->cpi_nthread_bits;
8075 }
8076 }
8077
8078 void
8079 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
8080 {
8081 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
8082 struct cpuid_regs cp;
8083
8084 /*
8085 * Reread the CPUID portions that we need for various security
8086 * information.
8087 */
8088 switch (cpi->cpi_vendor) {
8089 case X86_VENDOR_Intel:
8090 /*
8091 * Check if we now have leaf 7 available to us.
8092 */
8093 if (cpi->cpi_maxeax < 7) {
8094 bzero(&cp, sizeof (cp));
8095 cp.cp_eax = 0;
8096 cpi->cpi_maxeax = __cpuid_insn(&cp);
8097 if (cpi->cpi_maxeax < 7)
8098 break;
8099 }
8100
8101 bzero(&cp, sizeof (cp));
8102 cp.cp_eax = 7;
8103 cp.cp_ecx = 0;
8104 (void) __cpuid_insn(&cp);
8105 cpi->cpi_std[7] = cp;
8106 break;
8107
8108 case X86_VENDOR_AMD:
8109 case X86_VENDOR_HYGON:
8110 /* No xcpuid support */
8111 if (cpi->cpi_family < 5 ||
8112 (cpi->cpi_family == 5 && cpi->cpi_model < 1))
8113 break;
8114
8115 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
8116 bzero(&cp, sizeof (cp));
8117 cp.cp_eax = CPUID_LEAF_EXT_0;
8118 cpi->cpi_xmaxeax = __cpuid_insn(&cp);
8119 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
8120 break;
8121 }
8122
8123 /*
8124 * Most AMD features are in leaf 8. Automatic IBRS was added in
8125 * leaf 0x21. So we also check that.
8126 */
8127 bzero(&cp, sizeof (cp));
8128 cp.cp_eax = CPUID_LEAF_EXT_8;
8129 (void) __cpuid_insn(&cp);
8130 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
8131 cpi->cpi_extd[8] = cp;
8132
8133 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21)
8134 break;
8135
8136 bzero(&cp, sizeof (cp));
8137 cp.cp_eax = CPUID_LEAF_EXT_21;
8138 (void) __cpuid_insn(&cp);
8139 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
8140 cpi->cpi_extd[0x21] = cp;
8141 break;
8142
8143 default:
8144 /*
8145 * Nothing to do here. Return an empty set which has already
8146 * been zeroed for us.
8147 */
8148 return;
8149 }
8150
8151 cpuid_scan_security(cpu, fset);
8152 }
8153
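/*
 * Cross-call handler used by cpuid_post_ucodeadm() below: arg0 is the base
 * of a per-CPU array of feature sets (one x86_featureset-sized slot per
 * CPU id) and arg1 is a boolean_t selecting which pass this is; the first
 * pass runs only on the boot CPU and the second only on the others.
 */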
/* ARGSUSED */
static int
cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
{
        uchar_t *fset;
        boolean_t first_pass = (boolean_t)arg1;

        fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
        if (first_pass && CPU->cpu_id != 0)
                return (0);
        if (!first_pass && CPU->cpu_id == 0)
                return (0);
        cpuid_pass_ucode(CPU, fset);

        return (0);
}

/*
 * After a microcode update in which the version has changed, we need to
 * rescan CPUID. To do this we first check every CPU to make sure that they
 * all have the same microcode revision and then perform a cross call to all
 * such CPUs. It's the caller's job to make sure that no one else can end up
 * doing an update while this is going on.
 *
 * We assume that the system is microcode capable if we're called.
 */
void
cpuid_post_ucodeadm(void)
{
        uint32_t rev;
        int i;
        struct cpu *cpu;
        cpuset_t cpuset;
        void *argdata;
        uchar_t *f0;

        argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);

        mutex_enter(&cpu_lock);
        cpu = cpu_get(0);
        rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
        CPUSET_ONLY(cpuset, 0);
        for (i = 1; i < max_ncpus; i++) {
                if ((cpu = cpu_get(i)) == NULL)
                        continue;

                if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
                        panic("post microcode update CPU %d has differing "
                            "microcode revision (%u) from CPU 0 (%u)",
                            i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
                }
                CPUSET_ADD(cpuset, i);
        }

        /*
         * We do the cross calls in two passes. The first pass is only for
         * the boot CPU. The second pass is for all of the other CPUs. This
         * allows the boot CPU to go through and change behavior related to
         * patching or whether or not Enhanced IBRS needs to be enabled and
         * then allow all other CPUs to follow suit.
         */
        kpreempt_disable();
        xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
            cpuid_post_ucodeadm_xc);
        xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
            cpuid_post_ucodeadm_xc);
        kpreempt_enable();

        /*
         * OK, now look at each CPU and make sure that its feature set
         * matches the boot CPU's.
         */
        f0 = argdata;
        for (i = 1; i < max_ncpus; i++) {
                uchar_t *fset;
                if (!CPU_IN_SET(cpuset, i))
                        continue;

                fset = (uchar_t *)((uintptr_t)argdata +
                    sizeof (x86_featureset) * i);

                if (!compare_x86_featureset(f0, fset)) {
                        panic("Post microcode update CPU %d has "
                            "differing security feature (%p) set from CPU 0 "
                            "(%p), not appending to feature set", i,
                            (void *)fset, (void *)f0);
                }
        }

        mutex_exit(&cpu_lock);

        /*
         * Fold the boot CPU's post-update feature set into the global one,
         * logging each feature that is present after the update.
         */
        for (i = 0; i < NUM_X86_FEATURES; i++) {
                if (!is_x86_feature(f0, i))
                        continue;
                cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
                    x86_feature_names[i]);
                add_x86_feature(x86_featureset, i);
        }
        kmem_free(argdata, sizeof (x86_featureset) * NCPU);
}

typedef void (*cpuid_pass_f)(cpu_t *, void *);

typedef struct cpuid_pass_def {
        cpuid_pass_t cpd_pass;
        cpuid_pass_f cpd_func;
} cpuid_pass_def_t;

/*
 * See block comment at the top; note that cpuid_pass_ucode is not a pass in
 * the normal sense and should not appear here.
 */
static const cpuid_pass_def_t cpuid_pass_defs[] = {
        { CPUID_PASS_PRELUDE, cpuid_pass_prelude },
        { CPUID_PASS_IDENT, cpuid_pass_ident },
        { CPUID_PASS_BASIC, cpuid_pass_basic },
        { CPUID_PASS_EXTENDED, cpuid_pass_extended },
        { CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
        { CPUID_PASS_RESOLVE, cpuid_pass_resolve },
};

void
cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
{
        VERIFY3S(pass, !=, CPUID_PASS_NONE);

        if (cp == NULL)
                cp = CPU;

        /*
         * Space statically allocated for BSP, ensure pointer is set
         */
        if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
                cp->cpu_m.mcpu_cpi = &cpuid_info0;

        ASSERT(cpuid_checkpass(cp, pass - 1));

        for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
                if (cpuid_pass_defs[i].cpd_pass == pass) {
                        cpuid_pass_defs[i].cpd_func(cp, arg);
                        cp->cpu_m.mcpu_cpi->cpi_pass = pass;
                        return;
                }
        }

        panic("unable to execute invalid cpuid pass %d on cpu%d\n",
            pass, cp->cpu_id);
}
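
/*
 * Illustrative sketch only (the real call sites are in the boot and CPU
 * startup paths, not here): passes are run in order on a given CPU, e.g.
 *
 *      cpuid_execpass(cp, CPUID_PASS_IDENT, NULL);
 *      cpuid_execpass(cp, CPUID_PASS_BASIC, featureset);
 *
 * where "featureset" stands for whatever argument the named pass expects;
 * the ASSERT above enforces that the preceding pass has already completed.
 */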

/*
 * Extract the processor family from a chiprev. Processor families are not
 * the same as cpuid families; see comments above and in x86_archext.h.
 */
x86_processor_family_t
chiprev_family(const x86_chiprev_t cr)
{
        return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
}

/*
 * A chiprev matches its template if the vendor and family are identical and
 * the revision of the chiprev matches one of the bits set in the template.
 * Callers may bitwise-OR together chiprevs of the same vendor and family to
 * form the template, or use the _ANY variant. It is not possible to match
 * chiprevs of multiple vendors or processor families with a single call.
 * Note that this function operates on processor families, not cpuid
 * families.
 */
boolean_t
chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
{
        return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
            _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
            (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
}

/*
 * A chiprev is at least min if the vendor and family are identical and the
 * revision of the chiprev is at least as recent as that of min. Processor
 * families are considered unordered and cannot be compared using this
 * function. Note that this function operates on processor families, not
 * cpuid families. Use of the _ANY chiprev variant with this function is not
 * useful; it will always return B_FALSE if the _ANY variant is supplied as
 * the minimum revision. To determine only whether a chiprev is of a given
 * processor family, test the return value of chiprev_family() instead.
 */
boolean_t
chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
{
        return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
            _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
            _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
}
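
/*
 * Illustrative sketch only: a caller gating a workaround on a particular
 * revision range might write something like
 *
 *      if (chiprev_matches(cpi->cpi_chiprev,
 *          X86_CHIPREV_AMD_MILAN_B0 | X86_CHIPREV_AMD_MILAN_B1))
 *              apply_workaround();
 *
 *      if (chiprev_at_least(cpi->cpi_chiprev, X86_CHIPREV_AMD_MILAN_B1))
 *              enable_fixed_path();
 *
 * The X86_CHIPREV_* constants named above, and the two helper functions
 * they gate, are assumptions for illustration; see x86_archext.h for the
 * definitions that actually exist.
 */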

/*
 * The uarch functions operate in a manner similar to the chiprev functions
 * above. While it is tempting to allow these to operate on
 * microarchitectures produced by a specific vendor in an ordered fashion
 * (e.g., ZEN3 is "newer" than ZEN2), we elect not to do so because a
 * manufacturer may supply processors of multiple different
 * microarchitecture families, each of which may be internally ordered but
 * unordered with respect to those of other families.
 */
x86_uarch_t
uarchrev_uarch(const x86_uarchrev_t ur)
{
        return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
}

boolean_t
uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
{
        return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
            _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
            (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
}

boolean_t
uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
{
        return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
            _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
            _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
}
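
/*
 * Illustrative sketch only: a caller that only cares about the
 * microarchitecture family, not the stepping, might write
 *
 *      if (uarchrev_uarch(cpi->cpi_uarchrev) == X86_UARCH_AMD_ZEN3)
 *              ...
 *
 * The cpi_uarchrev field and the X86_UARCH_AMD_ZEN3 constant are named on
 * the assumption that they match the definitions in x86_archext.h.
 */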

/*
 * Topology cache related information. This is yet another cache interface
 * that we're exposing, intended to be used when we have either Intel Leaf 4
 * or AMD extended leaf 0x1d (0x8000001d, introduced with Zen 1).
 */
static boolean_t
cpuid_cache_topo_sup(const struct cpuid_info *cpi)
{
        switch (cpi->cpi_vendor) {
        case X86_VENDOR_Intel:
                if (cpi->cpi_maxeax >= 4) {
                        return (B_TRUE);
                }
                break;
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
                if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
                    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
                        return (B_TRUE);
                }
                break;
        default:
                break;
        }

        return (B_FALSE);
}

int
cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
{
        const struct cpuid_info *cpi;

        ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
        cpi = cpu->cpu_m.mcpu_cpi;

        if (!cpuid_cache_topo_sup(cpi)) {
                return (ENOTSUP);
        }

        *ncache = cpi->cpi_cache_leaf_size;
        return (0);
}
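
/*
 * Illustrative sketch only (assumed consumer, not code in this file): the
 * function above and cpuid_getcache() below are meant to be used together
 * to walk a CPU's cache hierarchy, e.g.
 *
 *      uint32_t ncache, i;
 *      x86_cache_t xc;
 *
 *      if (cpuid_getncaches(cpu, &ncache) == 0) {
 *              for (i = 0; i < ncache; i++) {
 *                      if (cpuid_getcache(cpu, i, &xc) == 0)
 *                              ... use xc.xc_level, xc.xc_size, etc. ...
 *              }
 *      }
 */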

int
cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
{
        const struct cpuid_info *cpi;
        const struct cpuid_regs *cp;

        ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
        cpi = cpu->cpu_m.mcpu_cpi;

        if (!cpuid_cache_topo_sup(cpi)) {
                return (ENOTSUP);
        }

        if (cno >= cpi->cpi_cache_leaf_size) {
                return (EINVAL);
        }

        bzero(cache, sizeof (x86_cache_t));
        cp = cpi->cpi_cache_leaves[cno];
        switch (CPI_CACHE_TYPE(cp)) {
        case CPI_CACHE_TYPE_DATA:
                cache->xc_type = X86_CACHE_TYPE_DATA;
                break;
        case CPI_CACHE_TYPE_INSTR:
                cache->xc_type = X86_CACHE_TYPE_INST;
                break;
        case CPI_CACHE_TYPE_UNIFIED:
                cache->xc_type = X86_CACHE_TYPE_UNIFIED;
                break;
        case CPI_CACHE_TYPE_DONE:
        default:
                return (EINVAL);
        }
        cache->xc_level = CPI_CACHE_LVL(cp);
        if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
                cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
        }
        cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
        /*
         * The number of sets is reserved on AMD if the CPU is tagged as
         * fully associative, whereas it is considered valid on Intel.
         */
        if (cpi->cpi_vendor == X86_VENDOR_AMD &&
            CPI_FULL_ASSOC_CACHE(cp) != 0) {
                cache->xc_nsets = 1;
        } else {
                cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
        }
        cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
        cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
        cache->xc_size = cache->xc_nparts * cache->xc_nsets *
            cache->xc_nways * cache->xc_line_size;
        /*
         * We're looking for the number of bits needed to cover the number
         * of CPUs that share this cache. Normally we would take the shared
         * count minus one before computing the bit width; however, the
         * CPUID value is already encoded as the actual count minus one, so
         * we can use it directly.
         */
        cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));

        /*
         * To construct a unique ID we construct a uint64_t that looks as
         * follows:
         *
         *      [47:40] cache level
         *      [39:32] CPUID cache type
         *      [31:00] shifted APIC ID
         *
         * The shifted APIC ID gives us a guarantee that a given cache entry
         * is unique among its peers. The other two fields ensure that the
         * entry is unique within the CPU itself: if we had only the APIC ID
         * shifted over by the indicated number of bits, we'd end up with an
         * ID of zero for the L1I, L1D, L2, and L3.
         *
         * The format of this ID is private to the system and can change
         * across a reboot for the time being.
         */
        cache->xc_id = (uint64_t)cache->xc_level << 40;
        cache->xc_id |= (uint64_t)cache->xc_type << 32;
        cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;

        return (0);
}
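
/*
 * Worked example (illustrative values, not from any particular CPU): a
 * 32 KiB, 8-way L1 data cache with 64-byte lines and a single partition is
 * reported by CPUID as sets - 1 = 63, ways - 1 = 7, line size - 1 = 63, so
 * the code above computes
 *
 *      xc_size = 1 * 64 * 8 * 64 = 32768 bytes
 *
 * If that cache is shared by two logical CPUs, CPUID encodes the share
 * count minus one (1), highbit(1) is 1, and so xc_apic_shift is 1. For a
 * CPU with APIC ID 5, the resulting ID is
 *
 *      xc_id = (1 << 40) | (1 << 32) | (5 >> 1)
 *
 * assuming a level of 1 and a data-cache type value of 1 purely for the
 * purposes of this example.
 */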