1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net> 26 * Copyright 2020 Joyent, Inc. 27 * Copyright 2022 Oxide Computer Company 28 * Copyright 2022 MNX Cloud, Inc. 29 */ 30 /* 31 * Copyright (c) 2010, Intel Corporation. 32 * All rights reserved. 33 */ 34 /* 35 * Portions Copyright 2009 Advanced Micro Devices, Inc. 36 */ 37 38 /* 39 * CPU Identification logic 40 * 41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal 42 * with the identification of CPUs, their features, and their topologies. More 43 * specifically, this file helps drive the following: 44 * 45 * 1. Enumeration of features of the processor which are used by the kernel to 46 * determine what features to enable or disable. These may be instruction set 47 * enhancements or features that we use. 48 * 49 * 2. Enumeration of instruction set architecture (ISA) additions that userland 50 * will be told about through the auxiliary vector. 51 * 52 * 3. Understanding the physical topology of the CPU such as the number of 53 * caches, how many cores it has, whether or not it supports symmetric 54 * multi-processing (SMT), etc. 55 * 56 * ------------------------ 57 * CPUID History and Basics 58 * ------------------------ 59 * 60 * The cpuid instruction was added by Intel roughly around the time that the 61 * original Pentium was introduced. The purpose of cpuid was to tell in a 62 * programmatic fashion information about the CPU that previously was guessed 63 * at. For example, an important part of cpuid is that we can know what 64 * extensions to the ISA exist. If you use an invalid opcode you would get a 65 * #UD, so this method allows a program (whether a user program or the kernel) 66 * to determine what exists without crashing or getting a SIGILL. Of course, 67 * this was also during the era of the clones and the AMD Am5x86. The vendor 68 * name shows up first in cpuid for a reason. 69 * 70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts 71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has 72 * its own meaning. The different leaves are broken down into different regions: 73 * 74 * [ 0, 7fffffff ] This region is called the 'basic' 75 * region. This region is generally defined 76 * by Intel, though some of the original 77 * portions have different meanings based 78 * on the manufacturer. These days, Intel 79 * adds most new features to this region. 80 * AMD adds non-Intel compatible 81 * information in the third, extended 82 * region. Intel uses this for everything 83 * including ISA extensions, CPU 84 * features, cache information, topology, 85 * and more. 86 * 87 * There is a hole carved out of this 88 * region which is reserved for 89 * hypervisors. 90 * 91 * [ 40000000, 4fffffff ] This region, which is found in the 92 * middle of the previous region, is 93 * explicitly promised to never be used by 94 * CPUs. Instead, it is used by hypervisors 95 * to communicate information about 96 * themselves to the operating system. The 97 * values and details are unique for each 98 * hypervisor. 99 * 100 * [ 80000000, ffffffff ] This region is called the 'extended' 101 * region. Some of the low leaves mirror 102 * parts of the basic leaves. This region 103 * has generally been used by AMD for 104 * various extensions. For example, AMD- 105 * specific information about caches, 106 * features, and topology are found in this 107 * region. 108 * 109 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx, 110 * and %edx, and then issue the cpuid instruction. At the first leaf in each of 111 * the ranges, one of the primary things returned is the maximum valid leaf in 112 * that range. This allows for discovery of what range of CPUID is valid. 113 * 114 * The CPUs have potentially surprising behavior when using an invalid leaf or 115 * unimplemented leaf. If the requested leaf is within the valid basic or 116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be 117 * set to zero. However, if you specify a leaf that is outside of a valid range, 118 * then instead it will be filled with the last valid _basic_ leaf. For example, 119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or 120 * an invalid extended leaf will return the information for leaf 3. 121 * 122 * Some leaves are broken down into sub-leaves. This means that the value 123 * depends on both the leaf asked for in %eax and a secondary register. For 124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get 125 * additional information. Or when getting topology information in leaf 0xb, the 126 * initial value in %ecx changes which level of the topology that you are 127 * getting information about. 128 * 129 * cpuid values are always kept to 32 bits regardless of whether or not the 130 * program is in 64-bit mode. When executing in 64-bit mode, the upper 131 * 32 bits of the register are always set to zero so that way the values are the 132 * same regardless of execution mode. 133 * 134 * ---------------------- 135 * Identifying Processors 136 * ---------------------- 137 * 138 * We can identify a processor in two steps. The first step looks at cpuid leaf 139 * 0. Leaf 0 contains the processor's vendor information. This is done by 140 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is 141 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'. 142 * 143 * From there, a processor is identified by a combination of three different 144 * values: 145 * 146 * 1. Family 147 * 2. Model 148 * 3. Stepping 149 * 150 * Each vendor uses the family and model to uniquely identify a processor. The 151 * way that family and model are changed depends on the vendor. For example, 152 * Intel has been using family 0x6 for almost all of their processor since the 153 * Pentium Pro/Pentium II era, often called the P6. The model is used to 154 * identify the exact processor. Different models are often used for the client 155 * (consumer) and server parts. Even though each processor often has major 156 * architectural differences, they still are considered the same family by 157 * Intel. 158 * 159 * On the other hand, each major AMD architecture generally has its own family. 160 * For example, the K8 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it 161 * the model number is used to help identify specific processors. 162 * 163 * The stepping is used to refer to a revision of a specific microprocessor. The 164 * term comes from equipment used to produce masks that are used to create 165 * integrated circuits. 166 * 167 * The information is present in leaf 1, %eax. In technical documentation you 168 * will see the terms extended model and extended family. The original family, 169 * model, and stepping fields were each 4 bits wide. If the values in either 170 * are 0xf, then one is to consult the extended model and extended family, which 171 * take previously reserved bits and allow for a larger number of models and add 172 * 0xf to them. 173 * 174 * When we process this information, we store the full family, model, and 175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and 176 * cpi_step, respectively. Whenever you are performing comparisons with the 177 * family, model, and stepping, you should use these members and not the raw 178 * values from cpuid. If you must use the raw values from cpuid directly, you 179 * must make sure that you add the extended model and family to the base model 180 * and family. 181 * 182 * In general, we do not use information about the family, model, and stepping 183 * to determine whether or not a feature is present; that is generally driven by 184 * specific leaves. However, when something we care about on the processor is 185 * not considered 'architectural' meaning that it is specific to a set of 186 * processors and not promised in the architecture model to be consistent from 187 * generation to generation, then we will fall back on this information. The 188 * most common cases where this comes up is when we have to workaround errata in 189 * the processor, are dealing with processor-specific features such as CPU 190 * performance counters, or we want to provide additional information for things 191 * such as fault management. 192 * 193 * While processors also do have a brand string, which is the name that people 194 * are familiar with when buying the processor, they are not meant for 195 * programmatic consumption. That is what the family, model, and stepping are 196 * for. 197 * 198 * ------------ 199 * CPUID Passes 200 * ------------ 201 * 202 * As part of performing feature detection, we break this into several different 203 * passes. There used to be a pass 0 that was done from assembly in locore.s to 204 * support processors that have a missing or broken cpuid instruction (notably 205 * certain Cyrix processors) but those were all 32-bit processors which are no 206 * longer supported. Passes are no longer numbered explicitly to make it easier 207 * to break them up or move them around as needed; however, they still have a 208 * well-defined execution ordering enforced by the definition of cpuid_pass_t in 209 * x86_archext.h. The external interface to execute a cpuid pass or determine 210 * whether a pass has been completed consists of cpuid_execpass() and 211 * cpuid_checkpass() respectively. The passes now, in that execution order, 212 * are as follows: 213 * 214 * PRELUDE This pass does not have any dependencies on system 215 * setup; in particular, unlike all subsequent passes it is 216 * guaranteed not to require PCI config space access. It 217 * sets the flag indicating that the processor we are 218 * running on supports the cpuid instruction, which all 219 * 64-bit processors do. This would also be the place to 220 * add any other basic state that is required later on and 221 * can be learned without dependencies. 222 * 223 * IDENT Determine which vendor manufactured the CPU, the family, 224 * model, and stepping information, and compute basic 225 * identifying tags from those values. This is done first 226 * so that machine-dependent code can control the features 227 * the cpuid instruction will report during subsequent 228 * passes if needed, and so that any intervening 229 * machine-dependent code that needs basic identity will 230 * have it available. 231 * 232 * BASIC This is the primary pass and is responsible for doing a 233 * large number of different things: 234 * 235 * 1. Gathering a large number of feature flags to 236 * determine which features the CPU support and which 237 * indicate things that we need to do other work in the OS 238 * to enable. Features detected this way are added to the 239 * x86_featureset which can be queried to 240 * determine what we should do. This includes processing 241 * all of the basic and extended CPU features that we care 242 * about. 243 * 244 * 2. Determining the CPU's topology. This includes 245 * information about how many cores and threads are present 246 * in the package. It also is responsible for figuring out 247 * which logical CPUs are potentially part of the same core 248 * and what other resources they might share. For more 249 * information see the 'Topology' section. 250 * 251 * 3. Determining the set of CPU security-specific features 252 * that we need to worry about and determine the 253 * appropriate set of workarounds. 254 * 255 * Pass 1 on the boot CPU occurs before KMDB is started. 256 * 257 * EXTENDED The second pass is done after startup(). Here, we check 258 * other miscellaneous features. Most of this is gathering 259 * additional basic and extended features that we'll use in 260 * later passes or for debugging support. 261 * 262 * DYNAMIC The third pass occurs after the kernel memory allocator 263 * has been fully initialized. This gathers information 264 * where we might need dynamic memory available for our 265 * uses. This includes several varying width leaves that 266 * have cache information and the processor's brand string. 267 * 268 * RESOLVE The fourth and final normal pass is performed after the 269 * kernel has brought most everything online. This is 270 * invoked from post_startup(). In this pass, we go through 271 * the set of features that we have enabled and turn that 272 * into the hardware auxiliary vector features that 273 * userland receives. This is used by userland, primarily 274 * by the run-time link-editor (RTLD), though userland 275 * software could also refer to it directly. 276 * 277 * The function that performs a pass is currently assumed to be infallible, and 278 * all existing implementation are. This simplifies callers by allowing 279 * cpuid_execpass() to return void. Similarly, implementers do not need to check 280 * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary. 281 * Both of these assumptions can be relaxed if needed by future developments. 282 * Tracking of completed states is handled by cpuid_execpass(). It is programmer 283 * error to attempt to execute a pass before all previous passes have been 284 * completed on the specified CPU, or to request cpuid information before the 285 * pass that captures it has been executed. These conditions can be tested 286 * using cpuid_checkpass(). 287 * 288 * The Microcode Pass 289 * 290 * After a microcode update, we do a selective rescan of the cpuid leaves to 291 * determine what features have changed. Microcode updates can provide more 292 * details about security related features to deal with issues like Spectre and 293 * L1TF. On occasion, vendors have violated their contract and removed bits. 294 * However, we don't try to detect that because that puts us in a situation that 295 * we really can't deal with. As such, the only thing we rescan are security 296 * related features today. See cpuid_pass_ucode(). This pass may be run in a 297 * different sequence on APs and therefore is not part of the sequential order; 298 * It is invoked directly instead of by cpuid_execpass() and its completion 299 * status cannot be checked by cpuid_checkpass(). This could be integrated with 300 * a more complex dependency mechanism if warranted by future developments. 301 * 302 * All of the passes are run on all CPUs. However, for the most part we only 303 * care about what the boot CPU says about this information and use the other 304 * CPUs as a rough guide to sanity check that we have the same feature set. 305 * 306 * We do not support running multiple logical CPUs with disjoint, let alone 307 * different, feature sets. 308 * 309 * ------------------ 310 * Processor Topology 311 * ------------------ 312 * 313 * One of the important things that we need to do is to understand the topology 314 * of the underlying processor. When we say topology in this case, we're trying 315 * to understand the relationship between the logical CPUs that the operating 316 * system sees and the underlying physical layout. Different logical CPUs may 317 * share different resources which can have important consequences for the 318 * performance of the system. For example, they may share caches, execution 319 * units, and more. 320 * 321 * The topology of the processor changes from generation to generation and 322 * vendor to vendor. Along with that, different vendors use different 323 * terminology, and the operating system itself uses occasionally overlapping 324 * terminology. It's important to understand what this topology looks like so 325 * one can understand the different things that we try to calculate and 326 * determine. 327 * 328 * To get started, let's talk about a little bit of terminology that we've used 329 * so far, is used throughout this file, and is fairly generic across multiple 330 * vendors: 331 * 332 * CPU 333 * A central processing unit (CPU) refers to a logical and/or virtual 334 * entity that the operating system can execute instructions on. The 335 * underlying resources for this CPU may be shared between multiple 336 * entities; however, to the operating system it is a discrete unit. 337 * 338 * PROCESSOR and PACKAGE 339 * 340 * Generally, when we use the term 'processor' on its own, we are referring 341 * to the physical entity that one buys and plugs into a board. However, 342 * because processor has been overloaded and one might see it used to mean 343 * multiple different levels, we will instead use the term 'package' for 344 * the rest of this file. The term package comes from the electrical 345 * engineering side and refers to the physical entity that encloses the 346 * electronics inside. Strictly speaking the package can contain more than 347 * just the CPU, for example, on many processors it may also have what's 348 * called an 'integrated graphical processing unit (GPU)'. Because the 349 * package can encapsulate multiple units, it is the largest physical unit 350 * that we refer to. 351 * 352 * SOCKET 353 * 354 * A socket refers to unit on a system board (generally the motherboard) 355 * that can receive a package. A single package, or processor, is plugged 356 * into a single socket. A system may have multiple sockets. Often times, 357 * the term socket is used interchangeably with package and refers to the 358 * electrical component that has plugged in, and not the receptacle itself. 359 * 360 * CORE 361 * 362 * A core refers to the physical instantiation of a CPU, generally, with a 363 * full set of hardware resources available to it. A package may contain 364 * multiple cores inside of it or it may just have a single one. A 365 * processor with more than one core is often referred to as 'multi-core'. 366 * In illumos, we will use the feature X86FSET_CMP to refer to a system 367 * that has 'multi-core' processors. 368 * 369 * A core may expose a single logical CPU to the operating system, or it 370 * may expose multiple CPUs, which we call threads, defined below. 371 * 372 * Some resources may still be shared by cores in the same package. For 373 * example, many processors will share the level 3 cache between cores. 374 * Some AMD generations share hardware resources between cores. For more 375 * information on that see the section 'AMD Topology'. 376 * 377 * THREAD and STRAND 378 * 379 * In this file, generally a thread refers to a hardware resources and not 380 * the operating system's logical abstraction. A thread is always exposed 381 * as an independent logical CPU to the operating system. A thread belongs 382 * to a specific core. A core may have more than one thread. When that is 383 * the case, the threads that are part of the same core are often referred 384 * to as 'siblings'. 385 * 386 * When multiple threads exist, this is generally referred to as 387 * simultaneous multi-threading (SMT). When Intel introduced this in their 388 * processors they called it hyper-threading (HT). When multiple threads 389 * are active in a core, they split the resources of the core. For example, 390 * two threads may share the same set of hardware execution units. 391 * 392 * The operating system often uses the term 'strand' to refer to a thread. 393 * This helps disambiguate it from the software concept. 394 * 395 * CHIP 396 * 397 * Unfortunately, the term 'chip' is dramatically overloaded. At its most 398 * base meaning, it is used to refer to a single integrated circuit, which 399 * may or may not be the only thing in the package. In illumos, when you 400 * see the term 'chip' it is almost always referring to the same thing as 401 * the 'package'. However, many vendors may use chip to refer to one of 402 * many integrated circuits that have been placed in the package. As an 403 * example, see the subsequent definition. 404 * 405 * To try and keep things consistent, we will only use chip when referring 406 * to the entire integrated circuit package, with the exception of the 407 * definition of multi-chip module (because it is in the name) and use the 408 * term 'die' when we want the more general, potential sub-component 409 * definition. 410 * 411 * DIE 412 * 413 * A die refers to an integrated circuit. Inside of the package there may 414 * be a single die or multiple dies. This is sometimes called a 'chip' in 415 * vendor's parlance, but in this file, we use the term die to refer to a 416 * subcomponent. 417 * 418 * MULTI-CHIP MODULE 419 * 420 * A multi-chip module (MCM) refers to putting multiple distinct chips that 421 * are connected together in the same package. When a multi-chip design is 422 * used, generally each chip is manufactured independently and then joined 423 * together in the package. For example, on AMD's Zen microarchitecture 424 * (family 0x17), the package contains several dies (the second meaning of 425 * chip from above) that are connected together. 426 * 427 * CACHE 428 * 429 * A cache is a part of the processor that maintains copies of recently 430 * accessed memory. Caches are split into levels and then into types. 431 * Commonly there are one to three levels, called level one, two, and 432 * three. The lower the level, the smaller it is, the closer it is to the 433 * execution units of the CPU, and the faster it is to access. The layout 434 * and design of the cache come in many different flavors, consult other 435 * resources for a discussion of those. 436 * 437 * Caches are generally split into two types, the instruction and data 438 * cache. The caches contain what their names suggest, the instruction 439 * cache has executable program text, while the data cache has all other 440 * memory that the processor accesses. As of this writing, data is kept 441 * coherent between all of the caches on x86, so if one modifies program 442 * text before it is executed, that will be in the data cache, and the 443 * instruction cache will be synchronized with that change when the 444 * processor actually executes those instructions. This coherency also 445 * covers the fact that data could show up in multiple caches. 446 * 447 * Generally, the lowest level caches are specific to a core. However, the 448 * last layer cache is shared between some number of cores. The number of 449 * CPUs sharing this last level cache is important. This has implications 450 * for the choices that the scheduler makes, as accessing memory that might 451 * be in a remote cache after thread migration can be quite expensive. 452 * 453 * Sometimes, the word cache is abbreviated with a '$', because in US 454 * English the word cache is pronounced the same as cash. So L1D$ refers to 455 * the L1 data cache, and L2$ would be the L2 cache. This will not be used 456 * in the rest of this theory statement for clarity. 457 * 458 * MEMORY CONTROLLER 459 * 460 * The memory controller is a component that provides access to DRAM. Each 461 * memory controller can access a set number of DRAM channels. Each channel 462 * can have a number of DIMMs (sticks of memory) associated with it. A 463 * given package may have more than one memory controller. The association 464 * of the memory controller to a group of cores is important as it is 465 * cheaper to access memory on the controller that you are associated with. 466 * 467 * NUMA 468 * 469 * NUMA or non-uniform memory access, describes a way that systems are 470 * built. On x86, any processor core can address all of the memory in the 471 * system. However, When using multiple sockets or possibly within a 472 * multi-chip module, some of that memory is physically closer and some of 473 * it is further. Memory that is further away is more expensive to access. 474 * Consider the following image of multiple sockets with memory: 475 * 476 * +--------+ +--------+ 477 * | DIMM A | +----------+ +----------+ | DIMM D | 478 * +--------+-+ | | | | +-+------+-+ 479 * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E | 480 * +--------+-+ | | | | +-+------+-+ 481 * | DIMM C | +----------+ +----------+ | DIMM F | 482 * +--------+ +--------+ 483 * 484 * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is 485 * closer to DIMMs D-F. This means that it is cheaper for socket 0 to 486 * access DIMMs A-C and more expensive to access D-F as it has to go 487 * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs 488 * D-F are cheaper than A-C. While the socket form is the most common, when 489 * using multi-chip modules, this can also sometimes occur. For another 490 * example of this that's more involved, see the AMD topology section. 491 * 492 * 493 * Intel Topology 494 * -------------- 495 * 496 * Most Intel processors since Nehalem, (as of this writing the current gen 497 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of 498 * the package is a single monolithic die. MCMs currently aren't used. Most 499 * parts have three levels of caches, with the L3 cache being shared between 500 * all of the cores on the package. The L1/L2 cache is generally specific to 501 * an individual core. The following image shows at a simplified level what 502 * this looks like. The memory controller is commonly part of something called 503 * the 'Uncore', that used to be separate physical chips that were not a part of 504 * the package, but are now part of the same chip. 505 * 506 * +-----------------------------------------------------------------------+ 507 * | Package | 508 * | +-------------------+ +-------------------+ +-------------------+ | 509 * | | Core | | Core | | Core | | 510 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 511 * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | | 512 * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | | 513 * | | +--------+ | | | | +--------+ | | | | +--------+ | | | | 514 * | | | Thread | | | | | | Thread | | | | | | Thread | | | | | 515 * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | 516 * | | +--------------+ | | +--------------+ | | +--------------+ | | 517 * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | | 518 * | | +--------------+ | | +--------------+ | | +--------------+ | | 519 * | +-------------------+ +-------------------+ +-------------------+ | 520 * | +-------------------------------------------------------------------+ | 521 * | | Shared L3 Cache | | 522 * | +-------------------------------------------------------------------+ | 523 * | +-------------------------------------------------------------------+ | 524 * | | Memory Controller | | 525 * | +-------------------------------------------------------------------+ | 526 * +-----------------------------------------------------------------------+ 527 * 528 * A side effect of this current architecture is that what we care about from a 529 * scheduling and topology perspective, is simplified. In general we care about 530 * understanding which logical CPUs are part of the same core and socket. 531 * 532 * To determine the relationship between threads and cores, Intel initially used 533 * the identifier in the advanced programmable interrupt controller (APIC). They 534 * also added cpuid leaf 4 to give additional information about the number of 535 * threads and CPUs in the processor. With the addition of x2apic (which 536 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an 537 * additional cpuid topology leaf 0xB was added. 538 * 539 * AMD Topology 540 * ------------ 541 * 542 * When discussing AMD topology, we want to break this into three distinct 543 * generations of topology. There's the basic topology that has been used in 544 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced 545 * with family 0x15 (Bulldozer), and there's the topology that was introduced 546 * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family 547 * 0x17), and tweaked slightly in Zen 3 (family 19h). AMD also has some 548 * additional terminology that's worth talking about. 549 * 550 * Until the introduction of family 0x17 (Zen), AMD did not implement something 551 * that they considered SMT. Whether or not the AMD processors have SMT 552 * influences many things including scheduling and reliability, availability, 553 * and serviceability (RAS) features. 554 * 555 * NODE 556 * 557 * AMD uses the term node to refer to a die that contains a number of cores 558 * and I/O resources. Depending on the processor family and model, more 559 * than one node can be present in the package. When there is more than one 560 * node this indicates a multi-chip module. Usually each node has its own 561 * access to memory and I/O devices. This is important and generally 562 * different from the corresponding Intel Nehalem-Skylake+ processors. As a 563 * result, we track this relationship in the operating system. 564 * 565 * In processors with an L3 cache, the L3 cache is generally shared across 566 * the entire node, though the way this is carved up varies from generation 567 * to generation. 568 * 569 * BULLDOZER 570 * 571 * Starting with the Bulldozer family (0x15) and continuing until the 572 * introduction of the Zen microarchitecture, AMD introduced the idea of a 573 * compute unit. In a compute unit, two traditional cores share a number of 574 * hardware resources. Critically, they share the FPU, L1 instruction 575 * cache, and the L2 cache. Several compute units were then combined inside 576 * of a single node. Because the integer execution units, L1 data cache, 577 * and some other resources were not shared between the cores, AMD never 578 * considered this to be SMT. 579 * 580 * ZEN 581 * 582 * The Zen family (0x17) uses a multi-chip module (MCM) design, the module 583 * is called Zeppelin. These modules are similar to the idea of nodes used 584 * previously. Each of these nodes has two DRAM channels which all of the 585 * cores in the node can access uniformly. These nodes are linked together 586 * in the package, creating a NUMA environment. 587 * 588 * The Zeppelin die itself contains two different 'core complexes'. Each 589 * core complex consists of four cores which each have two threads, for a 590 * total of 8 logical CPUs per complex. Unlike other generations, 591 * where all the logical CPUs in a given node share the L3 cache, here each 592 * core complex has its own shared L3 cache. 593 * 594 * A further thing that we need to consider is that in some configurations, 595 * particularly with the Threadripper line of processors, not every die 596 * actually has its memory controllers wired up to actual memory channels. 597 * This means that some cores have memory attached to them and others 598 * don't. 599 * 600 * To put Zen in perspective, consider the following images: 601 * 602 * +--------------------------------------------------------+ 603 * | Core Complex | 604 * | +-------------------+ +-------------------+ +---+ | 605 * | | Core +----+ | | Core +----+ | | | | 606 * | | +--------+ | L2 | | | +--------+ | L2 | | | | | 607 * | | | Thread | +----+ | | | Thread | +----+ | | | | 608 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | | 609 * | | | Thread | |L1| | | | Thread | |L1| | | 3 | | 610 * | | +--------+ +--+ | | +--------+ +--+ | | | | 611 * | +-------------------+ +-------------------+ | C | | 612 * | +-------------------+ +-------------------+ | a | | 613 * | | Core +----+ | | Core +----+ | | c | | 614 * | | +--------+ | L2 | | | +--------+ | L2 | | | h | | 615 * | | | Thread | +----+ | | | Thread | +----+ | | e | | 616 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | | 617 * | | | Thread | |L1| | | | Thread | |L1| | | | | 618 * | | +--------+ +--+ | | +--------+ +--+ | | | | 619 * | +-------------------+ +-------------------+ +---+ | 620 * | | 621 * +--------------------------------------------------------+ 622 * 623 * This first image represents a single Zen core complex that consists of four 624 * cores. 625 * 626 * 627 * +--------------------------------------------------------+ 628 * | Zeppelin Die | 629 * | +--------------------------------------------------+ | 630 * | | I/O Units (PCIe, SATA, USB, etc.) | | 631 * | +--------------------------------------------------+ | 632 * | HH | 633 * | +-----------+ HH +-----------+ | 634 * | | | HH | | | 635 * | | Core |==========| Core | | 636 * | | Complex |==========| Complex | | 637 * | | | HH | | | 638 * | +-----------+ HH +-----------+ | 639 * | HH | 640 * | +--------------------------------------------------+ | 641 * | | Memory Controller | | 642 * | +--------------------------------------------------+ | 643 * | | 644 * +--------------------------------------------------------+ 645 * 646 * This image represents a single Zeppelin Die. Note how both cores are 647 * connected to the same memory controller and I/O units. While each core 648 * complex has its own L3 cache as seen in the first image, they both have 649 * uniform access to memory. 650 * 651 * 652 * PP PP 653 * PP PP 654 * +----------PP---------------------PP---------+ 655 * | PP PP | 656 * | +-----------+ +-----------+ | 657 * | | | | | | 658 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 659 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 660 * | | | | | | 661 * | +-----------+ooo ...+-----------+ | 662 * | HH ooo ... HH | 663 * | HH oo.. HH | 664 * | HH ..oo HH | 665 * | HH ... ooo HH | 666 * | +-----------+... ooo+-----------+ | 667 * | | | | | | 668 * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM 669 * MMMMMMMMM| Die |==========| Die |MMMMMMMMM 670 * | | | | | | 671 * | +-----------+ +-----------+ | 672 * | PP PP | 673 * +----------PP---------------------PP---------+ 674 * PP PP 675 * PP PP 676 * 677 * This image represents a single Zen package. In this example, it has four 678 * Zeppelin dies, though some configurations only have a single one. In this 679 * example, each die is directly connected to the next. Also, each die is 680 * represented as being connected to memory by the 'M' character and connected 681 * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin 682 * die is made up of two core complexes, we have multiple different NUMA 683 * domains that we care about for these systems. 684 * 685 * ZEN 2 686 * 687 * Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1 688 * each Zeppelin Die had its own I/O die, that has been moved out of the 689 * core complex in Zen 2. The actual core complex looks pretty similar, but 690 * now the die actually looks much simpler: 691 * 692 * +--------------------------------------------------------+ 693 * | Zen 2 Core Complex Die HH | 694 * | HH | 695 * | +-----------+ HH +-----------+ | 696 * | | | HH | | | 697 * | | Core |==========| Core | | 698 * | | Complex |==========| Complex | | 699 * | | | HH | | | 700 * | +-----------+ HH +-----------+ | 701 * | HH | 702 * | HH | 703 * +--------------------------------------------------------+ 704 * 705 * From here, when we add the central I/O die, this changes things a bit. 706 * Each die is connected to the I/O die, rather than trying to interconnect 707 * them directly. The following image takes the same Zen 1 image that we 708 * had earlier and shows what it looks like with the I/O die instead: 709 * 710 * PP PP 711 * PP PP 712 * +---------------------PP----PP---------------------+ 713 * | PP PP | 714 * | +-----------+ PP PP +-----------+ | 715 * | | | PP PP | | | 716 * | | Zen 2 | +-PP----PP-+ | Zen 2 | | 717 * | | Die _| | PP PP | |_ Die | | 718 * | | |o|oooo| |oooo|o| | | 719 * | +-----------+ | | +-----------+ | 720 * | | I/O | | 721 * MMMMMMMMMMMMMMMMMMMMMMMMMM Die MMMMMMMMMMMMMMMMMMMMMMMMMM 722 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM 723 * | | | | 724 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM 725 * MMMMMMMMMMMMMMMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMM 726 * | | | | 727 * | +-----------+ | | +-----------+ | 728 * | | |o|oooo| PP PP |oooo|o| | | 729 * | | Zen 2 -| +-PP----PP-+ |- Zen 2 | | 730 * | | Die | PP PP | Die | | 731 * | | | PP PP | | | 732 * | +-----------+ PP PP +-----------+ | 733 * | PP PP | 734 * +---------------------PP----PP---------------------+ 735 * PP PP 736 * PP PP 737 * 738 * The above has four core complex dies installed, though the Zen 2 EPYC 739 * and ThreadRipper parts allow for up to eight, while the Ryzen parts 740 * generally only have one to two. The more notable difference here is how 741 * everything communicates. Note that memory and PCIe come out of the 742 * central die. This changes the way that one die accesses a resource. It 743 * basically always has to go to the I/O die, where as in Zen 1 it may have 744 * satisfied it locally. In general, this ends up being a better strategy 745 * for most things, though it is possible to still treat everything in four 746 * distinct NUMA domains with each Zen 2 die slightly closer to some memory 747 * and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as 748 * now there is only one 'node' present. 749 * 750 * ZEN 3 751 * 752 * From an architectural perspective, Zen 3 is a much smaller change from 753 * Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in 754 * its microarchitectural changes. The biggest thing for us is how the die 755 * changes. In Zen 1 and Zen 2, each core complex still had its own L3 756 * cache. However, in Zen 3, the L3 is now shared between the entire core 757 * complex die and is no longer partitioned between each core complex. This 758 * means that all cores on the die can share the same L3 cache. Otherwise, 759 * the general layout of the overall package with various core complexes 760 * and an I/O die stays the same. Here's what the Core Complex Die looks 761 * like in a bit more detail: 762 * 763 * +-------------------------------------------------+ 764 * | Zen 3 Core Complex Die | 765 * | +-------------------+ +-------------------+ | 766 * | | Core +----+ | | Core +----+ | | 767 * | | +--------+ | L2 | | | +--------+ | L2 | | | 768 * | | | Thread | +----+ | | | Thread | +----+ | | 769 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | 770 * | | | Thread | |L1| | | | Thread | |L1| | | 771 * | | +--------+ +--+ | | +--------+ +--+ | | 772 * | +-------------------+ +-------------------+ | 773 * | +-------------------+ +-------------------+ | 774 * | | Core +----+ | | Core +----+ | | 775 * | | +--------+ | L2 | | | +--------+ | L2 | | | 776 * | | | Thread | +----+ | | | Thread | +----+ | | 777 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | 778 * | | | Thread | |L1| | | | Thread | |L1| | | 779 * | | +--------+ +--+ | | +--------+ +--+ | | 780 * | +-------------------+ +-------------------+ | 781 * | | 782 * | +--------------------------------------------+ | 783 * | | L3 Cache | | 784 * | +--------------------------------------------+ | 785 * | | 786 * | +-------------------+ +-------------------+ | 787 * | | Core +----+ | | Core +----+ | | 788 * | | +--------+ | L2 | | | +--------+ | L2 | | | 789 * | | | Thread | +----+ | | | Thread | +----+ | | 790 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | 791 * | | | Thread | |L1| | | | Thread | |L1| | | 792 * | | +--------+ +--+ | | +--------+ +--+ | | 793 * | +-------------------+ +-------------------+ | 794 * | +-------------------+ +-------------------+ | 795 * | | Core +----+ | | Core +----+ | | 796 * | | +--------+ | L2 | | | +--------+ | L2 | | | 797 * | | | Thread | +----+ | | | Thread | +----+ | | 798 * | | +--------+-+ +--+ | | +--------+-+ +--+ | | 799 * | | | Thread | |L1| | | | Thread | |L1| | | 800 * | | +--------+ +--+ | | +--------+ +--+ | | 801 * | +-------------------+ +-------------------+ | 802 * +-------------------------------------------------+ 803 * 804 * While it is not pictured, there are connections from the die to the 805 * broader data fabric and additional functional blocks to support that 806 * communication and coherency. 807 * 808 * CPUID LEAVES 809 * 810 * There are a few different CPUID leaves that we can use to try and understand 811 * the actual state of the world. As part of the introduction of family 0xf, AMD 812 * added CPUID leaf 0x80000008. This leaf tells us the number of logical 813 * processors that are in the system. Because families before Zen didn't have 814 * SMT, this was always the number of cores that were in the system. However, it 815 * should always be thought of as the number of logical threads to be consistent 816 * between generations. In addition we also get the size of the APIC ID that is 817 * used to represent the number of logical processors. This is important for 818 * deriving topology information. 819 * 820 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a 821 * bit between Bulldozer and later families, but it is quite useful in 822 * determining the topology information. Because this information has changed 823 * across family generations, it's worth calling out what these mean 824 * explicitly. The registers have the following meanings: 825 * 826 * %eax The APIC ID. The entire register is defined to have a 32-bit 827 * APIC ID, even though on systems without x2apic support, it will 828 * be limited to 8 bits. 829 * 830 * %ebx On Bulldozer-era systems this contains information about the 831 * number of cores that are in a compute unit (cores that share 832 * resources). It also contains a per-package compute unit ID that 833 * identifies which compute unit the logical CPU is a part of. 834 * 835 * On Zen-era systems this instead contains the number of threads 836 * per core and the ID of the core that the logical CPU is a part 837 * of. Note, this ID is unique only to the package, it is not 838 * globally unique across the entire system. 839 * 840 * %ecx This contains the number of nodes that exist in the package. It 841 * also contains an ID that identifies which node the logical CPU 842 * is a part of. 843 * 844 * Finally, we also use cpuid leaf 0x8000001D to determine information about the 845 * cache layout to determine which logical CPUs are sharing which caches. 846 * 847 * illumos Topology 848 * ---------------- 849 * 850 * Based on the above we synthesize the information into several different 851 * variables that we store in the 'struct cpuid_info'. We'll go into the details 852 * of what each member is supposed to represent and their uniqueness. In 853 * general, there are two levels of uniqueness that we care about. We care about 854 * an ID that is globally unique. That means that it will be unique across all 855 * entities in the system. For example, the default logical CPU ID is globally 856 * unique. On the other hand, there is some information that we only care about 857 * being unique within the context of a single package / socket. Here are the 858 * variables that we keep track of and their meaning. 859 * 860 * Several of the values that are asking for an identifier, with the exception 861 * of cpi_apicid, are allowed to be synthetic. 862 * 863 * 864 * cpi_apicid 865 * 866 * This is the value of the CPU's APIC id. This should be the full 32-bit 867 * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit 868 * APIC ID. This value is globally unique between all logical CPUs across 869 * all packages. This is usually required by the APIC. 870 * 871 * cpi_chipid 872 * 873 * This value indicates the ID of the package that the logical CPU is a 874 * part of. This value is allowed to be synthetic. It is usually derived by 875 * taking the CPU's APIC ID and determining how many bits are used to 876 * represent CPU cores in the package. All logical CPUs that are part of 877 * the same package must have the same value. 878 * 879 * cpi_coreid 880 * 881 * This represents the ID of a CPU core. Two logical CPUs should only have 882 * the same cpi_coreid value if they are part of the same core. These 883 * values may be synthetic. On systems that support SMT, this value is 884 * usually derived from the APIC ID, otherwise it is often synthetic and 885 * just set to the value of the cpu_id in the cpu_t. 886 * 887 * cpi_pkgcoreid 888 * 889 * This is similar to the cpi_coreid in that logical CPUs that are part of 890 * the same core should have the same ID. The main difference is that these 891 * values are only required to be unique to a given socket. 892 * 893 * cpi_clogid 894 * 895 * This represents the logical ID of a logical CPU. This value should be 896 * unique within a given socket for each logical CPU. This is allowed to be 897 * synthetic, though it is usually based off of the CPU's apic ID. The 898 * broader system expects that logical CPUs that have are part of the same 899 * core have contiguous numbers. For example, if there were two threads per 900 * core, then the core IDs divided by two should be the same and the first 901 * modulus two should be zero and the second one. For example, IDs 4 and 5 902 * indicate two logical CPUs that are part of the same core. But IDs 5 and 903 * 6 represent two logical CPUs that are part of different cores. 904 * 905 * While it is common for the cpi_coreid and the cpi_clogid to be derived 906 * from the same source, strictly speaking, they don't have to be and the 907 * two values should be considered logically independent. One should not 908 * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine 909 * some kind of relationship. While this is tempting, we've seen cases on 910 * AMD family 0xf where the system's cpu id is not related to its APIC ID. 911 * 912 * cpi_ncpu_per_chip 913 * 914 * This value indicates the total number of logical CPUs that exist in the 915 * physical package. Critically, this is not the number of logical CPUs 916 * that exist for just the single core. 917 * 918 * This value should be the same for all logical CPUs in the same package. 919 * 920 * cpi_ncore_per_chip 921 * 922 * This value indicates the total number of physical CPU cores that exist 923 * in the package. The system compares this value with cpi_ncpu_per_chip to 924 * determine if simultaneous multi-threading (SMT) is enabled. When 925 * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and 926 * the X86FSET_HTT feature is not set. If this value is greater than one, 927 * than we consider the processor to have the feature X86FSET_CMP, to 928 * indicate that there is support for more than one core. 929 * 930 * This value should be the same for all logical CPUs in the same package. 931 * 932 * cpi_procnodes_per_pkg 933 * 934 * This value indicates the number of 'nodes' that exist in the package. 935 * When processors are actually a multi-chip module, this represents the 936 * number of such modules that exist in the package. Currently, on Intel 937 * based systems this member is always set to 1. 938 * 939 * This value should be the same for all logical CPUs in the same package. 940 * 941 * cpi_procnodeid 942 * 943 * This value indicates the ID of the node that the logical CPU is a part 944 * of. All logical CPUs that are in the same node must have the same value 945 * here. This value must be unique across all of the packages in the 946 * system. On Intel based systems, this is currently set to the value in 947 * cpi_chipid because there is only one node. 948 * 949 * cpi_cores_per_compunit 950 * 951 * This value indicates the number of cores that are part of a compute 952 * unit. See the AMD topology section for this. This member only has real 953 * meaning currently for AMD Bulldozer family processors. For all other 954 * processors, this should currently be set to 1. 955 * 956 * cpi_compunitid 957 * 958 * This indicates the compute unit that the logical CPU belongs to. For 959 * processors without AMD Bulldozer-style compute units this should be set 960 * to the value of cpi_coreid. 961 * 962 * cpi_ncpu_shr_last_cache 963 * 964 * This indicates the number of logical CPUs that are sharing the same last 965 * level cache. This value should be the same for all CPUs that are sharing 966 * that cache. The last cache refers to the cache that is closest to memory 967 * and furthest away from the CPU. 968 * 969 * cpi_last_lvl_cacheid 970 * 971 * This indicates the ID of the last cache that the logical CPU uses. This 972 * cache is often shared between multiple logical CPUs and is the cache 973 * that is closest to memory and furthest away from the CPU. This value 974 * should be the same for a group of logical CPUs only if they actually 975 * share the same last level cache. IDs should not overlap between 976 * packages. 977 * 978 * cpi_ncore_bits 979 * 980 * This indicates the number of bits that are required to represent all of 981 * the cores in the system. As cores are derived based on their APIC IDs, 982 * we aren't guaranteed a run of APIC IDs starting from zero. It's OK for 983 * this value to be larger than the actual number of IDs that are present 984 * in the system. This is used to size tables by the CMI framework. It is 985 * only filled in for Intel and AMD CPUs. 986 * 987 * cpi_nthread_bits 988 * 989 * This indicates the number of bits required to represent all of the IDs 990 * that cover the logical CPUs that exist on a given core. It's OK for this 991 * value to be larger than the actual number of IDs that are present in the 992 * system. This is used to size tables by the CMI framework. It is 993 * only filled in for Intel and AMD CPUs. 994 * 995 * ----------- 996 * Hypervisors 997 * ----------- 998 * 999 * If trying to manage the differences between vendors wasn't bad enough, it can 1000 * get worse thanks to our friend hardware virtualization. Hypervisors are given 1001 * the ability to interpose on all cpuid instructions and change them to suit 1002 * their purposes. In general, this is necessary as the hypervisor wants to be 1003 * able to present a more uniform set of features or not necessarily give the 1004 * guest operating system kernel knowledge of all features so it can be 1005 * more easily migrated between systems. 1006 * 1007 * When it comes to trying to determine topology information, this can be a 1008 * double edged sword. When a hypervisor doesn't actually implement a cpuid 1009 * leaf, it'll often return all zeros. Because of that, you'll often see various 1010 * checks scattered about fields being non-zero before we assume we can use 1011 * them. 1012 * 1013 * When it comes to topology information, the hypervisor is often incentivized 1014 * to lie to you about topology. This is because it doesn't always actually 1015 * guarantee that topology at all. The topology path we take in the system 1016 * depends on how the CPU advertises itself. If it advertises itself as an Intel 1017 * or AMD CPU, then we basically do our normal path. However, when they don't 1018 * use an actual vendor, then that usually turns into multiple one-core CPUs 1019 * that we enumerate that are often on different sockets. The actual behavior 1020 * depends greatly on what the hypervisor actually exposes to us. 1021 * 1022 * -------------------- 1023 * Exposing Information 1024 * -------------------- 1025 * 1026 * We expose CPUID information in three different forms in the system. 1027 * 1028 * The first is through the x86_featureset variable. This is used in conjunction 1029 * with the is_x86_feature() function. This is queried by x86-specific functions 1030 * to determine which features are or aren't present in the system and to make 1031 * decisions based upon them. For example, users of this include everything from 1032 * parts of the system dedicated to reliability, availability, and 1033 * serviceability (RAS), to making decisions about how to handle security 1034 * mitigations, to various x86-specific drivers. General purpose or 1035 * architecture independent drivers should never be calling this function. 1036 * 1037 * The second means is through the auxiliary vector. The auxiliary vector is a 1038 * series of tagged data that the kernel passes down to a user program when it 1039 * begins executing. This information is used to indicate to programs what 1040 * instruction set extensions are present. For example, information about the 1041 * CPU supporting the machine check architecture (MCA) wouldn't be passed down 1042 * since user programs cannot make use of it. However, things like the AVX 1043 * instruction sets are. Programs use this information to make run-time 1044 * decisions about what features they should use. As an example, the run-time 1045 * link-editor (rtld) can relocate different functions depending on the hardware 1046 * support available. 1047 * 1048 * The final form is through a series of accessor functions that all have the 1049 * form cpuid_get*. This is used by a number of different subsystems in the 1050 * kernel to determine more detailed information about what we're running on, 1051 * topology information, etc. Some of these subsystems include processor groups 1052 * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI, 1053 * microcode, and performance monitoring. These functions all ASSERT that the 1054 * CPU they're being called on has reached a certain cpuid pass. If the passes 1055 * are rearranged, then this needs to be adjusted. 1056 * 1057 * ----------------------------------------------- 1058 * Speculative Execution CPU Side Channel Security 1059 * ----------------------------------------------- 1060 * 1061 * With the advent of the Spectre and Meltdown attacks which exploit speculative 1062 * execution in the CPU to create side channels there have been a number of 1063 * different attacks and corresponding issues that the operating system needs to 1064 * mitigate against. The following list is some of the common, but not 1065 * exhaustive, set of issues that we know about and have done some or need to do 1066 * more work in the system to mitigate against: 1067 * 1068 * - Spectre v1 1069 * - swapgs (Spectre v1 variant) 1070 * - Spectre v2 1071 * - Meltdown (Spectre v3) 1072 * - Rogue Register Read (Spectre v3a) 1073 * - Speculative Store Bypass (Spectre v4) 1074 * - ret2spec, SpectreRSB 1075 * - L1 Terminal Fault (L1TF) 1076 * - Microarchitectural Data Sampling (MDS) 1077 * 1078 * Each of these requires different sets of mitigations and has different attack 1079 * surfaces. For the most part, this discussion is about protecting the kernel 1080 * from non-kernel executing environments such as user processes and hardware 1081 * virtual machines. Unfortunately, there are a number of user vs. user 1082 * scenarios that exist with these. The rest of this section will describe the 1083 * overall approach that the system has taken to address these as well as their 1084 * shortcomings. Unfortunately, not all of the above have been handled today. 1085 * 1086 * SPECTRE v2, ret2spec, SpectreRSB 1087 * 1088 * The second variant of the spectre attack focuses on performing branch target 1089 * injection. This generally impacts indirect call instructions in the system. 1090 * There are three different ways to mitigate this issue that are commonly 1091 * described today: 1092 * 1093 * 1. Using Indirect Branch Restricted Speculation (IBRS). 1094 * 2. Using Retpolines and RSB Stuffing 1095 * 3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS) 1096 * 1097 * IBRS uses a feature added to microcode to restrict speculation, among other 1098 * things. This form of mitigation has not been used as it has been generally 1099 * seen as too expensive and requires reactivation upon various transitions in 1100 * the system. 1101 * 1102 * As a less impactful alternative to IBRS, retpolines were developed by 1103 * Google. These basically require one to replace indirect calls with a specific 1104 * trampoline that will cause speculation to fail and break the attack. 1105 * Retpolines require compiler support. We always build with retpolines in the 1106 * external thunk mode. This means that a traditional indirect call is replaced 1107 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect 1108 * of this is that all indirect function calls are performed through a register. 1109 * 1110 * We have to use a common external location of the thunk and not inline it into 1111 * the callsite so that way we can have a single place to patch these functions. 1112 * As it turns out, we currently have two different forms of retpolines that 1113 * exist in the system: 1114 * 1115 * 1. A full retpoline 1116 * 2. A no-op version 1117 * 1118 * The first one is used in the general case. Historically, there was an 1119 * AMD-specific optimized retopoline variant that was based around using a 1120 * serializing lfence instruction; however, in March 2022 it was announced that 1121 * this was actually still vulnerable to Spectre v2 and therefore we no longer 1122 * use it and it is no longer available in the system. 1123 * 1124 * The third form described above is the most curious. It turns out that the way 1125 * that retpolines are implemented is that they rely on how speculation is 1126 * performed on a 'ret' instruction. Intel has continued to optimize this 1127 * process (which is partly why we need to have return stack buffer stuffing, 1128 * but more on that in a bit) and in processors starting with Cascade Lake 1129 * on the server side, it's dangerous to rely on retpolines. Instead, a new 1130 * mechanism has been introduced called Enhanced IBRS (eIBRS). 1131 * 1132 * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each 1133 * physical core. However, if this is the case, we don't want to use retpolines 1134 * any more. Therefore if eIBRS is present, we end up turning each retpoline 1135 * function (called a thunk) into a jmp instruction. This means that we're still 1136 * paying the cost of an extra jump to the external thunk, but it gives us 1137 * flexibility and the ability to have a single kernel image that works across a 1138 * wide variety of systems and hardware features. 1139 * 1140 * Unfortunately, this alone is insufficient. First, Skylake systems have 1141 * additional speculation for the Return Stack Buffer (RSB) which is used to 1142 * return from call instructions which retpolines take advantage of. However, 1143 * this problem is not just limited to Skylake and is actually more pernicious. 1144 * The SpectreRSB paper introduces several more problems that can arise with 1145 * dealing with this. The RSB can be poisoned just like the indirect branch 1146 * predictor. This means that one needs to clear the RSB when transitioning 1147 * between two different privilege domains. Some examples include: 1148 * 1149 * - Switching between two different user processes 1150 * - Going between user land and the kernel 1151 * - Returning to the kernel from a hardware virtual machine 1152 * 1153 * Mitigating this involves combining a couple of different things. The first is 1154 * SMEP (supervisor mode execution protection) which was introduced in Ivy 1155 * Bridge. When an RSB entry refers to a user address and we're executing in the 1156 * kernel, speculation through it will be stopped when SMEP is enabled. This 1157 * protects against a number of the different cases that we would normally be 1158 * worried about such as when we enter the kernel from user land. 1159 * 1160 * To prevent against additional manipulation of the RSB from other contexts 1161 * such as a non-root VMX context attacking the kernel we first look to 1162 * enhanced IBRS. When eIBRS is present and enabled, then there should be 1163 * nothing else that we need to do to protect the kernel at this time. 1164 * 1165 * Unfortunately, eIBRS or not, we need to manually overwrite the contents of 1166 * the return stack buffer. We do this through the x86_rsb_stuff() function. 1167 * Currently this is employed on context switch and vmx_exit. The 1168 * x86_rsb_stuff() function is disabled only when mitigations in general are. 1169 * 1170 * If SMEP is not present, then we would have to stuff the RSB every time we 1171 * transitioned from user mode to the kernel, which isn't very practical right 1172 * now. 1173 * 1174 * To fully protect user to user and vmx to vmx attacks from these classes of 1175 * issues, we would also need to allow them to opt into performing an Indirect 1176 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up. 1177 * 1178 * By default, the system will enable RSB stuffing and the required variant of 1179 * retpolines and store that information in the x86_spectrev2_mitigation value. 1180 * This will be evaluated after a microcode update as well, though it is 1181 * expected that microcode updates will not take away features. This may mean 1182 * that a late loaded microcode may not end up in the optimal configuration 1183 * (though this should be rare). 1184 * 1185 * Currently we do not build kmdb with retpolines or perform any additional side 1186 * channel security mitigations for it. One complication with kmdb is that it 1187 * requires its own retpoline thunks and it would need to adjust itself based on 1188 * what the kernel does. The threat model of kmdb is more limited and therefore 1189 * it may make more sense to investigate using prediction barriers as the whole 1190 * system is only executing a single instruction at a time while in kmdb. 1191 * 1192 * SPECTRE v1, v4 1193 * 1194 * The v1 and v4 variants of spectre are not currently mitigated in the 1195 * system and require other classes of changes to occur in the code. 1196 * 1197 * SPECTRE v1 (SWAPGS VARIANT) 1198 * 1199 * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but 1200 * can generally affect any branch-dependent code. The swapgs issue is one 1201 * variant of this. If we are coming in from userspace, we can have code like 1202 * this: 1203 * 1204 * cmpw $KCS_SEL, REGOFF_CS(%rsp) 1205 * je 1f 1206 * movq $0, REGOFF_SAVFP(%rsp) 1207 * swapgs 1208 * 1: 1209 * movq %gs:CPU_THREAD, %rax 1210 * 1211 * If an attacker can cause a mis-speculation of the branch here, we could skip 1212 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based 1213 * load. If subsequent code can act as the usual Spectre cache gadget, this 1214 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to 1215 * any use of the %gs override. 1216 * 1217 * The other case is also an issue: if we're coming into a trap from kernel 1218 * space, we could mis-speculate and swapgs the user %gsbase back in prior to 1219 * using it. AMD systems are not vulnerable to this version, as a swapgs is 1220 * serializing with respect to subsequent uses. But as AMD /does/ need the other 1221 * case, and the fix is the same in both cases (an lfence at the branch target 1222 * 1: in this example), we'll just do it unconditionally. 1223 * 1224 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it 1225 * harder for user-space to actually set a useful %gsbase value: although it's 1226 * not clear, it might still be feasible via lwp_setprivate(), though, so we 1227 * mitigate anyway. 1228 * 1229 * MELTDOWN 1230 * 1231 * Meltdown, or spectre v3, allowed a user process to read any data in their 1232 * address space regardless of whether or not the page tables in question 1233 * allowed the user to have the ability to read them. The solution to meltdown 1234 * is kernel page table isolation. In this world, there are two page tables that 1235 * are used for a process, one in user land and one in the kernel. To implement 1236 * this we use per-CPU page tables and switch between the user and kernel 1237 * variants when entering and exiting the kernel. For more information about 1238 * this process and how the trampolines work, please see the big theory 1239 * statements and additional comments in: 1240 * 1241 * - uts/i86pc/ml/kpti_trampolines.s 1242 * - uts/i86pc/vm/hat_i86.c 1243 * 1244 * While Meltdown only impacted Intel systems and there are also Intel systems 1245 * that have Meltdown fixed (called Rogue Data Cache Load), we always have 1246 * kernel page table isolation enabled. While this may at first seem weird, an 1247 * important thing to remember is that you can't speculatively read an address 1248 * if it's never in your page table at all. Having user processes without kernel 1249 * pages present provides us with an important layer of defense in the kernel 1250 * against any other side channel attacks that exist and have yet to be 1251 * discovered. As such, kernel page table isolation (KPTI) is always enabled by 1252 * default, no matter the x86 system. 1253 * 1254 * L1 TERMINAL FAULT 1255 * 1256 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative 1257 * execution uses page table entries. Effectively, it is two different problems. 1258 * The first is that it ignores the not present bit in the page table entries 1259 * when performing speculative execution. This means that something can 1260 * speculatively read the listed physical address if it's present in the L1 1261 * cache under certain conditions (see Intel's documentation for the full set of 1262 * conditions). Secondly, this can be used to bypass hardware virtualization 1263 * extended page tables (EPT) that are part of Intel's hardware virtual machine 1264 * instructions. 1265 * 1266 * For the non-hardware virtualized case, this is relatively easy to deal with. 1267 * We must make sure that all unmapped pages have an address of zero. This means 1268 * that they could read the first 4k of physical memory; however, we never use 1269 * that first page in the operating system and always skip putting it in our 1270 * memory map, even if firmware tells us we can use it in our memory map. While 1271 * other systems try to put extra metadata in the address and reserved bits, 1272 * which led to this being problematic in those cases, we do not. 1273 * 1274 * For hardware virtual machines things are more complicated. Because they can 1275 * construct their own page tables, it isn't hard for them to perform this 1276 * attack against any physical address. The one wrinkle is that this physical 1277 * address must be in the L1 data cache. Thus Intel added an MSR that we can use 1278 * to flush the L1 data cache. We wrap this up in the function 1279 * spec_uarch_flush(). This function is also used in the mitigation of 1280 * microarchitectural data sampling (MDS) discussed later on. Kernel based 1281 * hypervisors such as KVM or bhyve are responsible for performing this before 1282 * entering the guest. 1283 * 1284 * Because this attack takes place in the L1 cache, there's another wrinkle 1285 * here. The L1 cache is shared between all logical CPUs in a core in most Intel 1286 * designs. This means that when a thread enters a hardware virtualized context 1287 * and flushes the L1 data cache, the other thread on the processor may then go 1288 * ahead and put new data in it that can be potentially attacked. While one 1289 * solution is to disable SMT on the system, another option that is available is 1290 * to use a feature for hardware virtualization called 'SMT exclusion'. This 1291 * goes through and makes sure that if a HVM is being scheduled on one thread, 1292 * then the thing on the other thread is from the same hardware virtual machine. 1293 * If an interrupt comes in or the guest exits to the broader system, then the 1294 * other SMT thread will be kicked out. 1295 * 1296 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the 1297 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not 1298 * perform L1TF related mitigations. 1299 * 1300 * MICROARCHITECTURAL DATA SAMPLING 1301 * 1302 * Microarchitectural data sampling (MDS) is a combination of four discrete 1303 * vulnerabilities that are similar issues affecting various parts of the CPU's 1304 * microarchitectural implementation around load, store, and fill buffers. 1305 * Specifically it is made up of the following subcomponents: 1306 * 1307 * 1. Microarchitectural Store Buffer Data Sampling (MSBDS) 1308 * 2. Microarchitectural Fill Buffer Data Sampling (MFBDS) 1309 * 3. Microarchitectural Load Port Data Sampling (MLPDS) 1310 * 4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM) 1311 * 1312 * To begin addressing these, Intel has introduced another feature in microcode 1313 * called MD_CLEAR. This changes the verw instruction to operate in a different 1314 * way. This allows us to execute the verw instruction in a particular way to 1315 * flush the state of the affected parts. The L1TF L1D flush mechanism is also 1316 * updated when this microcode is present to flush this state. 1317 * 1318 * Primarily we need to flush this state whenever we transition from the kernel 1319 * to a less privileged context such as user mode or an HVM guest. MSBDS is a 1320 * little bit different. Here the structures are statically sized when a logical 1321 * CPU is in use and resized when it goes to sleep. Therefore, we also need to 1322 * flush the microarchitectural state before the CPU goes idles by calling hlt, 1323 * mwait, or another ACPI method. To perform these flushes, we call 1324 * x86_md_clear() at all of these transition points. 1325 * 1326 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF, 1327 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If 1328 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes 1329 * a no-op. 1330 * 1331 * Unfortunately, with this issue hyperthreading rears its ugly head. In 1332 * particular, everything we've discussed above is only valid for a single 1333 * thread executing on a core. In the case where you have hyper-threading 1334 * present, this attack can be performed between threads. The theoretical fix 1335 * for this is to ensure that both threads are always in the same security 1336 * domain. This means that they are executing in the same ring and mutually 1337 * trust each other. Practically speaking, this would mean that a system call 1338 * would have to issue an inter-processor interrupt (IPI) to the other thread. 1339 * Rather than implement this, we recommend that one disables hyper-threading 1340 * through the use of psradm -aS. 1341 * 1342 * TSX ASYNCHRONOUS ABORT 1343 * 1344 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that 1345 * behaves like MDS, but leverages Intel's transactional instructions as another 1346 * vector. Effectively, when a transaction hits one of these cases (unmapped 1347 * page, various cache snoop activity, etc.) then the same data can be exposed 1348 * as in the case of MDS. This means that you can attack your twin. 1349 * 1350 * Intel has described that there are two different ways that we can mitigate 1351 * this problem on affected processors: 1352 * 1353 * 1) We can use the same techniques used to deal with MDS. Flushing the 1354 * microarchitectural buffers and disabling hyperthreading will mitigate 1355 * this in the same way. 1356 * 1357 * 2) Using microcode to disable TSX. 1358 * 1359 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in 1360 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX. 1361 * That's OK as we're already doing all such mitigations. On the other hand, 1362 * processors with MDS_NO are all supposed to receive microcode updates that 1363 * enumerate support for disabling TSX. In general, we'd rather use this method 1364 * when available as it doesn't require disabling hyperthreading to be 1365 * effective. Currently we basically are relying on microcode for processors 1366 * that enumerate MDS_NO. 1367 * 1368 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES. 1369 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two 1370 * different powers. The first allows us to cause all transactions to 1371 * immediately abort. The second gives us a means of disabling TSX completely, 1372 * which includes removing it from cpuid. If we have support for this in 1373 * microcode during the first cpuid pass, then we'll disable TSX completely such 1374 * that user land never has a chance to observe the bit. However, if we are late 1375 * loading the microcode, then we must use the functionality to cause 1376 * transactions to automatically abort. This is necessary for user land's sake. 1377 * Once a program sees a cpuid bit, it must not be taken away. 1378 * 1379 * We track whether or not we should do this based on what cpuid pass we're in. 1380 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass 1381 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this 1382 * should happen twice. Once in the normal cpuid_pass_basic() code and then a 1383 * second time after we do the initial microcode update. As a result we need to 1384 * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a 1385 * suitable microcode on the current CPU (which happens prior to 1386 * cpuid_pass_ucode()). 1387 * 1388 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES 1389 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an 1390 * unfortunate feature in a number of ways, and taking the opportunity to 1391 * finally be able to turn it off is likely to be of benefit in the future. 1392 * 1393 * SUMMARY 1394 * 1395 * The following table attempts to summarize the mitigations for various issues 1396 * and what's done in various places: 1397 * 1398 * - Spectre v1: Not currently mitigated 1399 * - swapgs: lfences after swapgs paths 1400 * - Spectre v2: Retpolines/RSB Stuffing or eIBRS if HW support 1401 * - Meltdown: Kernel Page Table Isolation 1402 * - Spectre v3a: Updated CPU microcode 1403 * - Spectre v4: Not currently mitigated 1404 * - SpectreRSB: SMEP and RSB Stuffing 1405 * - L1TF: spec_uarch_flush, SMT exclusion, requires microcode 1406 * - MDS: x86_md_clear, requires microcode, disabling SMT 1407 * - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX 1408 * 1409 * The following table indicates the x86 feature set bits that indicate that a 1410 * given problem has been solved or a notable feature is present: 1411 * 1412 * - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS 1413 * - MDS_NO: All forms of MDS 1414 * - TAA_NO: TAA 1415 */ 1416 1417 #include <sys/types.h> 1418 #include <sys/archsystm.h> 1419 #include <sys/x86_archext.h> 1420 #include <sys/kmem.h> 1421 #include <sys/systm.h> 1422 #include <sys/cmn_err.h> 1423 #include <sys/sunddi.h> 1424 #include <sys/sunndi.h> 1425 #include <sys/cpuvar.h> 1426 #include <sys/processor.h> 1427 #include <sys/sysmacros.h> 1428 #include <sys/pg.h> 1429 #include <sys/fp.h> 1430 #include <sys/controlregs.h> 1431 #include <sys/bitmap.h> 1432 #include <sys/auxv_386.h> 1433 #include <sys/memnode.h> 1434 #include <sys/pci_cfgspace.h> 1435 #include <sys/comm_page.h> 1436 #include <sys/mach_mmu.h> 1437 #include <sys/ucode.h> 1438 #include <sys/tsc.h> 1439 #include <sys/kobj.h> 1440 #include <sys/asm_misc.h> 1441 1442 #ifdef __xpv 1443 #include <sys/hypervisor.h> 1444 #else 1445 #include <sys/ontrap.h> 1446 #endif 1447 1448 uint_t x86_vendor = X86_VENDOR_IntelClone; 1449 uint_t x86_type = X86_TYPE_OTHER; 1450 uint_t x86_clflush_size = 0; 1451 1452 #if defined(__xpv) 1453 int x86_use_pcid = 0; 1454 int x86_use_invpcid = 0; 1455 #else 1456 int x86_use_pcid = -1; 1457 int x86_use_invpcid = -1; 1458 #endif 1459 1460 typedef enum { 1461 X86_SPECTREV2_RETPOLINE, 1462 X86_SPECTREV2_ENHANCED_IBRS, 1463 X86_SPECTREV2_DISABLED 1464 } x86_spectrev2_mitigation_t; 1465 1466 uint_t x86_disable_spectrev2 = 0; 1467 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation = 1468 X86_SPECTREV2_RETPOLINE; 1469 1470 /* 1471 * The mitigation status for TAA: 1472 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels 1473 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa 1474 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA 1475 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort 1476 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID 1477 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable 1478 */ 1479 typedef enum { 1480 X86_TAA_NOTHING, 1481 X86_TAA_DISABLED, 1482 X86_TAA_MD_CLEAR, 1483 X86_TAA_TSX_FORCE_ABORT, 1484 X86_TAA_TSX_DISABLE, 1485 X86_TAA_HW_MITIGATED 1486 } x86_taa_mitigation_t; 1487 1488 uint_t x86_disable_taa = 0; 1489 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING; 1490 1491 uint_t pentiumpro_bug4046376; 1492 1493 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; 1494 1495 static char *x86_feature_names[NUM_X86_FEATURES] = { 1496 "lgpg", 1497 "tsc", 1498 "msr", 1499 "mtrr", 1500 "pge", 1501 "de", 1502 "cmov", 1503 "mmx", 1504 "mca", 1505 "pae", 1506 "cv8", 1507 "pat", 1508 "sep", 1509 "sse", 1510 "sse2", 1511 "htt", 1512 "asysc", 1513 "nx", 1514 "sse3", 1515 "cx16", 1516 "cmp", 1517 "tscp", 1518 "mwait", 1519 "sse4a", 1520 "cpuid", 1521 "ssse3", 1522 "sse4_1", 1523 "sse4_2", 1524 "1gpg", 1525 "clfsh", 1526 "64", 1527 "aes", 1528 "pclmulqdq", 1529 "xsave", 1530 "avx", 1531 "vmx", 1532 "svm", 1533 "topoext", 1534 "f16c", 1535 "rdrand", 1536 "x2apic", 1537 "avx2", 1538 "bmi1", 1539 "bmi2", 1540 "fma", 1541 "smep", 1542 "smap", 1543 "adx", 1544 "rdseed", 1545 "mpx", 1546 "avx512f", 1547 "avx512dq", 1548 "avx512pf", 1549 "avx512er", 1550 "avx512cd", 1551 "avx512bw", 1552 "avx512vl", 1553 "avx512fma", 1554 "avx512vbmi", 1555 "avx512_vpopcntdq", 1556 "avx512_4vnniw", 1557 "avx512_4fmaps", 1558 "xsaveopt", 1559 "xsavec", 1560 "xsaves", 1561 "sha", 1562 "umip", 1563 "pku", 1564 "ospke", 1565 "pcid", 1566 "invpcid", 1567 "ibrs", 1568 "ibpb", 1569 "stibp", 1570 "ssbd", 1571 "ssbd_virt", 1572 "rdcl_no", 1573 "ibrs_all", 1574 "rsba", 1575 "ssb_no", 1576 "stibp_all", 1577 "flush_cmd", 1578 "l1d_vmentry_no", 1579 "fsgsbase", 1580 "clflushopt", 1581 "clwb", 1582 "monitorx", 1583 "clzero", 1584 "xop", 1585 "fma4", 1586 "tbm", 1587 "avx512_vnni", 1588 "amd_pcec", 1589 "md_clear", 1590 "mds_no", 1591 "core_thermal", 1592 "pkg_thermal", 1593 "tsx_ctrl", 1594 "taa_no", 1595 "ppin", 1596 "vaes", 1597 "vpclmulqdq", 1598 "lfence_serializing" 1599 }; 1600 1601 boolean_t 1602 is_x86_feature(void *featureset, uint_t feature) 1603 { 1604 ASSERT(feature < NUM_X86_FEATURES); 1605 return (BT_TEST((ulong_t *)featureset, feature)); 1606 } 1607 1608 void 1609 add_x86_feature(void *featureset, uint_t feature) 1610 { 1611 ASSERT(feature < NUM_X86_FEATURES); 1612 BT_SET((ulong_t *)featureset, feature); 1613 } 1614 1615 void 1616 remove_x86_feature(void *featureset, uint_t feature) 1617 { 1618 ASSERT(feature < NUM_X86_FEATURES); 1619 BT_CLEAR((ulong_t *)featureset, feature); 1620 } 1621 1622 boolean_t 1623 compare_x86_featureset(void *setA, void *setB) 1624 { 1625 /* 1626 * We assume that the unused bits of the bitmap are always zero. 1627 */ 1628 if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) { 1629 return (B_TRUE); 1630 } else { 1631 return (B_FALSE); 1632 } 1633 } 1634 1635 void 1636 print_x86_featureset(void *featureset) 1637 { 1638 uint_t i; 1639 1640 for (i = 0; i < NUM_X86_FEATURES; i++) { 1641 if (is_x86_feature(featureset, i)) { 1642 cmn_err(CE_CONT, "?x86_feature: %s\n", 1643 x86_feature_names[i]); 1644 } 1645 } 1646 } 1647 1648 /* Note: This is the maximum size for the CPU, not the size of the structure. */ 1649 static size_t xsave_state_size = 0; 1650 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); 1651 boolean_t xsave_force_disable = B_FALSE; 1652 extern int disable_smap; 1653 1654 /* 1655 * This is set to platform type we are running on. 1656 */ 1657 static int platform_type = -1; 1658 1659 #if !defined(__xpv) 1660 /* 1661 * Variable to patch if hypervisor platform detection needs to be 1662 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0). 1663 */ 1664 int enable_platform_detection = 1; 1665 #endif 1666 1667 /* 1668 * monitor/mwait info. 1669 * 1670 * size_actual and buf_actual are the real address and size allocated to get 1671 * proper mwait_buf alignement. buf_actual and size_actual should be passed 1672 * to kmem_free(). Currently kmem_alloc() and mwait happen to both use 1673 * processor cache-line alignment, but this is not guarantied in the furture. 1674 */ 1675 struct mwait_info { 1676 size_t mon_min; /* min size to avoid missed wakeups */ 1677 size_t mon_max; /* size to avoid false wakeups */ 1678 size_t size_actual; /* size actually allocated */ 1679 void *buf_actual; /* memory actually allocated */ 1680 uint32_t support; /* processor support of monitor/mwait */ 1681 }; 1682 1683 /* 1684 * xsave/xrestor info. 1685 * 1686 * This structure contains HW feature bits and the size of the xsave save area. 1687 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure 1688 * (xsave_state) to describe the xsave layout. However, at runtime the 1689 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The 1690 * xsave_state structure simply represents the legacy layout of the beginning 1691 * of the xsave area. 1692 */ 1693 struct xsave_info { 1694 uint32_t xsav_hw_features_low; /* Supported HW features */ 1695 uint32_t xsav_hw_features_high; /* Supported HW features */ 1696 size_t xsav_max_size; /* max size save area for HW features */ 1697 size_t ymm_size; /* AVX: size of ymm save area */ 1698 size_t ymm_offset; /* AVX: offset for ymm save area */ 1699 size_t bndregs_size; /* MPX: size of bndregs save area */ 1700 size_t bndregs_offset; /* MPX: offset for bndregs save area */ 1701 size_t bndcsr_size; /* MPX: size of bndcsr save area */ 1702 size_t bndcsr_offset; /* MPX: offset for bndcsr save area */ 1703 size_t opmask_size; /* AVX512: size of opmask save */ 1704 size_t opmask_offset; /* AVX512: offset for opmask save */ 1705 size_t zmmlo_size; /* AVX512: size of zmm 256 save */ 1706 size_t zmmlo_offset; /* AVX512: offset for zmm 256 save */ 1707 size_t zmmhi_size; /* AVX512: size of zmm hi reg save */ 1708 size_t zmmhi_offset; /* AVX512: offset for zmm hi reg save */ 1709 }; 1710 1711 1712 /* 1713 * These constants determine how many of the elements of the 1714 * cpuid we cache in the cpuid_info data structure; the 1715 * remaining elements are accessible via the cpuid instruction. 1716 */ 1717 1718 #define NMAX_CPI_STD 8 /* eax = 0 .. 7 */ 1719 #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */ 1720 1721 /* 1722 * See the big theory statement for a more detailed explanation of what some of 1723 * these members mean. 1724 */ 1725 struct cpuid_info { 1726 uint_t cpi_pass; /* last pass completed */ 1727 /* 1728 * standard function information 1729 */ 1730 uint_t cpi_maxeax; /* fn 0: %eax */ 1731 char cpi_vendorstr[13]; /* fn 0: %ebx:%ecx:%edx */ 1732 uint_t cpi_vendor; /* enum of cpi_vendorstr */ 1733 1734 uint_t cpi_family; /* fn 1: extended family */ 1735 uint_t cpi_model; /* fn 1: extended model */ 1736 uint_t cpi_step; /* fn 1: stepping */ 1737 chipid_t cpi_chipid; /* fn 1: %ebx: Intel: chip # */ 1738 /* AMD: package/socket # */ 1739 uint_t cpi_brandid; /* fn 1: %ebx: brand ID */ 1740 int cpi_clogid; /* fn 1: %ebx: thread # */ 1741 uint_t cpi_ncpu_per_chip; /* fn 1: %ebx: logical cpu count */ 1742 uint8_t cpi_cacheinfo[16]; /* fn 2: intel-style cache desc */ 1743 uint_t cpi_ncache; /* fn 2: number of elements */ 1744 uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */ 1745 id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */ 1746 uint_t cpi_cache_leaf_size; /* Number of cache elements */ 1747 /* Intel fn: 4, AMD fn: 8000001d */ 1748 struct cpuid_regs **cpi_cache_leaves; /* Acual leaves from above */ 1749 struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */ 1750 /* 1751 * extended function information 1752 */ 1753 uint_t cpi_xmaxeax; /* fn 0x80000000: %eax */ 1754 char cpi_brandstr[49]; /* fn 0x8000000[234] */ 1755 uint8_t cpi_pabits; /* fn 0x80000006: %eax */ 1756 uint8_t cpi_vabits; /* fn 0x80000006: %eax */ 1757 uint8_t cpi_fp_amd_save; /* AMD: FP error pointer save rqd. */ 1758 struct cpuid_regs cpi_extd[NMAX_CPI_EXTD]; /* 0x800000XX */ 1759 1760 id_t cpi_coreid; /* same coreid => strands share core */ 1761 int cpi_pkgcoreid; /* core number within single package */ 1762 uint_t cpi_ncore_per_chip; /* AMD: fn 0x80000008: %ecx[7-0] */ 1763 /* Intel: fn 4: %eax[31-26] */ 1764 1765 /* 1766 * These values represent the number of bits that are required to store 1767 * information about the number of cores and threads. 1768 */ 1769 uint_t cpi_ncore_bits; 1770 uint_t cpi_nthread_bits; 1771 /* 1772 * supported feature information 1773 */ 1774 uint32_t cpi_support[6]; 1775 #define STD_EDX_FEATURES 0 1776 #define AMD_EDX_FEATURES 1 1777 #define TM_EDX_FEATURES 2 1778 #define STD_ECX_FEATURES 3 1779 #define AMD_ECX_FEATURES 4 1780 #define STD_EBX_FEATURES 5 1781 /* 1782 * Synthesized information, where known. 1783 */ 1784 uint32_t cpi_chiprev; /* See X86_CHIPREV_* in x86_archext.h */ 1785 const char *cpi_chiprevstr; /* May be NULL if chiprev unknown */ 1786 uint32_t cpi_socket; /* Chip package/socket type */ 1787 1788 struct mwait_info cpi_mwait; /* fn 5: monitor/mwait info */ 1789 uint32_t cpi_apicid; 1790 uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */ 1791 uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */ 1792 /* Intel: 1 */ 1793 uint_t cpi_compunitid; /* AMD: ComputeUnit ID, Intel: coreid */ 1794 uint_t cpi_cores_per_compunit; /* AMD: # of cores in the ComputeUnit */ 1795 1796 struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */ 1797 }; 1798 1799 1800 static struct cpuid_info cpuid_info0; 1801 1802 /* 1803 * These bit fields are defined by the Intel Application Note AP-485 1804 * "Intel Processor Identification and the CPUID Instruction" 1805 */ 1806 #define CPI_FAMILY_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 27, 20) 1807 #define CPI_MODEL_XTD(cpi) BITX((cpi)->cpi_std[1].cp_eax, 19, 16) 1808 #define CPI_TYPE(cpi) BITX((cpi)->cpi_std[1].cp_eax, 13, 12) 1809 #define CPI_FAMILY(cpi) BITX((cpi)->cpi_std[1].cp_eax, 11, 8) 1810 #define CPI_STEP(cpi) BITX((cpi)->cpi_std[1].cp_eax, 3, 0) 1811 #define CPI_MODEL(cpi) BITX((cpi)->cpi_std[1].cp_eax, 7, 4) 1812 1813 #define CPI_FEATURES_EDX(cpi) ((cpi)->cpi_std[1].cp_edx) 1814 #define CPI_FEATURES_ECX(cpi) ((cpi)->cpi_std[1].cp_ecx) 1815 #define CPI_FEATURES_XTD_EDX(cpi) ((cpi)->cpi_extd[1].cp_edx) 1816 #define CPI_FEATURES_XTD_ECX(cpi) ((cpi)->cpi_extd[1].cp_ecx) 1817 #define CPI_FEATURES_7_0_EBX(cpi) ((cpi)->cpi_std[7].cp_ebx) 1818 #define CPI_FEATURES_7_0_ECX(cpi) ((cpi)->cpi_std[7].cp_ecx) 1819 #define CPI_FEATURES_7_0_EDX(cpi) ((cpi)->cpi_std[7].cp_edx) 1820 1821 #define CPI_BRANDID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 7, 0) 1822 #define CPI_CHUNKS(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 15, 7) 1823 #define CPI_CPU_COUNT(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 23, 16) 1824 #define CPI_APIC_ID(cpi) BITX((cpi)->cpi_std[1].cp_ebx, 31, 24) 1825 1826 #define CPI_MAXEAX_MAX 0x100 /* sanity control */ 1827 #define CPI_XMAXEAX_MAX 0x80000100 1828 #define CPI_FN4_ECX_MAX 0x20 /* sanity: max fn 4 levels */ 1829 #define CPI_FNB_ECX_MAX 0x20 /* sanity: max fn B levels */ 1830 1831 /* 1832 * Function 4 (Deterministic Cache Parameters) macros 1833 * Defined by Intel Application Note AP-485 1834 */ 1835 #define CPI_NUM_CORES(regs) BITX((regs)->cp_eax, 31, 26) 1836 #define CPI_NTHR_SHR_CACHE(regs) BITX((regs)->cp_eax, 25, 14) 1837 #define CPI_FULL_ASSOC_CACHE(regs) BITX((regs)->cp_eax, 9, 9) 1838 #define CPI_SELF_INIT_CACHE(regs) BITX((regs)->cp_eax, 8, 8) 1839 #define CPI_CACHE_LVL(regs) BITX((regs)->cp_eax, 7, 5) 1840 #define CPI_CACHE_TYPE(regs) BITX((regs)->cp_eax, 4, 0) 1841 #define CPI_CPU_LEVEL_TYPE(regs) BITX((regs)->cp_ecx, 15, 8) 1842 1843 #define CPI_CACHE_WAYS(regs) BITX((regs)->cp_ebx, 31, 22) 1844 #define CPI_CACHE_PARTS(regs) BITX((regs)->cp_ebx, 21, 12) 1845 #define CPI_CACHE_COH_LN_SZ(regs) BITX((regs)->cp_ebx, 11, 0) 1846 1847 #define CPI_CACHE_SETS(regs) BITX((regs)->cp_ecx, 31, 0) 1848 1849 #define CPI_PREFCH_STRIDE(regs) BITX((regs)->cp_edx, 9, 0) 1850 1851 1852 /* 1853 * A couple of shorthand macros to identify "later" P6-family chips 1854 * like the Pentium M and Core. First, the "older" P6-based stuff 1855 * (loosely defined as "pre-Pentium-4"): 1856 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon 1857 */ 1858 #define IS_LEGACY_P6(cpi) ( \ 1859 cpi->cpi_family == 6 && \ 1860 (cpi->cpi_model == 1 || \ 1861 cpi->cpi_model == 3 || \ 1862 cpi->cpi_model == 5 || \ 1863 cpi->cpi_model == 6 || \ 1864 cpi->cpi_model == 7 || \ 1865 cpi->cpi_model == 8 || \ 1866 cpi->cpi_model == 0xA || \ 1867 cpi->cpi_model == 0xB) \ 1868 ) 1869 1870 /* A "new F6" is everything with family 6 that's not the above */ 1871 #define IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi)) 1872 1873 /* Extended family/model support */ 1874 #define IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \ 1875 cpi->cpi_family >= 0xf) 1876 1877 /* 1878 * Info for monitor/mwait idle loop. 1879 * 1880 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's 1881 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November 1882 * 2006. 1883 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual 1884 * Documentation Updates" #33633, Rev 2.05, December 2006. 1885 */ 1886 #define MWAIT_SUPPORT (0x00000001) /* mwait supported */ 1887 #define MWAIT_EXTENSIONS (0x00000002) /* extenstion supported */ 1888 #define MWAIT_ECX_INT_ENABLE (0x00000004) /* ecx 1 extension supported */ 1889 #define MWAIT_SUPPORTED(cpi) ((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON) 1890 #define MWAIT_INT_ENABLE(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x2) 1891 #define MWAIT_EXTENSION(cpi) ((cpi)->cpi_std[5].cp_ecx & 0x1) 1892 #define MWAIT_SIZE_MIN(cpi) BITX((cpi)->cpi_std[5].cp_eax, 15, 0) 1893 #define MWAIT_SIZE_MAX(cpi) BITX((cpi)->cpi_std[5].cp_ebx, 15, 0) 1894 /* 1895 * Number of sub-cstates for a given c-state. 1896 */ 1897 #define MWAIT_NUM_SUBC_STATES(cpi, c_state) \ 1898 BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state) 1899 1900 /* 1901 * XSAVE leaf 0xD enumeration 1902 */ 1903 #define CPUID_LEAFD_2_YMM_OFFSET 576 1904 #define CPUID_LEAFD_2_YMM_SIZE 256 1905 1906 /* 1907 * Common extended leaf names to cut down on typos. 1908 */ 1909 #define CPUID_LEAF_EXT_0 0x80000000 1910 #define CPUID_LEAF_EXT_8 0x80000008 1911 #define CPUID_LEAF_EXT_1d 0x8000001d 1912 #define CPUID_LEAF_EXT_1e 0x8000001e 1913 1914 /* 1915 * Functions we consune from cpuid_subr.c; don't publish these in a header 1916 * file to try and keep people using the expected cpuid_* interfaces. 1917 */ 1918 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t); 1919 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t); 1920 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t); 1921 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t); 1922 extern uint_t _cpuid_vendorstr_to_vendorcode(char *); 1923 1924 /* 1925 * Apply up various platform-dependent restrictions where the 1926 * underlying platform restrictions mean the CPU can be marked 1927 * as less capable than its cpuid instruction would imply. 1928 */ 1929 #if defined(__xpv) 1930 static void 1931 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) 1932 { 1933 switch (eax) { 1934 case 1: { 1935 uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ? 1936 0 : CPUID_INTC_EDX_MCA; 1937 cp->cp_edx &= 1938 ~(mcamask | 1939 CPUID_INTC_EDX_PSE | 1940 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1941 CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR | 1942 CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT | 1943 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1944 CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT); 1945 break; 1946 } 1947 1948 case 0x80000001: 1949 cp->cp_edx &= 1950 ~(CPUID_AMD_EDX_PSE | 1951 CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE | 1952 CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE | 1953 CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 | 1954 CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP | 1955 CPUID_AMD_EDX_TSCP); 1956 cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY; 1957 break; 1958 default: 1959 break; 1960 } 1961 1962 switch (vendor) { 1963 case X86_VENDOR_Intel: 1964 switch (eax) { 1965 case 4: 1966 /* 1967 * Zero out the (ncores-per-chip - 1) field 1968 */ 1969 cp->cp_eax &= 0x03fffffff; 1970 break; 1971 default: 1972 break; 1973 } 1974 break; 1975 case X86_VENDOR_AMD: 1976 case X86_VENDOR_HYGON: 1977 switch (eax) { 1978 1979 case 0x80000001: 1980 cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D; 1981 break; 1982 1983 case CPUID_LEAF_EXT_8: 1984 /* 1985 * Zero out the (ncores-per-chip - 1) field 1986 */ 1987 cp->cp_ecx &= 0xffffff00; 1988 break; 1989 default: 1990 break; 1991 } 1992 break; 1993 default: 1994 break; 1995 } 1996 } 1997 #else 1998 #define platform_cpuid_mangle(vendor, eax, cp) /* nothing */ 1999 #endif 2000 2001 /* 2002 * Some undocumented ways of patching the results of the cpuid 2003 * instruction to permit running Solaris 10 on future cpus that 2004 * we don't currently support. Could be set to non-zero values 2005 * via settings in eeprom. 2006 */ 2007 2008 uint32_t cpuid_feature_ecx_include; 2009 uint32_t cpuid_feature_ecx_exclude; 2010 uint32_t cpuid_feature_edx_include; 2011 uint32_t cpuid_feature_edx_exclude; 2012 2013 /* 2014 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs. 2015 */ 2016 void 2017 cpuid_alloc_space(cpu_t *cpu) 2018 { 2019 /* 2020 * By convention, cpu0 is the boot cpu, which is set up 2021 * before memory allocation is available. All other cpus get 2022 * their cpuid_info struct allocated here. 2023 */ 2024 ASSERT(cpu->cpu_id != 0); 2025 ASSERT(cpu->cpu_m.mcpu_cpi == NULL); 2026 cpu->cpu_m.mcpu_cpi = 2027 kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP); 2028 } 2029 2030 void 2031 cpuid_free_space(cpu_t *cpu) 2032 { 2033 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2034 int i; 2035 2036 ASSERT(cpi != NULL); 2037 ASSERT(cpi != &cpuid_info0); 2038 2039 /* 2040 * Free up any cache leaf related dynamic storage. The first entry was 2041 * cached from the standard cpuid storage, so we should not free it. 2042 */ 2043 for (i = 1; i < cpi->cpi_cache_leaf_size; i++) 2044 kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs)); 2045 if (cpi->cpi_cache_leaf_size > 0) 2046 kmem_free(cpi->cpi_cache_leaves, 2047 cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *)); 2048 2049 kmem_free(cpi, sizeof (*cpi)); 2050 cpu->cpu_m.mcpu_cpi = NULL; 2051 } 2052 2053 #if !defined(__xpv) 2054 /* 2055 * Determine the type of the underlying platform. This is used to customize 2056 * initialization of various subsystems (e.g. TSC). determine_platform() must 2057 * only ever be called once to prevent two processors from seeing different 2058 * values of platform_type. Must be called before cpuid_pass_ident(), the 2059 * earliest consumer to execute; the identification pass will call 2060 * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv(). 2061 */ 2062 void 2063 determine_platform(void) 2064 { 2065 struct cpuid_regs cp; 2066 uint32_t base; 2067 uint32_t regs[4]; 2068 char *hvstr = (char *)regs; 2069 2070 ASSERT(platform_type == -1); 2071 2072 platform_type = HW_NATIVE; 2073 2074 if (!enable_platform_detection) 2075 return; 2076 2077 /* 2078 * If Hypervisor CPUID bit is set, try to determine hypervisor 2079 * vendor signature, and set platform type accordingly. 2080 * 2081 * References: 2082 * http://lkml.org/lkml/2008/10/1/246 2083 * http://kb.vmware.com/kb/1009458 2084 */ 2085 cp.cp_eax = 0x1; 2086 (void) __cpuid_insn(&cp); 2087 if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) { 2088 cp.cp_eax = 0x40000000; 2089 (void) __cpuid_insn(&cp); 2090 regs[0] = cp.cp_ebx; 2091 regs[1] = cp.cp_ecx; 2092 regs[2] = cp.cp_edx; 2093 regs[3] = 0; 2094 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) { 2095 platform_type = HW_XEN_HVM; 2096 return; 2097 } 2098 if (strcmp(hvstr, HVSIG_VMWARE) == 0) { 2099 platform_type = HW_VMWARE; 2100 return; 2101 } 2102 if (strcmp(hvstr, HVSIG_KVM) == 0) { 2103 platform_type = HW_KVM; 2104 return; 2105 } 2106 if (strcmp(hvstr, HVSIG_BHYVE) == 0) { 2107 platform_type = HW_BHYVE; 2108 return; 2109 } 2110 if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) 2111 platform_type = HW_MICROSOFT; 2112 } else { 2113 /* 2114 * Check older VMware hardware versions. VMware hypervisor is 2115 * detected by performing an IN operation to VMware hypervisor 2116 * port and checking that value returned in %ebx is VMware 2117 * hypervisor magic value. 2118 * 2119 * References: http://kb.vmware.com/kb/1009458 2120 */ 2121 vmware_port(VMWARE_HVCMD_GETVERSION, regs); 2122 if (regs[1] == VMWARE_HVMAGIC) { 2123 platform_type = HW_VMWARE; 2124 return; 2125 } 2126 } 2127 2128 /* 2129 * Check Xen hypervisor. In a fully virtualized domain, 2130 * Xen's pseudo-cpuid function returns a string representing the 2131 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum 2132 * supported cpuid function. We need at least a (base + 2) leaf value 2133 * to do what we want to do. Try different base values, since the 2134 * hypervisor might use a different one depending on whether Hyper-V 2135 * emulation is switched on by default or not. 2136 */ 2137 for (base = 0x40000000; base < 0x40010000; base += 0x100) { 2138 cp.cp_eax = base; 2139 (void) __cpuid_insn(&cp); 2140 regs[0] = cp.cp_ebx; 2141 regs[1] = cp.cp_ecx; 2142 regs[2] = cp.cp_edx; 2143 regs[3] = 0; 2144 if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 && 2145 cp.cp_eax >= (base + 2)) { 2146 platform_type &= ~HW_NATIVE; 2147 platform_type |= HW_XEN_HVM; 2148 return; 2149 } 2150 } 2151 } 2152 2153 int 2154 get_hwenv(void) 2155 { 2156 ASSERT(platform_type != -1); 2157 return (platform_type); 2158 } 2159 2160 int 2161 is_controldom(void) 2162 { 2163 return (0); 2164 } 2165 2166 #else 2167 2168 int 2169 get_hwenv(void) 2170 { 2171 return (HW_XEN_PV); 2172 } 2173 2174 int 2175 is_controldom(void) 2176 { 2177 return (DOMAIN_IS_INITDOMAIN(xen_info)); 2178 } 2179 2180 #endif /* __xpv */ 2181 2182 /* 2183 * Make sure that we have gathered all of the CPUID leaves that we might need to 2184 * determine topology. We assume that the standard leaf 1 has already been done 2185 * and that xmaxeax has already been calculated. 2186 */ 2187 static void 2188 cpuid_gather_amd_topology_leaves(cpu_t *cpu) 2189 { 2190 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2191 2192 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2193 struct cpuid_regs *cp; 2194 2195 cp = &cpi->cpi_extd[8]; 2196 cp->cp_eax = CPUID_LEAF_EXT_8; 2197 (void) __cpuid_insn(cp); 2198 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp); 2199 } 2200 2201 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2202 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2203 struct cpuid_regs *cp; 2204 2205 cp = &cpi->cpi_extd[0x1e]; 2206 cp->cp_eax = CPUID_LEAF_EXT_1e; 2207 (void) __cpuid_insn(cp); 2208 } 2209 } 2210 2211 /* 2212 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer 2213 * it to everything else. If not, and we're on an AMD system where 8000001e is 2214 * valid, then we use that. Othewrise, we fall back to the default value for the 2215 * APIC ID in leaf 1. 2216 */ 2217 static uint32_t 2218 cpuid_gather_apicid(struct cpuid_info *cpi) 2219 { 2220 /* 2221 * Leaf B changes based on the arguments to it. Beacuse we don't cache 2222 * it, we need to gather it again. 2223 */ 2224 if (cpi->cpi_maxeax >= 0xB) { 2225 struct cpuid_regs regs; 2226 struct cpuid_regs *cp; 2227 2228 cp = ®s; 2229 cp->cp_eax = 0xB; 2230 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2231 (void) __cpuid_insn(cp); 2232 2233 if (cp->cp_ebx != 0) { 2234 return (cp->cp_edx); 2235 } 2236 } 2237 2238 if ((cpi->cpi_vendor == X86_VENDOR_AMD || 2239 cpi->cpi_vendor == X86_VENDOR_HYGON) && 2240 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2241 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2242 return (cpi->cpi_extd[0x1e].cp_eax); 2243 } 2244 2245 return (CPI_APIC_ID(cpi)); 2246 } 2247 2248 /* 2249 * For AMD processors, attempt to calculate the number of chips and cores that 2250 * exist. The way that we do this varies based on the generation, because the 2251 * generations themselves have changed dramatically. 2252 * 2253 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores. 2254 * However, with the advent of family 17h (Zen) it actually tells us the number 2255 * of threads, so we need to look at leaf 0x8000001e if available to determine 2256 * its value. Otherwise, for all prior families, the number of enabled cores is 2257 * the same as threads. 2258 * 2259 * If we do not have leaf 0x80000008, then we assume that this processor does 2260 * not have anything. AMD's older CPUID specification says there's no reason to 2261 * fall back to leaf 1. 2262 * 2263 * In some virtualization cases we will not have leaf 8000001e or it will be 2264 * zero. When that happens we assume the number of threads is one. 2265 */ 2266 static void 2267 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2268 { 2269 uint_t nthreads, nthread_per_core; 2270 2271 nthreads = nthread_per_core = 1; 2272 2273 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2274 nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1; 2275 } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2276 nthreads = CPI_CPU_COUNT(cpi); 2277 } 2278 2279 /* 2280 * For us to have threads, and know about it, we have to be at least at 2281 * family 17h and have the cpuid bit that says we have extended 2282 * topology. 2283 */ 2284 if (cpi->cpi_family >= 0x17 && 2285 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2286 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2287 nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2288 } 2289 2290 *ncpus = nthreads; 2291 *ncores = nthreads / nthread_per_core; 2292 } 2293 2294 /* 2295 * Seed the initial values for the cores and threads for an Intel based 2296 * processor. These values will be overwritten if we detect that the processor 2297 * supports CPUID leaf 0xb. 2298 */ 2299 static void 2300 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) 2301 { 2302 /* 2303 * Only seed the number of physical cores from the first level leaf 4 2304 * information. The number of threads there indicate how many share the 2305 * L1 cache, which may or may not have anything to do with the number of 2306 * logical CPUs per core. 2307 */ 2308 if (cpi->cpi_maxeax >= 4) { 2309 *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1; 2310 } else { 2311 *ncores = 1; 2312 } 2313 2314 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 2315 *ncpus = CPI_CPU_COUNT(cpi); 2316 } else { 2317 *ncpus = *ncores; 2318 } 2319 } 2320 2321 static boolean_t 2322 cpuid_leafB_getids(cpu_t *cpu) 2323 { 2324 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2325 struct cpuid_regs regs; 2326 struct cpuid_regs *cp; 2327 2328 if (cpi->cpi_maxeax < 0xB) 2329 return (B_FALSE); 2330 2331 cp = ®s; 2332 cp->cp_eax = 0xB; 2333 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 2334 2335 (void) __cpuid_insn(cp); 2336 2337 /* 2338 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which 2339 * indicates that the extended topology enumeration leaf is 2340 * available. 2341 */ 2342 if (cp->cp_ebx != 0) { 2343 uint32_t x2apic_id = 0; 2344 uint_t coreid_shift = 0; 2345 uint_t ncpu_per_core = 1; 2346 uint_t chipid_shift = 0; 2347 uint_t ncpu_per_chip = 1; 2348 uint_t i; 2349 uint_t level; 2350 2351 for (i = 0; i < CPI_FNB_ECX_MAX; i++) { 2352 cp->cp_eax = 0xB; 2353 cp->cp_ecx = i; 2354 2355 (void) __cpuid_insn(cp); 2356 level = CPI_CPU_LEVEL_TYPE(cp); 2357 2358 if (level == 1) { 2359 x2apic_id = cp->cp_edx; 2360 coreid_shift = BITX(cp->cp_eax, 4, 0); 2361 ncpu_per_core = BITX(cp->cp_ebx, 15, 0); 2362 } else if (level == 2) { 2363 x2apic_id = cp->cp_edx; 2364 chipid_shift = BITX(cp->cp_eax, 4, 0); 2365 ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); 2366 } 2367 } 2368 2369 /* 2370 * cpi_apicid is taken care of in cpuid_gather_apicid. 2371 */ 2372 cpi->cpi_ncpu_per_chip = ncpu_per_chip; 2373 cpi->cpi_ncore_per_chip = ncpu_per_chip / 2374 ncpu_per_core; 2375 cpi->cpi_chipid = x2apic_id >> chipid_shift; 2376 cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); 2377 cpi->cpi_coreid = x2apic_id >> coreid_shift; 2378 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2379 cpi->cpi_procnodeid = cpi->cpi_chipid; 2380 cpi->cpi_compunitid = cpi->cpi_coreid; 2381 2382 if (coreid_shift > 0 && chipid_shift > coreid_shift) { 2383 cpi->cpi_nthread_bits = coreid_shift; 2384 cpi->cpi_ncore_bits = chipid_shift - coreid_shift; 2385 } 2386 2387 return (B_TRUE); 2388 } else { 2389 return (B_FALSE); 2390 } 2391 } 2392 2393 static void 2394 cpuid_intel_getids(cpu_t *cpu, void *feature) 2395 { 2396 uint_t i; 2397 uint_t chipid_shift = 0; 2398 uint_t coreid_shift = 0; 2399 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2400 2401 /* 2402 * There are no compute units or processor nodes currently on Intel. 2403 * Always set these to one. 2404 */ 2405 cpi->cpi_procnodes_per_pkg = 1; 2406 cpi->cpi_cores_per_compunit = 1; 2407 2408 /* 2409 * If cpuid Leaf B is present, use that to try and get this information. 2410 * It will be the most accurate for Intel CPUs. 2411 */ 2412 if (cpuid_leafB_getids(cpu)) 2413 return; 2414 2415 /* 2416 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip 2417 * and ncore_per_chip. These represent the largest power of two values 2418 * that we need to cover all of the IDs in the system. Therefore, we use 2419 * those values to seed the number of bits needed to cover information 2420 * in the case when leaf B is not available. These values will probably 2421 * be larger than required, but that's OK. 2422 */ 2423 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip); 2424 cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip); 2425 2426 for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1) 2427 chipid_shift++; 2428 2429 cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift; 2430 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1); 2431 2432 if (is_x86_feature(feature, X86FSET_CMP)) { 2433 /* 2434 * Multi-core (and possibly multi-threaded) 2435 * processors. 2436 */ 2437 uint_t ncpu_per_core = 0; 2438 2439 if (cpi->cpi_ncore_per_chip == 1) 2440 ncpu_per_core = cpi->cpi_ncpu_per_chip; 2441 else if (cpi->cpi_ncore_per_chip > 1) 2442 ncpu_per_core = cpi->cpi_ncpu_per_chip / 2443 cpi->cpi_ncore_per_chip; 2444 /* 2445 * 8bit APIC IDs on dual core Pentiums 2446 * look like this: 2447 * 2448 * +-----------------------+------+------+ 2449 * | Physical Package ID | MC | HT | 2450 * +-----------------------+------+------+ 2451 * <------- chipid --------> 2452 * <------- coreid ---------------> 2453 * <--- clogid --> 2454 * <------> 2455 * pkgcoreid 2456 * 2457 * Where the number of bits necessary to 2458 * represent MC and HT fields together equals 2459 * to the minimum number of bits necessary to 2460 * store the value of cpi->cpi_ncpu_per_chip. 2461 * Of those bits, the MC part uses the number 2462 * of bits necessary to store the value of 2463 * cpi->cpi_ncore_per_chip. 2464 */ 2465 for (i = 1; i < ncpu_per_core; i <<= 1) 2466 coreid_shift++; 2467 cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift; 2468 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; 2469 } else if (is_x86_feature(feature, X86FSET_HTT)) { 2470 /* 2471 * Single-core multi-threaded processors. 2472 */ 2473 cpi->cpi_coreid = cpi->cpi_chipid; 2474 cpi->cpi_pkgcoreid = 0; 2475 } else { 2476 /* 2477 * Single-core single-thread processors. 2478 */ 2479 cpi->cpi_coreid = cpu->cpu_id; 2480 cpi->cpi_pkgcoreid = 0; 2481 } 2482 cpi->cpi_procnodeid = cpi->cpi_chipid; 2483 cpi->cpi_compunitid = cpi->cpi_coreid; 2484 } 2485 2486 /* 2487 * Historically, AMD has had CMP chips with only a single thread per core. 2488 * However, starting in family 17h (Zen), this has changed and they now have 2489 * multiple threads. Our internal core id needs to be a unique value. 2490 * 2491 * To determine the core id of an AMD system, if we're from a family before 17h, 2492 * then we just use the cpu id, as that gives us a good value that will be 2493 * unique for each core. If instead, we're on family 17h or later, then we need 2494 * to do something more complicated. CPUID leaf 0x8000001e can tell us 2495 * how many threads are in the system. Based on that, we'll shift the APIC ID. 2496 * We can't use the normal core id in that leaf as it's only unique within the 2497 * socket, which is perfect for cpi_pkgcoreid, but not us. 2498 */ 2499 static id_t 2500 cpuid_amd_get_coreid(cpu_t *cpu) 2501 { 2502 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2503 2504 if (cpi->cpi_family >= 0x17 && 2505 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2506 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2507 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2508 if (nthreads > 1) { 2509 VERIFY3U(nthreads, ==, 2); 2510 return (cpi->cpi_apicid >> 1); 2511 } 2512 } 2513 2514 return (cpu->cpu_id); 2515 } 2516 2517 /* 2518 * IDs on AMD is a more challenging task. This is notable because of the 2519 * following two facts: 2520 * 2521 * 1. Before family 0x17 (Zen), there was no support for SMT and there was 2522 * also no way to get an actual unique core id from the system. As such, we 2523 * synthesize this case by using cpu->cpu_id. This scheme does not, 2524 * however, guarantee that sibling cores of a chip will have sequential 2525 * coreids starting at a multiple of the number of cores per chip - that is 2526 * usually the case, but if the APIC IDs have been set up in a different 2527 * order then we need to perform a few more gymnastics for the pkgcoreid. 2528 * 2529 * 2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups 2530 * called compute units. These compute units share the L1I cache, L2 cache, 2531 * and the FPU. To deal with this, a new topology leaf was added in 2532 * 0x8000001e. However, parts of this leaf have different meanings 2533 * once we get to family 0x17. 2534 */ 2535 2536 static void 2537 cpuid_amd_getids(cpu_t *cpu, uchar_t *features) 2538 { 2539 int i, first_half, coreidsz; 2540 uint32_t nb_caps_reg; 2541 uint_t node2_1; 2542 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2543 struct cpuid_regs *cp; 2544 2545 /* 2546 * Calculate the core id (this comes from hardware in family 0x17 if it 2547 * hasn't been stripped by virtualization). We always set the compute 2548 * unit id to the same value. Also, initialize the default number of 2549 * cores per compute unit and nodes per package. This will be 2550 * overwritten when we know information about a particular family. 2551 */ 2552 cpi->cpi_coreid = cpuid_amd_get_coreid(cpu); 2553 cpi->cpi_compunitid = cpi->cpi_coreid; 2554 cpi->cpi_cores_per_compunit = 1; 2555 cpi->cpi_procnodes_per_pkg = 1; 2556 2557 /* 2558 * To construct the logical ID, we need to determine how many APIC IDs 2559 * are dedicated to the cores and threads. This is provided for us in 2560 * 0x80000008. However, if it's not present (say due to virtualization), 2561 * then we assume it's one. This should be present on all 64-bit AMD 2562 * processors. It was added in family 0xf (Hammer). 2563 */ 2564 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2565 coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12); 2566 2567 /* 2568 * In AMD parlance chip is really a node while illumos 2569 * uses chip as equivalent to socket/package. 2570 */ 2571 if (coreidsz == 0) { 2572 /* Use legacy method */ 2573 for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1) 2574 coreidsz++; 2575 if (coreidsz == 0) 2576 coreidsz = 1; 2577 } 2578 } else { 2579 /* Assume single-core part */ 2580 coreidsz = 1; 2581 } 2582 cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1); 2583 2584 /* 2585 * The package core ID varies depending on the family. While it may be 2586 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately, 2587 * this value is the core id in the given node. For non-virtualized 2588 * family 17h, we need to take the logical core id and shift off the 2589 * threads like we do when getting the core id. Otherwise, we can use 2590 * the clogid as is. When family 17h is virtualized, the clogid should 2591 * be sufficient as if we don't have valid data in the leaf, then we 2592 * won't think we have SMT, in which case the cpi_clogid should be 2593 * sufficient. 2594 */ 2595 if (cpi->cpi_family >= 0x17 && 2596 is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2597 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e && 2598 cpi->cpi_extd[0x1e].cp_ebx != 0) { 2599 uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; 2600 if (nthreads > 1) { 2601 VERIFY3U(nthreads, ==, 2); 2602 cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1; 2603 } else { 2604 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2605 } 2606 } else { 2607 cpi->cpi_pkgcoreid = cpi->cpi_clogid; 2608 } 2609 2610 /* 2611 * Obtain the node ID and compute unit IDs. If we're on family 0x15 2612 * (bulldozer) or newer, then we can derive all of this from leaf 2613 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family. 2614 */ 2615 if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && 2616 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { 2617 cp = &cpi->cpi_extd[0x1e]; 2618 2619 cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1; 2620 cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0); 2621 2622 /* 2623 * For Bulldozer-era CPUs, recalculate the compute unit 2624 * information. 2625 */ 2626 if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) { 2627 cpi->cpi_cores_per_compunit = 2628 BITX(cp->cp_ebx, 15, 8) + 1; 2629 cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) + 2630 (cpi->cpi_ncore_per_chip / 2631 cpi->cpi_cores_per_compunit) * 2632 (cpi->cpi_procnodeid / 2633 cpi->cpi_procnodes_per_pkg); 2634 } 2635 } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) { 2636 cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7; 2637 } else if (cpi->cpi_family == 0x10) { 2638 /* 2639 * See if we are a multi-node processor. 2640 * All processors in the system have the same number of nodes 2641 */ 2642 nb_caps_reg = pci_getl_func(0, 24, 3, 0xe8); 2643 if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) { 2644 /* Single-node */ 2645 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5, 2646 coreidsz); 2647 } else { 2648 2649 /* 2650 * Multi-node revision D (2 nodes per package 2651 * are supported) 2652 */ 2653 cpi->cpi_procnodes_per_pkg = 2; 2654 2655 first_half = (cpi->cpi_pkgcoreid <= 2656 (cpi->cpi_ncore_per_chip/2 - 1)); 2657 2658 if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) { 2659 /* We are BSP */ 2660 cpi->cpi_procnodeid = (first_half ? 0 : 1); 2661 } else { 2662 2663 /* We are AP */ 2664 /* NodeId[2:1] bits to use for reading F3xe8 */ 2665 node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1; 2666 2667 nb_caps_reg = 2668 pci_getl_func(0, 24 + node2_1, 3, 0xe8); 2669 2670 /* 2671 * Check IntNodeNum bit (31:30, but bit 31 is 2672 * always 0 on dual-node processors) 2673 */ 2674 if (BITX(nb_caps_reg, 30, 30) == 0) 2675 cpi->cpi_procnodeid = node2_1 + 2676 !first_half; 2677 else 2678 cpi->cpi_procnodeid = node2_1 + 2679 first_half; 2680 } 2681 } 2682 } else { 2683 cpi->cpi_procnodeid = 0; 2684 } 2685 2686 cpi->cpi_chipid = 2687 cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg; 2688 2689 cpi->cpi_ncore_bits = coreidsz; 2690 cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip / 2691 cpi->cpi_ncore_per_chip); 2692 } 2693 2694 static void 2695 spec_uarch_flush_noop(void) 2696 { 2697 } 2698 2699 /* 2700 * When microcode is present that mitigates MDS, this wrmsr will also flush the 2701 * MDS-related micro-architectural state that would normally happen by calling 2702 * x86_md_clear(). 2703 */ 2704 static void 2705 spec_uarch_flush_msr(void) 2706 { 2707 wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); 2708 } 2709 2710 /* 2711 * This function points to a function that will flush certain 2712 * micro-architectural state on the processor. This flush is used to mitigate 2713 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This 2714 * function can point to one of three functions: 2715 * 2716 * - A noop which is done because we either are vulnerable, but do not have 2717 * microcode available to help deal with a fix, or because we aren't 2718 * vulnerable. 2719 * 2720 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to 2721 * mitigate MDS is present, also perform the equivalent of the MDS flush; 2722 * however, it only flushes the MDS related micro-architectural state on the 2723 * current hyperthread, it does not do anything for the twin. 2724 * 2725 * - x86_md_clear which will flush the MDS related state. This is done when we 2726 * have a processor that is vulnerable to MDS, but is not vulnerable to L1TF 2727 * (RDCL_NO is set). 2728 */ 2729 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop; 2730 2731 static void 2732 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset) 2733 { 2734 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2735 2736 /* 2737 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS) 2738 * has been fixed in hardware, it doesn't cover everything related to 2739 * MDS. Therefore we can only rely on MDS_NO to determine that we don't 2740 * need to mitigate this. 2741 */ 2742 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2743 is_x86_feature(featureset, X86FSET_MDS_NO)) { 2744 return; 2745 } 2746 2747 if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2748 const uint8_t nop = NOP_INSTR; 2749 uint8_t *md = (uint8_t *)x86_md_clear; 2750 2751 *md = nop; 2752 } 2753 2754 membar_producer(); 2755 } 2756 2757 static void 2758 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset) 2759 { 2760 boolean_t need_l1d, need_mds; 2761 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2762 2763 /* 2764 * If we're not on Intel or we've mitigated both RDCL and MDS in 2765 * hardware, then there's nothing left for us to do for enabling the 2766 * flush. We can also go ahead and say that SMT exclusion is 2767 * unnecessary. 2768 */ 2769 if (cpi->cpi_vendor != X86_VENDOR_Intel || 2770 (is_x86_feature(featureset, X86FSET_RDCL_NO) && 2771 is_x86_feature(featureset, X86FSET_MDS_NO))) { 2772 extern int smt_exclusion; 2773 smt_exclusion = 0; 2774 spec_uarch_flush = spec_uarch_flush_noop; 2775 membar_producer(); 2776 return; 2777 } 2778 2779 /* 2780 * The locations where we need to perform an L1D flush are required both 2781 * for mitigating L1TF and MDS. When verw support is present in 2782 * microcode, then the L1D flush will take care of doing that as well. 2783 * However, if we have a system where RDCL_NO is present, but we don't 2784 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full 2785 * L1D flush. 2786 */ 2787 if (!is_x86_feature(featureset, X86FSET_RDCL_NO) && 2788 is_x86_feature(featureset, X86FSET_FLUSH_CMD) && 2789 !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) { 2790 need_l1d = B_TRUE; 2791 } else { 2792 need_l1d = B_FALSE; 2793 } 2794 2795 if (!is_x86_feature(featureset, X86FSET_MDS_NO) && 2796 is_x86_feature(featureset, X86FSET_MD_CLEAR)) { 2797 need_mds = B_TRUE; 2798 } else { 2799 need_mds = B_FALSE; 2800 } 2801 2802 if (need_l1d) { 2803 spec_uarch_flush = spec_uarch_flush_msr; 2804 } else if (need_mds) { 2805 spec_uarch_flush = x86_md_clear; 2806 } else { 2807 /* 2808 * We have no hardware mitigations available to us. 2809 */ 2810 spec_uarch_flush = spec_uarch_flush_noop; 2811 } 2812 membar_producer(); 2813 } 2814 2815 /* 2816 * We default to enabling RSB mitigations. 2817 * 2818 * NOTE: We used to skip RSB mitigations with eIBRS, but developments around 2819 * post-barrier RSB guessing suggests we should enable RSB mitigations always 2820 * unless specifically instructed not to. 2821 */ 2822 static void 2823 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit) 2824 { 2825 const uint8_t ret = RET_INSTR; 2826 uint8_t *stuff = (uint8_t *)x86_rsb_stuff; 2827 2828 switch (mit) { 2829 case X86_SPECTREV2_DISABLED: 2830 *stuff = ret; 2831 break; 2832 default: 2833 break; 2834 } 2835 } 2836 2837 static void 2838 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit) 2839 { 2840 const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi", 2841 "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13", 2842 "_r14", "_r15" }; 2843 const uint_t nthunks = ARRAY_SIZE(thunks); 2844 const char *type; 2845 uint_t i; 2846 2847 if (mit == x86_spectrev2_mitigation) 2848 return; 2849 2850 switch (mit) { 2851 case X86_SPECTREV2_RETPOLINE: 2852 type = "gen"; 2853 break; 2854 case X86_SPECTREV2_ENHANCED_IBRS: 2855 case X86_SPECTREV2_DISABLED: 2856 type = "jmp"; 2857 break; 2858 default: 2859 panic("asked to updated retpoline state with unknown state!"); 2860 } 2861 2862 for (i = 0; i < nthunks; i++) { 2863 uintptr_t source, dest; 2864 int ssize, dsize; 2865 char sourcebuf[64], destbuf[64]; 2866 2867 (void) snprintf(destbuf, sizeof (destbuf), 2868 "__x86_indirect_thunk%s", thunks[i]); 2869 (void) snprintf(sourcebuf, sizeof (sourcebuf), 2870 "__x86_indirect_thunk_%s%s", type, thunks[i]); 2871 2872 source = kobj_getelfsym(sourcebuf, NULL, &ssize); 2873 dest = kobj_getelfsym(destbuf, NULL, &dsize); 2874 VERIFY3U(source, !=, 0); 2875 VERIFY3U(dest, !=, 0); 2876 VERIFY3S(dsize, >=, ssize); 2877 bcopy((void *)source, (void *)dest, ssize); 2878 } 2879 } 2880 2881 static void 2882 cpuid_enable_enhanced_ibrs(void) 2883 { 2884 uint64_t val; 2885 2886 val = rdmsr(MSR_IA32_SPEC_CTRL); 2887 val |= IA32_SPEC_CTRL_IBRS; 2888 wrmsr(MSR_IA32_SPEC_CTRL, val); 2889 } 2890 2891 /* 2892 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if 2893 * we can disable TSX, we do so. 2894 * 2895 * This determination is done only on the boot CPU, potentially after loading 2896 * updated microcode. 2897 */ 2898 static void 2899 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset) 2900 { 2901 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2902 2903 VERIFY(cpu->cpu_id == 0); 2904 2905 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 2906 x86_taa_mitigation = X86_TAA_HW_MITIGATED; 2907 return; 2908 } 2909 2910 if (x86_disable_taa) { 2911 x86_taa_mitigation = X86_TAA_DISABLED; 2912 return; 2913 } 2914 2915 /* 2916 * If we do not have the ability to disable TSX, then our only 2917 * mitigation options are in hardware (TAA_NO), or by using our existing 2918 * MDS mitigation as described above. The latter relies upon us having 2919 * configured MDS mitigations correctly! This includes disabling SMT if 2920 * we want to cross-CPU-thread protection. 2921 */ 2922 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) { 2923 /* 2924 * It's not clear whether any parts will enumerate TAA_NO 2925 * *without* TSX_CTRL, but let's mark it as such if we see this. 2926 */ 2927 if (is_x86_feature(featureset, X86FSET_TAA_NO)) { 2928 x86_taa_mitigation = X86_TAA_HW_MITIGATED; 2929 return; 2930 } 2931 2932 if (is_x86_feature(featureset, X86FSET_MD_CLEAR) && 2933 !is_x86_feature(featureset, X86FSET_MDS_NO)) { 2934 x86_taa_mitigation = X86_TAA_MD_CLEAR; 2935 } else { 2936 x86_taa_mitigation = X86_TAA_NOTHING; 2937 } 2938 return; 2939 } 2940 2941 /* 2942 * We have TSX_CTRL, but we can only fully disable TSX if we're early 2943 * enough in boot. 2944 * 2945 * Otherwise, we'll fall back to causing transactions to abort as our 2946 * mitigation. TSX-using code will always take the fallback path. 2947 */ 2948 if (cpi->cpi_pass < 4) { 2949 x86_taa_mitigation = X86_TAA_TSX_DISABLE; 2950 } else { 2951 x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT; 2952 } 2953 } 2954 2955 /* 2956 * As mentioned, we should only touch the MSR when we've got a suitable 2957 * microcode loaded on this CPU. 2958 */ 2959 static void 2960 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset) 2961 { 2962 uint64_t val; 2963 2964 switch (taa) { 2965 case X86_TAA_TSX_DISABLE: 2966 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) 2967 return; 2968 val = rdmsr(MSR_IA32_TSX_CTRL); 2969 val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE; 2970 wrmsr(MSR_IA32_TSX_CTRL, val); 2971 break; 2972 case X86_TAA_TSX_FORCE_ABORT: 2973 if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) 2974 return; 2975 val = rdmsr(MSR_IA32_TSX_CTRL); 2976 val |= IA32_TSX_CTRL_RTM_DISABLE; 2977 wrmsr(MSR_IA32_TSX_CTRL, val); 2978 break; 2979 case X86_TAA_HW_MITIGATED: 2980 case X86_TAA_MD_CLEAR: 2981 case X86_TAA_DISABLED: 2982 case X86_TAA_NOTHING: 2983 break; 2984 } 2985 } 2986 2987 static void 2988 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) 2989 { 2990 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 2991 x86_spectrev2_mitigation_t v2mit; 2992 2993 if ((cpi->cpi_vendor == X86_VENDOR_AMD || 2994 cpi->cpi_vendor == X86_VENDOR_HYGON) && 2995 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 2996 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB) 2997 add_x86_feature(featureset, X86FSET_IBPB); 2998 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS) 2999 add_x86_feature(featureset, X86FSET_IBRS); 3000 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP) 3001 add_x86_feature(featureset, X86FSET_STIBP); 3002 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL) 3003 add_x86_feature(featureset, X86FSET_STIBP_ALL); 3004 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD) 3005 add_x86_feature(featureset, X86FSET_SSBD); 3006 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD) 3007 add_x86_feature(featureset, X86FSET_SSBD_VIRT); 3008 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO) 3009 add_x86_feature(featureset, X86FSET_SSB_NO); 3010 /* 3011 * Don't enable enhanced IBRS unless we're told that we should 3012 * prefer it and it has the same semantics as Intel. This is 3013 * split into two bits rather than a single one. 3014 */ 3015 if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) && 3016 (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) { 3017 add_x86_feature(featureset, X86FSET_IBRS_ALL); 3018 } 3019 3020 } else if (cpi->cpi_vendor == X86_VENDOR_Intel && 3021 cpi->cpi_maxeax >= 7) { 3022 struct cpuid_regs *ecp; 3023 ecp = &cpi->cpi_std[7]; 3024 3025 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) { 3026 add_x86_feature(featureset, X86FSET_MD_CLEAR); 3027 } 3028 3029 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) { 3030 add_x86_feature(featureset, X86FSET_IBRS); 3031 add_x86_feature(featureset, X86FSET_IBPB); 3032 } 3033 3034 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) { 3035 add_x86_feature(featureset, X86FSET_STIBP); 3036 } 3037 3038 /* 3039 * Don't read the arch caps MSR on xpv where we lack the 3040 * on_trap(). 3041 */ 3042 #ifndef __xpv 3043 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) { 3044 on_trap_data_t otd; 3045 3046 /* 3047 * Be paranoid and assume we'll get a #GP. 3048 */ 3049 if (!on_trap(&otd, OT_DATA_ACCESS)) { 3050 uint64_t reg; 3051 3052 reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES); 3053 if (reg & IA32_ARCH_CAP_RDCL_NO) { 3054 add_x86_feature(featureset, 3055 X86FSET_RDCL_NO); 3056 } 3057 if (reg & IA32_ARCH_CAP_IBRS_ALL) { 3058 add_x86_feature(featureset, 3059 X86FSET_IBRS_ALL); 3060 } 3061 if (reg & IA32_ARCH_CAP_RSBA) { 3062 add_x86_feature(featureset, 3063 X86FSET_RSBA); 3064 } 3065 if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) { 3066 add_x86_feature(featureset, 3067 X86FSET_L1D_VM_NO); 3068 } 3069 if (reg & IA32_ARCH_CAP_SSB_NO) { 3070 add_x86_feature(featureset, 3071 X86FSET_SSB_NO); 3072 } 3073 if (reg & IA32_ARCH_CAP_MDS_NO) { 3074 add_x86_feature(featureset, 3075 X86FSET_MDS_NO); 3076 } 3077 if (reg & IA32_ARCH_CAP_TSX_CTRL) { 3078 add_x86_feature(featureset, 3079 X86FSET_TSX_CTRL); 3080 } 3081 if (reg & IA32_ARCH_CAP_TAA_NO) { 3082 add_x86_feature(featureset, 3083 X86FSET_TAA_NO); 3084 } 3085 } 3086 no_trap(); 3087 } 3088 #endif /* !__xpv */ 3089 3090 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD) 3091 add_x86_feature(featureset, X86FSET_SSBD); 3092 3093 if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD) 3094 add_x86_feature(featureset, X86FSET_FLUSH_CMD); 3095 } 3096 3097 /* 3098 * Take care of certain mitigations on the non-boot CPU. The boot CPU 3099 * will have already run this function and determined what we need to 3100 * do. This gives us a hook for per-HW thread mitigations such as 3101 * enhanced IBRS, or disabling TSX. 3102 */ 3103 if (cpu->cpu_id != 0) { 3104 if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) { 3105 cpuid_enable_enhanced_ibrs(); 3106 } 3107 3108 cpuid_apply_tsx(x86_taa_mitigation, featureset); 3109 return; 3110 } 3111 3112 /* 3113 * Go through and initialize various security mechanisms that we should 3114 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and 3115 * TAA. 3116 */ 3117 3118 /* 3119 * By default we've come in with retpolines enabled. Check whether we 3120 * should disable them or enable enhanced IBRS. RSB stuffing is enabled 3121 * by default, but disabled if we are using enhanced IBRS. Note, we do 3122 * not allow the use of AMD optimized retpolines as it was disclosed by 3123 * AMD in March 2022 that they were still vulnerable. Prior to that 3124 * point, we used them. 3125 */ 3126 if (x86_disable_spectrev2 != 0) { 3127 v2mit = X86_SPECTREV2_DISABLED; 3128 } else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) { 3129 cpuid_enable_enhanced_ibrs(); 3130 v2mit = X86_SPECTREV2_ENHANCED_IBRS; 3131 } else { 3132 v2mit = X86_SPECTREV2_RETPOLINE; 3133 } 3134 3135 cpuid_patch_retpolines(v2mit); 3136 cpuid_patch_rsb(v2mit); 3137 x86_spectrev2_mitigation = v2mit; 3138 membar_producer(); 3139 3140 /* 3141 * We need to determine what changes are required for mitigating L1TF 3142 * and MDS. If the CPU suffers from either of them, then SMT exclusion 3143 * is required. 3144 * 3145 * If any of these are present, then we need to flush u-arch state at 3146 * various points. For MDS, we need to do so whenever we change to a 3147 * lesser privilege level or we are halting the CPU. For L1TF we need to 3148 * flush the L1D cache at VM entry. When we have microcode that handles 3149 * MDS, the L1D flush also clears the other u-arch state that the 3150 * md_clear does. 3151 */ 3152 3153 /* 3154 * Update whether or not we need to be taking explicit action against 3155 * MDS. 3156 */ 3157 cpuid_update_md_clear(cpu, featureset); 3158 3159 /* 3160 * Determine whether SMT exclusion is required and whether or not we 3161 * need to perform an l1d flush. 3162 */ 3163 cpuid_update_l1d_flush(cpu, featureset); 3164 3165 /* 3166 * Determine what our mitigation strategy should be for TAA and then 3167 * also apply TAA mitigations. 3168 */ 3169 cpuid_update_tsx(cpu, featureset); 3170 cpuid_apply_tsx(x86_taa_mitigation, featureset); 3171 } 3172 3173 /* 3174 * Setup XFeature_Enabled_Mask register. Required by xsave feature. 3175 */ 3176 void 3177 setup_xfem(void) 3178 { 3179 uint64_t flags = XFEATURE_LEGACY_FP; 3180 3181 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); 3182 3183 if (is_x86_feature(x86_featureset, X86FSET_SSE)) 3184 flags |= XFEATURE_SSE; 3185 3186 if (is_x86_feature(x86_featureset, X86FSET_AVX)) 3187 flags |= XFEATURE_AVX; 3188 3189 if (is_x86_feature(x86_featureset, X86FSET_AVX512F)) 3190 flags |= XFEATURE_AVX512; 3191 3192 set_xcr(XFEATURE_ENABLED_MASK, flags); 3193 3194 xsave_bv_all = flags; 3195 } 3196 3197 static void 3198 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset) 3199 { 3200 struct cpuid_info *cpi; 3201 3202 cpi = cpu->cpu_m.mcpu_cpi; 3203 3204 if (cpi->cpi_vendor == X86_VENDOR_AMD || 3205 cpi->cpi_vendor == X86_VENDOR_HYGON) { 3206 cpuid_gather_amd_topology_leaves(cpu); 3207 } 3208 3209 cpi->cpi_apicid = cpuid_gather_apicid(cpi); 3210 3211 /* 3212 * Before we can calculate the IDs that we should assign to this 3213 * processor, we need to understand how many cores and threads it has. 3214 */ 3215 switch (cpi->cpi_vendor) { 3216 case X86_VENDOR_Intel: 3217 cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip, 3218 &cpi->cpi_ncore_per_chip); 3219 break; 3220 case X86_VENDOR_AMD: 3221 case X86_VENDOR_HYGON: 3222 cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip, 3223 &cpi->cpi_ncore_per_chip); 3224 break; 3225 default: 3226 /* 3227 * If we have some other x86 compatible chip, it's not clear how 3228 * they would behave. The most common case is virtualization 3229 * today, though there are also 64-bit VIA chips. Assume that 3230 * all we can get is the basic Leaf 1 HTT information. 3231 */ 3232 if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { 3233 cpi->cpi_ncore_per_chip = 1; 3234 cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); 3235 } 3236 break; 3237 } 3238 3239 /* 3240 * Based on the calculated number of threads and cores, potentially 3241 * assign the HTT and CMT features. 3242 */ 3243 if (cpi->cpi_ncore_per_chip > 1) { 3244 add_x86_feature(featureset, X86FSET_CMP); 3245 } 3246 3247 if (cpi->cpi_ncpu_per_chip > 1 && 3248 cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) { 3249 add_x86_feature(featureset, X86FSET_HTT); 3250 } 3251 3252 /* 3253 * Now that has been set up, we need to go through and calculate all of 3254 * the rest of the parameters that exist. If we think the CPU doesn't 3255 * have either SMT (HTT) or CMP, then we basically go through and fake 3256 * up information in some way. The most likely case for this is 3257 * virtualization where we have a lot of partial topology information. 3258 */ 3259 if (!is_x86_feature(featureset, X86FSET_HTT) && 3260 !is_x86_feature(featureset, X86FSET_CMP)) { 3261 /* 3262 * This is a single core, single-threaded processor. 3263 */ 3264 cpi->cpi_procnodes_per_pkg = 1; 3265 cpi->cpi_cores_per_compunit = 1; 3266 cpi->cpi_compunitid = 0; 3267 cpi->cpi_chipid = -1; 3268 cpi->cpi_clogid = 0; 3269 cpi->cpi_coreid = cpu->cpu_id; 3270 cpi->cpi_pkgcoreid = 0; 3271 if (cpi->cpi_vendor == X86_VENDOR_AMD || 3272 cpi->cpi_vendor == X86_VENDOR_HYGON) { 3273 cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); 3274 } else { 3275 cpi->cpi_procnodeid = cpi->cpi_chipid; 3276 } 3277 } else { 3278 switch (cpi->cpi_vendor) { 3279 case X86_VENDOR_Intel: 3280 cpuid_intel_getids(cpu, featureset); 3281 break; 3282 case X86_VENDOR_AMD: 3283 case X86_VENDOR_HYGON: 3284 cpuid_amd_getids(cpu, featureset); 3285 break; 3286 default: 3287 /* 3288 * In this case, it's hard to say what we should do. 3289 * We're going to model them to the OS as single core 3290 * threads. We don't have a good identifier for them, so 3291 * we're just going to use the cpu id all on a single 3292 * chip. 3293 * 3294 * This case has historically been different from the 3295 * case above where we don't have HTT or CMP. While they 3296 * could be combined, we've opted to keep it separate to 3297 * minimize the risk of topology changes in weird cases. 3298 */ 3299 cpi->cpi_procnodes_per_pkg = 1; 3300 cpi->cpi_cores_per_compunit = 1; 3301 cpi->cpi_chipid = 0; 3302 cpi->cpi_coreid = cpu->cpu_id; 3303 cpi->cpi_clogid = cpu->cpu_id; 3304 cpi->cpi_pkgcoreid = cpu->cpu_id; 3305 cpi->cpi_procnodeid = cpi->cpi_chipid; 3306 cpi->cpi_compunitid = cpi->cpi_coreid; 3307 break; 3308 } 3309 } 3310 } 3311 3312 /* 3313 * Gather relevant CPU features from leaf 6 which covers thermal information. We 3314 * always gather leaf 6 if it's supported; however, we only look for features on 3315 * Intel systems as AMD does not currently define any of the features we look 3316 * for below. 3317 */ 3318 static void 3319 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset) 3320 { 3321 struct cpuid_regs *cp; 3322 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 3323 3324 if (cpi->cpi_maxeax < 6) { 3325 return; 3326 } 3327 3328 cp = &cpi->cpi_std[6]; 3329 cp->cp_eax = 6; 3330 cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0; 3331 (void) __cpuid_insn(cp); 3332 platform_cpuid_mangle(cpi->cpi_vendor, 6, cp); 3333 3334 if (cpi->cpi_vendor != X86_VENDOR_Intel) { 3335 return; 3336 } 3337 3338 if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) { 3339 add_x86_feature(featureset, X86FSET_CORE_THERMAL); 3340 } 3341 3342 if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) { 3343 add_x86_feature(featureset, X86FSET_PKG_THERMAL); 3344 } 3345 } 3346 3347 /* 3348 * PPIN is the protected processor inventory number. On AMD this is an actual 3349 * feature bit. However, on Intel systems we need to read the platform 3350 * information MSR if we're on a specific model. 3351 */ 3352 #if !defined(__xpv) 3353 static void 3354 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset) 3355 { 3356 on_trap_data_t otd; 3357 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 3358 3359 switch (cpi->cpi_vendor) { 3360 case X86_VENDOR_AMD: 3361 /* 3362 * This leaf will have already been gathered in the topology 3363 * functions. 3364 */ 3365 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { 3366 if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) { 3367 add_x86_feature(featureset, X86FSET_PPIN); 3368 } 3369 } 3370 break; 3371 case X86_VENDOR_Intel: 3372 if (cpi->cpi_family != 6) 3373 break; 3374 switch (cpi->cpi_model) { 3375 case INTC_MODEL_IVYBRIDGE_XEON: 3376 case INTC_MODEL_HASWELL_XEON: 3377 case INTC_MODEL_BROADWELL_XEON: 3378 case INTC_MODEL_BROADWELL_XEON_D: 3379 case INTC_MODEL_SKYLAKE_XEON: 3380 case INTC_MODEL_ICELAKE_XEON: 3381 if (!on_trap(&otd, OT_DATA_ACCESS)) { 3382 uint64_t value; 3383 3384 value = rdmsr(MSR_PLATFORM_INFO); 3385 if ((value & MSR_PLATFORM_INFO_PPIN) != 0) { 3386 add_x86_feature(featureset, 3387 X86FSET_PPIN); 3388 } 3389 } 3390 no_trap(); 3391 break; 3392 default: 3393 break; 3394 } 3395 break; 3396 default: 3397 break; 3398 } 3399 } 3400 #endif /* ! __xpv */ 3401 3402 static void 3403 cpuid_pass_prelude(cpu_t *cpu, void *arg) 3404 { 3405 uchar_t *featureset = (uchar_t *)arg; 3406 3407 /* 3408 * We don't run on any processor that doesn't have cpuid, and could not 3409 * possibly have arrived here. 3410 */ 3411 add_x86_feature(featureset, X86FSET_CPUID); 3412 } 3413 3414 static void 3415 cpuid_pass_ident(cpu_t *cpu, void *arg __unused) 3416 { 3417 struct cpuid_info *cpi; 3418 struct cpuid_regs *cp; 3419 3420 /* 3421 * We require that virtual/native detection be complete and that PCI 3422 * config space access has been set up; at present there is no reliable 3423 * way to determine the latter. 3424 */ 3425 ASSERT3S(platform_type, !=, -1); 3426 3427 cpi = cpu->cpu_m.mcpu_cpi; 3428 ASSERT(cpi != NULL); 3429 3430 cp = &cpi->cpi_std[0]; 3431 cp->cp_eax = 0; 3432 cpi->cpi_maxeax = __cpuid_insn(cp); 3433 { 3434 uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr; 3435 *iptr++ = cp->cp_ebx; 3436 *iptr++ = cp->cp_edx; 3437 *iptr++ = cp->cp_ecx; 3438 *(char *)&cpi->cpi_vendorstr[12] = '\0'; 3439 } 3440 3441 cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr); 3442 x86_vendor = cpi->cpi_vendor; /* for compatibility */ 3443 3444 /* 3445 * Limit the range in case of weird hardware 3446 */ 3447 if (cpi->cpi_maxeax > CPI_MAXEAX_MAX) 3448 cpi->cpi_maxeax = CPI_MAXEAX_MAX; 3449 if (cpi->cpi_maxeax < 1) 3450 return; 3451 3452 cp = &cpi->cpi_std[1]; 3453 cp->cp_eax = 1; 3454 (void) __cpuid_insn(cp); 3455 3456 /* 3457 * Extract identifying constants for easy access. 3458 */ 3459 cpi->cpi_model = CPI_MODEL(cpi); 3460 cpi->cpi_family = CPI_FAMILY(cpi); 3461 3462 if (cpi->cpi_family == 0xf) 3463 cpi->cpi_family += CPI_FAMILY_XTD(cpi); 3464 3465 /* 3466 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf. 3467 * Intel, and presumably everyone else, uses model == 0xf, as 3468 * one would expect (max value means possible overflow). Sigh. 3469 */ 3470 3471 switch (cpi->cpi_vendor) { 3472 case X86_VENDOR_Intel: 3473 if (IS_EXTENDED_MODEL_INTEL(cpi)) 3474 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3475 break; 3476 case X86_VENDOR_AMD: 3477 if (CPI_FAMILY(cpi) == 0xf) 3478 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3479 break; 3480 case X86_VENDOR_HYGON: 3481 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3482 break; 3483 default: 3484 if (cpi->cpi_model == 0xf) 3485 cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4; 3486 break; 3487 } 3488 3489 cpi->cpi_step = CPI_STEP(cpi); 3490 cpi->cpi_brandid = CPI_BRANDID(cpi); 3491 3492 /* 3493 * Synthesize chip "revision" and socket type 3494 */ 3495 cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family, 3496 cpi->cpi_model, cpi->cpi_step); 3497 cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor, 3498 cpi->cpi_family, cpi->cpi_model, cpi->cpi_step); 3499 cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family, 3500 cpi->cpi_model, cpi->cpi_step); 3501 } 3502 3503 static void 3504 cpuid_pass_basic(cpu_t *cpu, void *arg) 3505 { 3506 uchar_t *featureset = (uchar_t *)arg; 3507 uint32_t mask_ecx, mask_edx; 3508 struct cpuid_info *cpi; 3509 struct cpuid_regs *cp; 3510 int xcpuid; 3511 #if !defined(__xpv) 3512 extern int idle_cpu_prefer_mwait; 3513 #endif 3514 3515 cpi = cpu->cpu_m.mcpu_cpi; 3516 ASSERT(cpi != NULL); 3517 3518 if (cpi->cpi_maxeax < 1) 3519 return; 3520 3521 /* 3522 * This was filled during the identification pass. 3523 */ 3524 cp = &cpi->cpi_std[1]; 3525 3526 /* 3527 * *default* assumptions: 3528 * - believe %edx feature word 3529 * - ignore %ecx feature word 3530 * - 32-bit virtual and physical addressing 3531 */ 3532 mask_edx = 0xffffffff; 3533 mask_ecx = 0; 3534 3535 cpi->cpi_pabits = cpi->cpi_vabits = 32; 3536 3537 switch (cpi->cpi_vendor) { 3538 case X86_VENDOR_Intel: 3539 if (cpi->cpi_family == 5) 3540 x86_type = X86_TYPE_P5; 3541 else if (IS_LEGACY_P6(cpi)) { 3542 x86_type = X86_TYPE_P6; 3543 pentiumpro_bug4046376 = 1; 3544 /* 3545 * Clear the SEP bit when it was set erroneously 3546 */ 3547 if (cpi->cpi_model < 3 && cpi->cpi_step < 3) 3548 cp->cp_edx &= ~CPUID_INTC_EDX_SEP; 3549 } else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) { 3550 x86_type = X86_TYPE_P4; 3551 /* 3552 * We don't currently depend on any of the %ecx 3553 * features until Prescott, so we'll only check 3554 * this from P4 onwards. We might want to revisit 3555 * that idea later. 3556 */ 3557 mask_ecx = 0xffffffff; 3558 } else if (cpi->cpi_family > 0xf) 3559 mask_ecx = 0xffffffff; 3560 /* 3561 * We don't support MONITOR/MWAIT if leaf 5 is not available 3562 * to obtain the monitor linesize. 3563 */ 3564 if (cpi->cpi_maxeax < 5) 3565 mask_ecx &= ~CPUID_INTC_ECX_MON; 3566 break; 3567 case X86_VENDOR_IntelClone: 3568 default: 3569 break; 3570 case X86_VENDOR_AMD: 3571 #if defined(OPTERON_ERRATUM_108) 3572 if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) { 3573 cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0; 3574 cpi->cpi_model = 0xc; 3575 } else 3576 #endif 3577 if (cpi->cpi_family == 5) { 3578 /* 3579 * AMD K5 and K6 3580 * 3581 * These CPUs have an incomplete implementation 3582 * of MCA/MCE which we mask away. 3583 */ 3584 mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA); 3585 3586 /* 3587 * Model 0 uses the wrong (APIC) bit 3588 * to indicate PGE. Fix it here. 3589 */ 3590 if (cpi->cpi_model == 0) { 3591 if (cp->cp_edx & 0x200) { 3592 cp->cp_edx &= ~0x200; 3593 cp->cp_edx |= CPUID_INTC_EDX_PGE; 3594 } 3595 } 3596 3597 /* 3598 * Early models had problems w/ MMX; disable. 3599 */ 3600 if (cpi->cpi_model < 6) 3601 mask_edx &= ~CPUID_INTC_EDX_MMX; 3602 } 3603 3604 /* 3605 * For newer families, SSE3 and CX16, at least, are valid; 3606 * enable all 3607 */ 3608 if (cpi->cpi_family >= 0xf) 3609 mask_ecx = 0xffffffff; 3610 /* 3611 * We don't support MONITOR/MWAIT if leaf 5 is not available 3612 * to obtain the monitor linesize. 3613 */ 3614 if (cpi->cpi_maxeax < 5) 3615 mask_ecx &= ~CPUID_INTC_ECX_MON; 3616 3617 #if !defined(__xpv) 3618 /* 3619 * AMD has not historically used MWAIT in the CPU's idle loop. 3620 * Pre-family-10h Opterons do not have the MWAIT instruction. We 3621 * know for certain that in at least family 17h, per AMD, mwait 3622 * is preferred. Families in-between are less certain. 3623 */ 3624 if (cpi->cpi_family < 0x17) { 3625 idle_cpu_prefer_mwait = 0; 3626 } 3627 #endif 3628 3629 break; 3630 case X86_VENDOR_HYGON: 3631 /* Enable all for Hygon Dhyana CPU */ 3632 mask_ecx = 0xffffffff; 3633 break; 3634 case X86_VENDOR_TM: 3635 /* 3636 * workaround the NT workaround in CMS 4.1 3637 */ 3638 if (cpi->cpi_family == 5 && cpi->cpi_model == 4 && 3639 (cpi->cpi_step == 2 || cpi->cpi_step == 3)) 3640 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3641 break; 3642 case X86_VENDOR_Centaur: 3643 /* 3644 * workaround the NT workarounds again 3645 */ 3646 if (cpi->cpi_family == 6) 3647 cp->cp_edx |= CPUID_INTC_EDX_CX8; 3648 break; 3649 case X86_VENDOR_Cyrix: 3650 /* 3651 * We rely heavily on the probing in locore 3652 * to actually figure out what parts, if any, 3653 * of the Cyrix cpuid instruction to believe. 3654 */ 3655 switch (x86_type) { 3656 case X86_TYPE_CYRIX_486: 3657 mask_edx = 0; 3658 break; 3659 case X86_TYPE_CYRIX_6x86: 3660 mask_edx = 0; 3661 break; 3662 case X86_TYPE_CYRIX_6x86L: 3663 mask_edx = 3664 CPUID_INTC_EDX_DE | 3665 CPUID_INTC_EDX_CX8; 3666 break; 3667 case X86_TYPE_CYRIX_6x86MX: 3668 mask_edx = 3669 CPUID_INTC_EDX_DE | 3670 CPUID_INTC_EDX_MSR | 3671 CPUID_INTC_EDX_CX8 | 3672 CPUID_INTC_EDX_PGE | 3673 CPUID_INTC_EDX_CMOV | 3674 CPUID_INTC_EDX_MMX; 3675 break; 3676 case X86_TYPE_CYRIX_GXm: 3677 mask_edx = 3678 CPUID_INTC_EDX_MSR | 3679 CPUID_INTC_EDX_CX8 | 3680 CPUID_INTC_EDX_CMOV | 3681 CPUID_INTC_EDX_MMX; 3682 break; 3683 case X86_TYPE_CYRIX_MediaGX: 3684 break; 3685 case X86_TYPE_CYRIX_MII: 3686 case X86_TYPE_VIA_CYRIX_III: 3687 mask_edx = 3688 CPUID_INTC_EDX_DE | 3689 CPUID_INTC_EDX_TSC | 3690 CPUID_INTC_EDX_MSR | 3691 CPUID_INTC_EDX_CX8 | 3692 CPUID_INTC_EDX_PGE | 3693 CPUID_INTC_EDX_CMOV | 3694 CPUID_INTC_EDX_MMX; 3695 break; 3696 default: 3697 break; 3698 } 3699 break; 3700 } 3701 3702 #if defined(__xpv) 3703 /* 3704 * Do not support MONITOR/MWAIT under a hypervisor 3705 */ 3706 mask_ecx &= ~CPUID_INTC_ECX_MON; 3707 /* 3708 * Do not support XSAVE under a hypervisor for now 3709 */ 3710 xsave_force_disable = B_TRUE; 3711 3712 #endif /* __xpv */ 3713 3714 if (xsave_force_disable) { 3715 mask_ecx &= ~CPUID_INTC_ECX_XSAVE; 3716 mask_ecx &= ~CPUID_INTC_ECX_AVX; 3717 mask_ecx &= ~CPUID_INTC_ECX_F16C; 3718 mask_ecx &= ~CPUID_INTC_ECX_FMA; 3719 } 3720 3721 /* 3722 * Now we've figured out the masks that determine 3723 * which bits we choose to believe, apply the masks 3724 * to the feature words, then map the kernel's view 3725 * of these feature words into its feature word. 3726 */ 3727 cp->cp_edx &= mask_edx; 3728 cp->cp_ecx &= mask_ecx; 3729 3730 /* 3731 * apply any platform restrictions (we don't call this 3732 * immediately after __cpuid_insn here, because we need the 3733 * workarounds applied above first) 3734 */ 3735 platform_cpuid_mangle(cpi->cpi_vendor, 1, cp); 3736 3737 /* 3738 * In addition to ecx and edx, Intel and AMD are storing a bunch of 3739 * instruction set extensions in leaf 7's ebx, ecx, and edx. 3740 */ 3741 if (cpi->cpi_maxeax >= 7) { 3742 struct cpuid_regs *ecp; 3743 ecp = &cpi->cpi_std[7]; 3744 ecp->cp_eax = 7; 3745 ecp->cp_ecx = 0; 3746 (void) __cpuid_insn(ecp); 3747 3748 /* 3749 * If XSAVE has been disabled, just ignore all of the 3750 * extended-save-area dependent flags here. 3751 */ 3752 if (xsave_force_disable) { 3753 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 3754 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 3755 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 3756 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX; 3757 ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512; 3758 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512; 3759 ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512; 3760 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES; 3761 ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ; 3762 } 3763 3764 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) 3765 add_x86_feature(featureset, X86FSET_SMEP); 3766 3767 /* 3768 * We check disable_smap here in addition to in startup_smap() 3769 * to ensure CPUs that aren't the boot CPU don't accidentally 3770 * include it in the feature set and thus generate a mismatched 3771 * x86 feature set across CPUs. 3772 */ 3773 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP && 3774 disable_smap == 0) 3775 add_x86_feature(featureset, X86FSET_SMAP); 3776 3777 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED) 3778 add_x86_feature(featureset, X86FSET_RDSEED); 3779 3780 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX) 3781 add_x86_feature(featureset, X86FSET_ADX); 3782 3783 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 3784 add_x86_feature(featureset, X86FSET_FSGSBASE); 3785 3786 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 3787 add_x86_feature(featureset, X86FSET_CLFLUSHOPT); 3788 3789 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) 3790 add_x86_feature(featureset, X86FSET_INVPCID); 3791 3792 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP) 3793 add_x86_feature(featureset, X86FSET_UMIP); 3794 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU) 3795 add_x86_feature(featureset, X86FSET_PKU); 3796 if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE) 3797 add_x86_feature(featureset, X86FSET_OSPKE); 3798 3799 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 3800 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX) 3801 add_x86_feature(featureset, X86FSET_MPX); 3802 3803 if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB) 3804 add_x86_feature(featureset, X86FSET_CLWB); 3805 } 3806 } 3807 3808 /* 3809 * fold in overrides from the "eeprom" mechanism 3810 */ 3811 cp->cp_edx |= cpuid_feature_edx_include; 3812 cp->cp_edx &= ~cpuid_feature_edx_exclude; 3813 3814 cp->cp_ecx |= cpuid_feature_ecx_include; 3815 cp->cp_ecx &= ~cpuid_feature_ecx_exclude; 3816 3817 if (cp->cp_edx & CPUID_INTC_EDX_PSE) { 3818 add_x86_feature(featureset, X86FSET_LARGEPAGE); 3819 } 3820 if (cp->cp_edx & CPUID_INTC_EDX_TSC) { 3821 add_x86_feature(featureset, X86FSET_TSC); 3822 } 3823 if (cp->cp_edx & CPUID_INTC_EDX_MSR) { 3824 add_x86_feature(featureset, X86FSET_MSR); 3825 } 3826 if (cp->cp_edx & CPUID_INTC_EDX_MTRR) { 3827 add_x86_feature(featureset, X86FSET_MTRR); 3828 } 3829 if (cp->cp_edx & CPUID_INTC_EDX_PGE) { 3830 add_x86_feature(featureset, X86FSET_PGE); 3831 } 3832 if (cp->cp_edx & CPUID_INTC_EDX_CMOV) { 3833 add_x86_feature(featureset, X86FSET_CMOV); 3834 } 3835 if (cp->cp_edx & CPUID_INTC_EDX_MMX) { 3836 add_x86_feature(featureset, X86FSET_MMX); 3837 } 3838 if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 && 3839 (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) { 3840 add_x86_feature(featureset, X86FSET_MCA); 3841 } 3842 if (cp->cp_edx & CPUID_INTC_EDX_PAE) { 3843 add_x86_feature(featureset, X86FSET_PAE); 3844 } 3845 if (cp->cp_edx & CPUID_INTC_EDX_CX8) { 3846 add_x86_feature(featureset, X86FSET_CX8); 3847 } 3848 if (cp->cp_ecx & CPUID_INTC_ECX_CX16) { 3849 add_x86_feature(featureset, X86FSET_CX16); 3850 } 3851 if (cp->cp_edx & CPUID_INTC_EDX_PAT) { 3852 add_x86_feature(featureset, X86FSET_PAT); 3853 } 3854 if (cp->cp_edx & CPUID_INTC_EDX_SEP) { 3855 add_x86_feature(featureset, X86FSET_SEP); 3856 } 3857 if (cp->cp_edx & CPUID_INTC_EDX_FXSR) { 3858 /* 3859 * In our implementation, fxsave/fxrstor 3860 * are prerequisites before we'll even 3861 * try and do SSE things. 3862 */ 3863 if (cp->cp_edx & CPUID_INTC_EDX_SSE) { 3864 add_x86_feature(featureset, X86FSET_SSE); 3865 } 3866 if (cp->cp_edx & CPUID_INTC_EDX_SSE2) { 3867 add_x86_feature(featureset, X86FSET_SSE2); 3868 } 3869 if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) { 3870 add_x86_feature(featureset, X86FSET_SSE3); 3871 } 3872 if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) { 3873 add_x86_feature(featureset, X86FSET_SSSE3); 3874 } 3875 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) { 3876 add_x86_feature(featureset, X86FSET_SSE4_1); 3877 } 3878 if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) { 3879 add_x86_feature(featureset, X86FSET_SSE4_2); 3880 } 3881 if (cp->cp_ecx & CPUID_INTC_ECX_AES) { 3882 add_x86_feature(featureset, X86FSET_AES); 3883 } 3884 if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) { 3885 add_x86_feature(featureset, X86FSET_PCLMULQDQ); 3886 } 3887 3888 if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA) 3889 add_x86_feature(featureset, X86FSET_SHA); 3890 3891 if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) { 3892 add_x86_feature(featureset, X86FSET_XSAVE); 3893 3894 /* We only test AVX & AVX512 when there is XSAVE */ 3895 3896 if (cp->cp_ecx & CPUID_INTC_ECX_AVX) { 3897 add_x86_feature(featureset, 3898 X86FSET_AVX); 3899 3900 /* 3901 * Intel says we can't check these without also 3902 * checking AVX. 3903 */ 3904 if (cp->cp_ecx & CPUID_INTC_ECX_F16C) 3905 add_x86_feature(featureset, 3906 X86FSET_F16C); 3907 3908 if (cp->cp_ecx & CPUID_INTC_ECX_FMA) 3909 add_x86_feature(featureset, 3910 X86FSET_FMA); 3911 3912 if (cpi->cpi_std[7].cp_ebx & 3913 CPUID_INTC_EBX_7_0_BMI1) 3914 add_x86_feature(featureset, 3915 X86FSET_BMI1); 3916 3917 if (cpi->cpi_std[7].cp_ebx & 3918 CPUID_INTC_EBX_7_0_BMI2) 3919 add_x86_feature(featureset, 3920 X86FSET_BMI2); 3921 3922 if (cpi->cpi_std[7].cp_ebx & 3923 CPUID_INTC_EBX_7_0_AVX2) 3924 add_x86_feature(featureset, 3925 X86FSET_AVX2); 3926 3927 if (cpi->cpi_std[7].cp_ecx & 3928 CPUID_INTC_ECX_7_0_VAES) 3929 add_x86_feature(featureset, 3930 X86FSET_VAES); 3931 3932 if (cpi->cpi_std[7].cp_ecx & 3933 CPUID_INTC_ECX_7_0_VPCLMULQDQ) 3934 add_x86_feature(featureset, 3935 X86FSET_VPCLMULQDQ); 3936 } 3937 3938 if (cpi->cpi_vendor == X86_VENDOR_Intel && 3939 (cpi->cpi_std[7].cp_ebx & 3940 CPUID_INTC_EBX_7_0_AVX512F) != 0) { 3941 add_x86_feature(featureset, X86FSET_AVX512F); 3942 3943 if (cpi->cpi_std[7].cp_ebx & 3944 CPUID_INTC_EBX_7_0_AVX512DQ) 3945 add_x86_feature(featureset, 3946 X86FSET_AVX512DQ); 3947 if (cpi->cpi_std[7].cp_ebx & 3948 CPUID_INTC_EBX_7_0_AVX512IFMA) 3949 add_x86_feature(featureset, 3950 X86FSET_AVX512FMA); 3951 if (cpi->cpi_std[7].cp_ebx & 3952 CPUID_INTC_EBX_7_0_AVX512PF) 3953 add_x86_feature(featureset, 3954 X86FSET_AVX512PF); 3955 if (cpi->cpi_std[7].cp_ebx & 3956 CPUID_INTC_EBX_7_0_AVX512ER) 3957 add_x86_feature(featureset, 3958 X86FSET_AVX512ER); 3959 if (cpi->cpi_std[7].cp_ebx & 3960 CPUID_INTC_EBX_7_0_AVX512CD) 3961 add_x86_feature(featureset, 3962 X86FSET_AVX512CD); 3963 if (cpi->cpi_std[7].cp_ebx & 3964 CPUID_INTC_EBX_7_0_AVX512BW) 3965 add_x86_feature(featureset, 3966 X86FSET_AVX512BW); 3967 if (cpi->cpi_std[7].cp_ebx & 3968 CPUID_INTC_EBX_7_0_AVX512VL) 3969 add_x86_feature(featureset, 3970 X86FSET_AVX512VL); 3971 3972 if (cpi->cpi_std[7].cp_ecx & 3973 CPUID_INTC_ECX_7_0_AVX512VBMI) 3974 add_x86_feature(featureset, 3975 X86FSET_AVX512VBMI); 3976 if (cpi->cpi_std[7].cp_ecx & 3977 CPUID_INTC_ECX_7_0_AVX512VNNI) 3978 add_x86_feature(featureset, 3979 X86FSET_AVX512VNNI); 3980 if (cpi->cpi_std[7].cp_ecx & 3981 CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 3982 add_x86_feature(featureset, 3983 X86FSET_AVX512VPOPCDQ); 3984 3985 if (cpi->cpi_std[7].cp_edx & 3986 CPUID_INTC_EDX_7_0_AVX5124NNIW) 3987 add_x86_feature(featureset, 3988 X86FSET_AVX512NNIW); 3989 if (cpi->cpi_std[7].cp_edx & 3990 CPUID_INTC_EDX_7_0_AVX5124FMAPS) 3991 add_x86_feature(featureset, 3992 X86FSET_AVX512FMAPS); 3993 } 3994 } 3995 } 3996 3997 if (cp->cp_ecx & CPUID_INTC_ECX_PCID) { 3998 add_x86_feature(featureset, X86FSET_PCID); 3999 } 4000 4001 if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) { 4002 add_x86_feature(featureset, X86FSET_X2APIC); 4003 } 4004 if (cp->cp_edx & CPUID_INTC_EDX_DE) { 4005 add_x86_feature(featureset, X86FSET_DE); 4006 } 4007 #if !defined(__xpv) 4008 if (cp->cp_ecx & CPUID_INTC_ECX_MON) { 4009 4010 /* 4011 * We require the CLFLUSH instruction for erratum workaround 4012 * to use MONITOR/MWAIT. 4013 */ 4014 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 4015 cpi->cpi_mwait.support |= MWAIT_SUPPORT; 4016 add_x86_feature(featureset, X86FSET_MWAIT); 4017 } else { 4018 extern int idle_cpu_assert_cflush_monitor; 4019 4020 /* 4021 * All processors we are aware of which have 4022 * MONITOR/MWAIT also have CLFLUSH. 4023 */ 4024 if (idle_cpu_assert_cflush_monitor) { 4025 ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) && 4026 (cp->cp_edx & CPUID_INTC_EDX_CLFSH)); 4027 } 4028 } 4029 } 4030 #endif /* __xpv */ 4031 4032 if (cp->cp_ecx & CPUID_INTC_ECX_VMX) { 4033 add_x86_feature(featureset, X86FSET_VMX); 4034 } 4035 4036 if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND) 4037 add_x86_feature(featureset, X86FSET_RDRAND); 4038 4039 /* 4040 * Only need it first time, rest of the cpus would follow suit. 4041 * we only capture this for the bootcpu. 4042 */ 4043 if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { 4044 add_x86_feature(featureset, X86FSET_CLFSH); 4045 x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8); 4046 } 4047 if (is_x86_feature(featureset, X86FSET_PAE)) 4048 cpi->cpi_pabits = 36; 4049 4050 if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) { 4051 struct cpuid_regs r, *ecp; 4052 4053 ecp = &r; 4054 ecp->cp_eax = 0xD; 4055 ecp->cp_ecx = 1; 4056 ecp->cp_edx = ecp->cp_ebx = 0; 4057 (void) __cpuid_insn(ecp); 4058 4059 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT) 4060 add_x86_feature(featureset, X86FSET_XSAVEOPT); 4061 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC) 4062 add_x86_feature(featureset, X86FSET_XSAVEC); 4063 if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES) 4064 add_x86_feature(featureset, X86FSET_XSAVES); 4065 } 4066 4067 /* 4068 * Work on the "extended" feature information, doing 4069 * some basic initialization to be used in the extended pass. 4070 */ 4071 xcpuid = 0; 4072 switch (cpi->cpi_vendor) { 4073 case X86_VENDOR_Intel: 4074 /* 4075 * On KVM we know we will have proper support for extended 4076 * cpuid. 4077 */ 4078 if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf || 4079 (get_hwenv() == HW_KVM && cpi->cpi_family == 6 && 4080 (cpi->cpi_model == 6 || cpi->cpi_model == 2))) 4081 xcpuid++; 4082 break; 4083 case X86_VENDOR_AMD: 4084 if (cpi->cpi_family > 5 || 4085 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 4086 xcpuid++; 4087 break; 4088 case X86_VENDOR_Cyrix: 4089 /* 4090 * Only these Cyrix CPUs are -known- to support 4091 * extended cpuid operations. 4092 */ 4093 if (x86_type == X86_TYPE_VIA_CYRIX_III || 4094 x86_type == X86_TYPE_CYRIX_GXm) 4095 xcpuid++; 4096 break; 4097 case X86_VENDOR_HYGON: 4098 case X86_VENDOR_Centaur: 4099 case X86_VENDOR_TM: 4100 default: 4101 xcpuid++; 4102 break; 4103 } 4104 4105 if (xcpuid) { 4106 cp = &cpi->cpi_extd[0]; 4107 cp->cp_eax = CPUID_LEAF_EXT_0; 4108 cpi->cpi_xmaxeax = __cpuid_insn(cp); 4109 } 4110 4111 if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) { 4112 4113 if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX) 4114 cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX; 4115 4116 switch (cpi->cpi_vendor) { 4117 case X86_VENDOR_Intel: 4118 case X86_VENDOR_AMD: 4119 case X86_VENDOR_HYGON: 4120 if (cpi->cpi_xmaxeax < 0x80000001) 4121 break; 4122 cp = &cpi->cpi_extd[1]; 4123 cp->cp_eax = 0x80000001; 4124 (void) __cpuid_insn(cp); 4125 4126 if (cpi->cpi_vendor == X86_VENDOR_AMD && 4127 cpi->cpi_family == 5 && 4128 cpi->cpi_model == 6 && 4129 cpi->cpi_step == 6) { 4130 /* 4131 * K6 model 6 uses bit 10 to indicate SYSC 4132 * Later models use bit 11. Fix it here. 4133 */ 4134 if (cp->cp_edx & 0x400) { 4135 cp->cp_edx &= ~0x400; 4136 cp->cp_edx |= CPUID_AMD_EDX_SYSC; 4137 } 4138 } 4139 4140 platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp); 4141 4142 /* 4143 * Compute the additions to the kernel's feature word. 4144 */ 4145 if (cp->cp_edx & CPUID_AMD_EDX_NX) { 4146 add_x86_feature(featureset, X86FSET_NX); 4147 } 4148 4149 /* 4150 * Regardless whether or not we boot 64-bit, 4151 * we should have a way to identify whether 4152 * the CPU is capable of running 64-bit. 4153 */ 4154 if (cp->cp_edx & CPUID_AMD_EDX_LM) { 4155 add_x86_feature(featureset, X86FSET_64); 4156 } 4157 4158 /* 1 GB large page - enable only for 64 bit kernel */ 4159 if (cp->cp_edx & CPUID_AMD_EDX_1GPG) { 4160 add_x86_feature(featureset, X86FSET_1GPG); 4161 } 4162 4163 if ((cpi->cpi_vendor == X86_VENDOR_AMD || 4164 cpi->cpi_vendor == X86_VENDOR_HYGON) && 4165 (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) && 4166 (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) { 4167 add_x86_feature(featureset, X86FSET_SSE4A); 4168 } 4169 4170 /* 4171 * It's really tricky to support syscall/sysret in 4172 * the i386 kernel; we rely on sysenter/sysexit 4173 * instead. In the amd64 kernel, things are -way- 4174 * better. 4175 */ 4176 if (cp->cp_edx & CPUID_AMD_EDX_SYSC) { 4177 add_x86_feature(featureset, X86FSET_ASYSC); 4178 } 4179 4180 /* 4181 * While we're thinking about system calls, note 4182 * that AMD processors don't support sysenter 4183 * in long mode at all, so don't try to program them. 4184 */ 4185 if (x86_vendor == X86_VENDOR_AMD || 4186 x86_vendor == X86_VENDOR_HYGON) { 4187 remove_x86_feature(featureset, X86FSET_SEP); 4188 } 4189 4190 if (cp->cp_edx & CPUID_AMD_EDX_TSCP) { 4191 add_x86_feature(featureset, X86FSET_TSCP); 4192 } 4193 4194 if (cp->cp_ecx & CPUID_AMD_ECX_SVM) { 4195 add_x86_feature(featureset, X86FSET_SVM); 4196 } 4197 4198 if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) { 4199 add_x86_feature(featureset, X86FSET_TOPOEXT); 4200 } 4201 4202 if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) { 4203 add_x86_feature(featureset, X86FSET_AMD_PCEC); 4204 } 4205 4206 if (cp->cp_ecx & CPUID_AMD_ECX_XOP) { 4207 add_x86_feature(featureset, X86FSET_XOP); 4208 } 4209 4210 if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) { 4211 add_x86_feature(featureset, X86FSET_FMA4); 4212 } 4213 4214 if (cp->cp_ecx & CPUID_AMD_ECX_TBM) { 4215 add_x86_feature(featureset, X86FSET_TBM); 4216 } 4217 4218 if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) { 4219 add_x86_feature(featureset, X86FSET_MONITORX); 4220 } 4221 break; 4222 default: 4223 break; 4224 } 4225 4226 /* 4227 * Get CPUID data about processor cores and hyperthreads. 4228 */ 4229 switch (cpi->cpi_vendor) { 4230 case X86_VENDOR_Intel: 4231 if (cpi->cpi_maxeax >= 4) { 4232 cp = &cpi->cpi_std[4]; 4233 cp->cp_eax = 4; 4234 cp->cp_ecx = 0; 4235 (void) __cpuid_insn(cp); 4236 platform_cpuid_mangle(cpi->cpi_vendor, 4, cp); 4237 } 4238 /*FALLTHROUGH*/ 4239 case X86_VENDOR_AMD: 4240 case X86_VENDOR_HYGON: 4241 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) 4242 break; 4243 cp = &cpi->cpi_extd[8]; 4244 cp->cp_eax = CPUID_LEAF_EXT_8; 4245 (void) __cpuid_insn(cp); 4246 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, 4247 cp); 4248 4249 /* 4250 * AMD uses ebx for some extended functions. 4251 */ 4252 if (cpi->cpi_vendor == X86_VENDOR_AMD || 4253 cpi->cpi_vendor == X86_VENDOR_HYGON) { 4254 /* 4255 * While we're here, check for the AMD "Error 4256 * Pointer Zero/Restore" feature. This can be 4257 * used to setup the FP save handlers 4258 * appropriately. 4259 */ 4260 if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 4261 cpi->cpi_fp_amd_save = 0; 4262 } else { 4263 cpi->cpi_fp_amd_save = 1; 4264 } 4265 4266 if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) { 4267 add_x86_feature(featureset, 4268 X86FSET_CLZERO); 4269 } 4270 } 4271 4272 /* 4273 * Virtual and physical address limits from 4274 * cpuid override previously guessed values. 4275 */ 4276 cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0); 4277 cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8); 4278 break; 4279 default: 4280 break; 4281 } 4282 4283 /* 4284 * Get CPUID data about TSC Invariance in Deep C-State. 4285 */ 4286 switch (cpi->cpi_vendor) { 4287 case X86_VENDOR_Intel: 4288 case X86_VENDOR_AMD: 4289 case X86_VENDOR_HYGON: 4290 if (cpi->cpi_maxeax >= 7) { 4291 cp = &cpi->cpi_extd[7]; 4292 cp->cp_eax = 0x80000007; 4293 cp->cp_ecx = 0; 4294 (void) __cpuid_insn(cp); 4295 } 4296 break; 4297 default: 4298 break; 4299 } 4300 } 4301 4302 /* 4303 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been 4304 * run and thus gathered some of its dependent leaves. 4305 */ 4306 cpuid_basic_topology(cpu, featureset); 4307 cpuid_basic_thermal(cpu, featureset); 4308 #if !defined(__xpv) 4309 cpuid_basic_ppin(cpu, featureset); 4310 #endif 4311 4312 if (cpi->cpi_vendor == X86_VENDOR_AMD || 4313 cpi->cpi_vendor == X86_VENDOR_HYGON) { 4314 if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 && 4315 cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { 4316 /* Special handling for AMD FP not necessary. */ 4317 cpi->cpi_fp_amd_save = 0; 4318 } else { 4319 cpi->cpi_fp_amd_save = 1; 4320 } 4321 } 4322 4323 /* 4324 * Check (and potentially set) if lfence is serializing. 4325 * This is useful for accurate rdtsc measurements and AMD retpolines. 4326 */ 4327 if ((cpi->cpi_vendor == X86_VENDOR_AMD || 4328 cpi->cpi_vendor == X86_VENDOR_HYGON) && 4329 is_x86_feature(featureset, X86FSET_SSE2)) { 4330 /* 4331 * The AMD white paper Software Techniques For Managing 4332 * Speculation on AMD Processors details circumstances for when 4333 * lfence instructions are serializing. 4334 * 4335 * On family 0xf and 0x11, it is inherently so. On family 0x10 4336 * and later (excluding 0x11), a bit in the DE_CFG MSR 4337 * determines the lfence behavior. Per that whitepaper, AMD has 4338 * committed to supporting that MSR on all later CPUs. 4339 */ 4340 if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) { 4341 add_x86_feature(featureset, X86FSET_LFENCE_SER); 4342 } else if (cpi->cpi_family >= 0x10) { 4343 #if !defined(__xpv) 4344 uint64_t val; 4345 4346 /* 4347 * Be careful when attempting to enable the bit, and 4348 * verify that it was actually set in case we are 4349 * running in a hypervisor which is less than faithful 4350 * about its emulation of this feature. 4351 */ 4352 on_trap_data_t otd; 4353 if (!on_trap(&otd, OT_DATA_ACCESS)) { 4354 val = rdmsr(MSR_AMD_DE_CFG); 4355 val |= AMD_DE_CFG_LFENCE_DISPATCH; 4356 wrmsr(MSR_AMD_DE_CFG, val); 4357 val = rdmsr(MSR_AMD_DE_CFG); 4358 } else { 4359 val = 0; 4360 } 4361 no_trap(); 4362 4363 if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) { 4364 add_x86_feature(featureset, X86FSET_LFENCE_SER); 4365 } 4366 #endif 4367 } 4368 } else if (cpi->cpi_vendor == X86_VENDOR_Intel && 4369 is_x86_feature(featureset, X86FSET_SSE2)) { 4370 /* 4371 * Documentation and other OSes indicate that lfence is always 4372 * serializing on Intel CPUs. 4373 */ 4374 add_x86_feature(featureset, X86FSET_LFENCE_SER); 4375 } 4376 4377 4378 /* 4379 * Check the processor leaves that are used for security features. 4380 */ 4381 cpuid_scan_security(cpu, featureset); 4382 } 4383 4384 /* 4385 * Make copies of the cpuid table entries we depend on, in 4386 * part for ease of parsing now, in part so that we have only 4387 * one place to correct any of it, in part for ease of 4388 * later export to userland, and in part so we can look at 4389 * this stuff in a crash dump. 4390 */ 4391 4392 static void 4393 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused) 4394 { 4395 uint_t n, nmax; 4396 int i; 4397 struct cpuid_regs *cp; 4398 uint8_t *dp; 4399 uint32_t *iptr; 4400 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 4401 4402 if (cpi->cpi_maxeax < 1) 4403 return; 4404 4405 if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD) 4406 nmax = NMAX_CPI_STD; 4407 /* 4408 * (We already handled n == 0 and n == 1 in the basic pass) 4409 */ 4410 for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) { 4411 /* 4412 * leaves 6 and 7 were handled in the basic pass 4413 */ 4414 if (n == 6 || n == 7) 4415 continue; 4416 4417 cp->cp_eax = n; 4418 4419 /* 4420 * CPUID function 4 expects %ecx to be initialized 4421 * with an index which indicates which cache to return 4422 * information about. The OS is expected to call function 4 4423 * with %ecx set to 0, 1, 2, ... until it returns with 4424 * EAX[4:0] set to 0, which indicates there are no more 4425 * caches. 4426 * 4427 * Here, populate cpi_std[4] with the information returned by 4428 * function 4 when %ecx == 0, and do the rest in a later pass 4429 * when dynamic memory allocation becomes available. 4430 * 4431 * Note: we need to explicitly initialize %ecx here, since 4432 * function 4 may have been previously invoked. 4433 */ 4434 if (n == 4) 4435 cp->cp_ecx = 0; 4436 4437 (void) __cpuid_insn(cp); 4438 platform_cpuid_mangle(cpi->cpi_vendor, n, cp); 4439 switch (n) { 4440 case 2: 4441 /* 4442 * "the lower 8 bits of the %eax register 4443 * contain a value that identifies the number 4444 * of times the cpuid [instruction] has to be 4445 * executed to obtain a complete image of the 4446 * processor's caching systems." 4447 * 4448 * How *do* they make this stuff up? 4449 */ 4450 cpi->cpi_ncache = sizeof (*cp) * 4451 BITX(cp->cp_eax, 7, 0); 4452 if (cpi->cpi_ncache == 0) 4453 break; 4454 cpi->cpi_ncache--; /* skip count byte */ 4455 4456 /* 4457 * Well, for now, rather than attempt to implement 4458 * this slightly dubious algorithm, we just look 4459 * at the first 15 .. 4460 */ 4461 if (cpi->cpi_ncache > (sizeof (*cp) - 1)) 4462 cpi->cpi_ncache = sizeof (*cp) - 1; 4463 4464 dp = cpi->cpi_cacheinfo; 4465 if (BITX(cp->cp_eax, 31, 31) == 0) { 4466 uint8_t *p = (void *)&cp->cp_eax; 4467 for (i = 1; i < 4; i++) 4468 if (p[i] != 0) 4469 *dp++ = p[i]; 4470 } 4471 if (BITX(cp->cp_ebx, 31, 31) == 0) { 4472 uint8_t *p = (void *)&cp->cp_ebx; 4473 for (i = 0; i < 4; i++) 4474 if (p[i] != 0) 4475 *dp++ = p[i]; 4476 } 4477 if (BITX(cp->cp_ecx, 31, 31) == 0) { 4478 uint8_t *p = (void *)&cp->cp_ecx; 4479 for (i = 0; i < 4; i++) 4480 if (p[i] != 0) 4481 *dp++ = p[i]; 4482 } 4483 if (BITX(cp->cp_edx, 31, 31) == 0) { 4484 uint8_t *p = (void *)&cp->cp_edx; 4485 for (i = 0; i < 4; i++) 4486 if (p[i] != 0) 4487 *dp++ = p[i]; 4488 } 4489 break; 4490 4491 case 3: /* Processor serial number, if PSN supported */ 4492 break; 4493 4494 case 4: /* Deterministic cache parameters */ 4495 break; 4496 4497 case 5: /* Monitor/Mwait parameters */ 4498 { 4499 size_t mwait_size; 4500 4501 /* 4502 * check cpi_mwait.support which was set in 4503 * cpuid_pass_basic() 4504 */ 4505 if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT)) 4506 break; 4507 4508 /* 4509 * Protect ourself from insane mwait line size. 4510 * Workaround for incomplete hardware emulator(s). 4511 */ 4512 mwait_size = (size_t)MWAIT_SIZE_MAX(cpi); 4513 if (mwait_size < sizeof (uint32_t) || 4514 !ISP2(mwait_size)) { 4515 #if DEBUG 4516 cmn_err(CE_NOTE, "Cannot handle cpu %d mwait " 4517 "size %ld", cpu->cpu_id, (long)mwait_size); 4518 #endif 4519 break; 4520 } 4521 4522 cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi); 4523 cpi->cpi_mwait.mon_max = mwait_size; 4524 if (MWAIT_EXTENSION(cpi)) { 4525 cpi->cpi_mwait.support |= MWAIT_EXTENSIONS; 4526 if (MWAIT_INT_ENABLE(cpi)) 4527 cpi->cpi_mwait.support |= 4528 MWAIT_ECX_INT_ENABLE; 4529 } 4530 break; 4531 } 4532 default: 4533 break; 4534 } 4535 } 4536 4537 /* 4538 * XSAVE enumeration 4539 */ 4540 if (cpi->cpi_maxeax >= 0xD) { 4541 struct cpuid_regs regs; 4542 boolean_t cpuid_d_valid = B_TRUE; 4543 4544 cp = ®s; 4545 cp->cp_eax = 0xD; 4546 cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; 4547 4548 (void) __cpuid_insn(cp); 4549 4550 /* 4551 * Sanity checks for debug 4552 */ 4553 if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 || 4554 (cp->cp_eax & XFEATURE_SSE) == 0) { 4555 cpuid_d_valid = B_FALSE; 4556 } 4557 4558 cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax; 4559 cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx; 4560 cpi->cpi_xsave.xsav_max_size = cp->cp_ecx; 4561 4562 /* 4563 * If the hw supports AVX, get the size and offset in the save 4564 * area for the ymm state. 4565 */ 4566 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) { 4567 cp->cp_eax = 0xD; 4568 cp->cp_ecx = 2; 4569 cp->cp_edx = cp->cp_ebx = 0; 4570 4571 (void) __cpuid_insn(cp); 4572 4573 if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET || 4574 cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) { 4575 cpuid_d_valid = B_FALSE; 4576 } 4577 4578 cpi->cpi_xsave.ymm_size = cp->cp_eax; 4579 cpi->cpi_xsave.ymm_offset = cp->cp_ebx; 4580 } 4581 4582 /* 4583 * If the hw supports MPX, get the size and offset in the 4584 * save area for BNDREGS and BNDCSR. 4585 */ 4586 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) { 4587 cp->cp_eax = 0xD; 4588 cp->cp_ecx = 3; 4589 cp->cp_edx = cp->cp_ebx = 0; 4590 4591 (void) __cpuid_insn(cp); 4592 4593 cpi->cpi_xsave.bndregs_size = cp->cp_eax; 4594 cpi->cpi_xsave.bndregs_offset = cp->cp_ebx; 4595 4596 cp->cp_eax = 0xD; 4597 cp->cp_ecx = 4; 4598 cp->cp_edx = cp->cp_ebx = 0; 4599 4600 (void) __cpuid_insn(cp); 4601 4602 cpi->cpi_xsave.bndcsr_size = cp->cp_eax; 4603 cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx; 4604 } 4605 4606 /* 4607 * If the hw supports AVX512, get the size and offset in the 4608 * save area for the opmask registers and zmm state. 4609 */ 4610 if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) { 4611 cp->cp_eax = 0xD; 4612 cp->cp_ecx = 5; 4613 cp->cp_edx = cp->cp_ebx = 0; 4614 4615 (void) __cpuid_insn(cp); 4616 4617 cpi->cpi_xsave.opmask_size = cp->cp_eax; 4618 cpi->cpi_xsave.opmask_offset = cp->cp_ebx; 4619 4620 cp->cp_eax = 0xD; 4621 cp->cp_ecx = 6; 4622 cp->cp_edx = cp->cp_ebx = 0; 4623 4624 (void) __cpuid_insn(cp); 4625 4626 cpi->cpi_xsave.zmmlo_size = cp->cp_eax; 4627 cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx; 4628 4629 cp->cp_eax = 0xD; 4630 cp->cp_ecx = 7; 4631 cp->cp_edx = cp->cp_ebx = 0; 4632 4633 (void) __cpuid_insn(cp); 4634 4635 cpi->cpi_xsave.zmmhi_size = cp->cp_eax; 4636 cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx; 4637 } 4638 4639 if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) { 4640 xsave_state_size = 0; 4641 } else if (cpuid_d_valid) { 4642 xsave_state_size = cpi->cpi_xsave.xsav_max_size; 4643 } else { 4644 /* Broken CPUID 0xD, probably in HVM */ 4645 cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid " 4646 "value: hw_low = %d, hw_high = %d, xsave_size = %d" 4647 ", ymm_size = %d, ymm_offset = %d\n", 4648 cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low, 4649 cpi->cpi_xsave.xsav_hw_features_high, 4650 (int)cpi->cpi_xsave.xsav_max_size, 4651 (int)cpi->cpi_xsave.ymm_size, 4652 (int)cpi->cpi_xsave.ymm_offset); 4653 4654 if (xsave_state_size != 0) { 4655 /* 4656 * This must be a non-boot CPU. We cannot 4657 * continue, because boot cpu has already 4658 * enabled XSAVE. 4659 */ 4660 ASSERT(cpu->cpu_id != 0); 4661 cmn_err(CE_PANIC, "cpu%d: we have already " 4662 "enabled XSAVE on boot cpu, cannot " 4663 "continue.", cpu->cpu_id); 4664 } else { 4665 /* 4666 * If we reached here on the boot CPU, it's also 4667 * almost certain that we'll reach here on the 4668 * non-boot CPUs. When we're here on a boot CPU 4669 * we should disable the feature, on a non-boot 4670 * CPU we need to confirm that we have. 4671 */ 4672 if (cpu->cpu_id == 0) { 4673 remove_x86_feature(x86_featureset, 4674 X86FSET_XSAVE); 4675 remove_x86_feature(x86_featureset, 4676 X86FSET_AVX); 4677 remove_x86_feature(x86_featureset, 4678 X86FSET_F16C); 4679 remove_x86_feature(x86_featureset, 4680 X86FSET_BMI1); 4681 remove_x86_feature(x86_featureset, 4682 X86FSET_BMI2); 4683 remove_x86_feature(x86_featureset, 4684 X86FSET_FMA); 4685 remove_x86_feature(x86_featureset, 4686 X86FSET_AVX2); 4687 remove_x86_feature(x86_featureset, 4688 X86FSET_MPX); 4689 remove_x86_feature(x86_featureset, 4690 X86FSET_AVX512F); 4691 remove_x86_feature(x86_featureset, 4692 X86FSET_AVX512DQ); 4693 remove_x86_feature(x86_featureset, 4694 X86FSET_AVX512PF); 4695 remove_x86_feature(x86_featureset, 4696 X86FSET_AVX512ER); 4697 remove_x86_feature(x86_featureset, 4698 X86FSET_AVX512CD); 4699 remove_x86_feature(x86_featureset, 4700 X86FSET_AVX512BW); 4701 remove_x86_feature(x86_featureset, 4702 X86FSET_AVX512VL); 4703 remove_x86_feature(x86_featureset, 4704 X86FSET_AVX512FMA); 4705 remove_x86_feature(x86_featureset, 4706 X86FSET_AVX512VBMI); 4707 remove_x86_feature(x86_featureset, 4708 X86FSET_AVX512VNNI); 4709 remove_x86_feature(x86_featureset, 4710 X86FSET_AVX512VPOPCDQ); 4711 remove_x86_feature(x86_featureset, 4712 X86FSET_AVX512NNIW); 4713 remove_x86_feature(x86_featureset, 4714 X86FSET_AVX512FMAPS); 4715 remove_x86_feature(x86_featureset, 4716 X86FSET_VAES); 4717 remove_x86_feature(x86_featureset, 4718 X86FSET_VPCLMULQDQ); 4719 4720 CPI_FEATURES_ECX(cpi) &= 4721 ~CPUID_INTC_ECX_XSAVE; 4722 CPI_FEATURES_ECX(cpi) &= 4723 ~CPUID_INTC_ECX_AVX; 4724 CPI_FEATURES_ECX(cpi) &= 4725 ~CPUID_INTC_ECX_F16C; 4726 CPI_FEATURES_ECX(cpi) &= 4727 ~CPUID_INTC_ECX_FMA; 4728 CPI_FEATURES_7_0_EBX(cpi) &= 4729 ~CPUID_INTC_EBX_7_0_BMI1; 4730 CPI_FEATURES_7_0_EBX(cpi) &= 4731 ~CPUID_INTC_EBX_7_0_BMI2; 4732 CPI_FEATURES_7_0_EBX(cpi) &= 4733 ~CPUID_INTC_EBX_7_0_AVX2; 4734 CPI_FEATURES_7_0_EBX(cpi) &= 4735 ~CPUID_INTC_EBX_7_0_MPX; 4736 CPI_FEATURES_7_0_EBX(cpi) &= 4737 ~CPUID_INTC_EBX_7_0_ALL_AVX512; 4738 4739 CPI_FEATURES_7_0_ECX(cpi) &= 4740 ~CPUID_INTC_ECX_7_0_ALL_AVX512; 4741 4742 CPI_FEATURES_7_0_ECX(cpi) &= 4743 ~CPUID_INTC_ECX_7_0_VAES; 4744 CPI_FEATURES_7_0_ECX(cpi) &= 4745 ~CPUID_INTC_ECX_7_0_VPCLMULQDQ; 4746 4747 CPI_FEATURES_7_0_EDX(cpi) &= 4748 ~CPUID_INTC_EDX_7_0_ALL_AVX512; 4749 4750 xsave_force_disable = B_TRUE; 4751 } else { 4752 VERIFY(is_x86_feature(x86_featureset, 4753 X86FSET_XSAVE) == B_FALSE); 4754 } 4755 } 4756 } 4757 } 4758 4759 4760 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) 4761 return; 4762 4763 if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD) 4764 nmax = NMAX_CPI_EXTD; 4765 /* 4766 * Copy the extended properties, fixing them as we go. 4767 * (We already handled n == 0 and n == 1 in the basic pass) 4768 */ 4769 iptr = (void *)cpi->cpi_brandstr; 4770 for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { 4771 cp->cp_eax = CPUID_LEAF_EXT_0 + n; 4772 (void) __cpuid_insn(cp); 4773 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n, 4774 cp); 4775 switch (n) { 4776 case 2: 4777 case 3: 4778 case 4: 4779 /* 4780 * Extract the brand string 4781 */ 4782 *iptr++ = cp->cp_eax; 4783 *iptr++ = cp->cp_ebx; 4784 *iptr++ = cp->cp_ecx; 4785 *iptr++ = cp->cp_edx; 4786 break; 4787 case 5: 4788 switch (cpi->cpi_vendor) { 4789 case X86_VENDOR_AMD: 4790 /* 4791 * The Athlon and Duron were the first 4792 * parts to report the sizes of the 4793 * TLB for large pages. Before then, 4794 * we don't trust the data. 4795 */ 4796 if (cpi->cpi_family < 6 || 4797 (cpi->cpi_family == 6 && 4798 cpi->cpi_model < 1)) 4799 cp->cp_eax = 0; 4800 break; 4801 default: 4802 break; 4803 } 4804 break; 4805 case 6: 4806 switch (cpi->cpi_vendor) { 4807 case X86_VENDOR_AMD: 4808 /* 4809 * The Athlon and Duron were the first 4810 * AMD parts with L2 TLB's. 4811 * Before then, don't trust the data. 4812 */ 4813 if (cpi->cpi_family < 6 || 4814 (cpi->cpi_family == 6 && 4815 cpi->cpi_model < 1)) 4816 cp->cp_eax = cp->cp_ebx = 0; 4817 /* 4818 * AMD Duron rev A0 reports L2 4819 * cache size incorrectly as 1K 4820 * when it is really 64K 4821 */ 4822 if (cpi->cpi_family == 6 && 4823 cpi->cpi_model == 3 && 4824 cpi->cpi_step == 0) { 4825 cp->cp_ecx &= 0xffff; 4826 cp->cp_ecx |= 0x400000; 4827 } 4828 break; 4829 case X86_VENDOR_Cyrix: /* VIA C3 */ 4830 /* 4831 * VIA C3 processors are a bit messed 4832 * up w.r.t. encoding cache sizes in %ecx 4833 */ 4834 if (cpi->cpi_family != 6) 4835 break; 4836 /* 4837 * model 7 and 8 were incorrectly encoded 4838 * 4839 * xxx is model 8 really broken? 4840 */ 4841 if (cpi->cpi_model == 7 || 4842 cpi->cpi_model == 8) 4843 cp->cp_ecx = 4844 BITX(cp->cp_ecx, 31, 24) << 16 | 4845 BITX(cp->cp_ecx, 23, 16) << 12 | 4846 BITX(cp->cp_ecx, 15, 8) << 8 | 4847 BITX(cp->cp_ecx, 7, 0); 4848 /* 4849 * model 9 stepping 1 has wrong associativity 4850 */ 4851 if (cpi->cpi_model == 9 && cpi->cpi_step == 1) 4852 cp->cp_ecx |= 8 << 12; 4853 break; 4854 case X86_VENDOR_Intel: 4855 /* 4856 * Extended L2 Cache features function. 4857 * First appeared on Prescott. 4858 */ 4859 default: 4860 break; 4861 } 4862 break; 4863 default: 4864 break; 4865 } 4866 } 4867 } 4868 4869 static const char * 4870 intel_cpubrand(const struct cpuid_info *cpi) 4871 { 4872 int i; 4873 4874 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 4875 4876 switch (cpi->cpi_family) { 4877 case 5: 4878 return ("Intel Pentium(r)"); 4879 case 6: 4880 switch (cpi->cpi_model) { 4881 uint_t celeron, xeon; 4882 const struct cpuid_regs *cp; 4883 case 0: 4884 case 1: 4885 case 2: 4886 return ("Intel Pentium(r) Pro"); 4887 case 3: 4888 case 4: 4889 return ("Intel Pentium(r) II"); 4890 case 6: 4891 return ("Intel Celeron(r)"); 4892 case 5: 4893 case 7: 4894 celeron = xeon = 0; 4895 cp = &cpi->cpi_std[2]; /* cache info */ 4896 4897 for (i = 1; i < 4; i++) { 4898 uint_t tmp; 4899 4900 tmp = (cp->cp_eax >> (8 * i)) & 0xff; 4901 if (tmp == 0x40) 4902 celeron++; 4903 if (tmp >= 0x44 && tmp <= 0x45) 4904 xeon++; 4905 } 4906 4907 for (i = 0; i < 2; i++) { 4908 uint_t tmp; 4909 4910 tmp = (cp->cp_ebx >> (8 * i)) & 0xff; 4911 if (tmp == 0x40) 4912 celeron++; 4913 else if (tmp >= 0x44 && tmp <= 0x45) 4914 xeon++; 4915 } 4916 4917 for (i = 0; i < 4; i++) { 4918 uint_t tmp; 4919 4920 tmp = (cp->cp_ecx >> (8 * i)) & 0xff; 4921 if (tmp == 0x40) 4922 celeron++; 4923 else if (tmp >= 0x44 && tmp <= 0x45) 4924 xeon++; 4925 } 4926 4927 for (i = 0; i < 4; i++) { 4928 uint_t tmp; 4929 4930 tmp = (cp->cp_edx >> (8 * i)) & 0xff; 4931 if (tmp == 0x40) 4932 celeron++; 4933 else if (tmp >= 0x44 && tmp <= 0x45) 4934 xeon++; 4935 } 4936 4937 if (celeron) 4938 return ("Intel Celeron(r)"); 4939 if (xeon) 4940 return (cpi->cpi_model == 5 ? 4941 "Intel Pentium(r) II Xeon(tm)" : 4942 "Intel Pentium(r) III Xeon(tm)"); 4943 return (cpi->cpi_model == 5 ? 4944 "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" : 4945 "Intel Pentium(r) III or Pentium(r) III Xeon(tm)"); 4946 default: 4947 break; 4948 } 4949 default: 4950 break; 4951 } 4952 4953 /* BrandID is present if the field is nonzero */ 4954 if (cpi->cpi_brandid != 0) { 4955 static const struct { 4956 uint_t bt_bid; 4957 const char *bt_str; 4958 } brand_tbl[] = { 4959 { 0x1, "Intel(r) Celeron(r)" }, 4960 { 0x2, "Intel(r) Pentium(r) III" }, 4961 { 0x3, "Intel(r) Pentium(r) III Xeon(tm)" }, 4962 { 0x4, "Intel(r) Pentium(r) III" }, 4963 { 0x6, "Mobile Intel(r) Pentium(r) III" }, 4964 { 0x7, "Mobile Intel(r) Celeron(r)" }, 4965 { 0x8, "Intel(r) Pentium(r) 4" }, 4966 { 0x9, "Intel(r) Pentium(r) 4" }, 4967 { 0xa, "Intel(r) Celeron(r)" }, 4968 { 0xb, "Intel(r) Xeon(tm)" }, 4969 { 0xc, "Intel(r) Xeon(tm) MP" }, 4970 { 0xe, "Mobile Intel(r) Pentium(r) 4" }, 4971 { 0xf, "Mobile Intel(r) Celeron(r)" }, 4972 { 0x11, "Mobile Genuine Intel(r)" }, 4973 { 0x12, "Intel(r) Celeron(r) M" }, 4974 { 0x13, "Mobile Intel(r) Celeron(r)" }, 4975 { 0x14, "Intel(r) Celeron(r)" }, 4976 { 0x15, "Mobile Genuine Intel(r)" }, 4977 { 0x16, "Intel(r) Pentium(r) M" }, 4978 { 0x17, "Mobile Intel(r) Celeron(r)" } 4979 }; 4980 uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]); 4981 uint_t sgn; 4982 4983 sgn = (cpi->cpi_family << 8) | 4984 (cpi->cpi_model << 4) | cpi->cpi_step; 4985 4986 for (i = 0; i < btblmax; i++) 4987 if (brand_tbl[i].bt_bid == cpi->cpi_brandid) 4988 break; 4989 if (i < btblmax) { 4990 if (sgn == 0x6b1 && cpi->cpi_brandid == 3) 4991 return ("Intel(r) Celeron(r)"); 4992 if (sgn < 0xf13 && cpi->cpi_brandid == 0xb) 4993 return ("Intel(r) Xeon(tm) MP"); 4994 if (sgn < 0xf13 && cpi->cpi_brandid == 0xe) 4995 return ("Intel(r) Xeon(tm)"); 4996 return (brand_tbl[i].bt_str); 4997 } 4998 } 4999 5000 return (NULL); 5001 } 5002 5003 static const char * 5004 amd_cpubrand(const struct cpuid_info *cpi) 5005 { 5006 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 5007 5008 switch (cpi->cpi_family) { 5009 case 5: 5010 switch (cpi->cpi_model) { 5011 case 0: 5012 case 1: 5013 case 2: 5014 case 3: 5015 case 4: 5016 case 5: 5017 return ("AMD-K5(r)"); 5018 case 6: 5019 case 7: 5020 return ("AMD-K6(r)"); 5021 case 8: 5022 return ("AMD-K6(r)-2"); 5023 case 9: 5024 return ("AMD-K6(r)-III"); 5025 default: 5026 return ("AMD (family 5)"); 5027 } 5028 case 6: 5029 switch (cpi->cpi_model) { 5030 case 1: 5031 return ("AMD-K7(tm)"); 5032 case 0: 5033 case 2: 5034 case 4: 5035 return ("AMD Athlon(tm)"); 5036 case 3: 5037 case 7: 5038 return ("AMD Duron(tm)"); 5039 case 6: 5040 case 8: 5041 case 10: 5042 /* 5043 * Use the L2 cache size to distinguish 5044 */ 5045 return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ? 5046 "AMD Athlon(tm)" : "AMD Duron(tm)"); 5047 default: 5048 return ("AMD (family 6)"); 5049 } 5050 default: 5051 break; 5052 } 5053 5054 if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 && 5055 cpi->cpi_brandid != 0) { 5056 switch (BITX(cpi->cpi_brandid, 7, 5)) { 5057 case 3: 5058 return ("AMD Opteron(tm) UP 1xx"); 5059 case 4: 5060 return ("AMD Opteron(tm) DP 2xx"); 5061 case 5: 5062 return ("AMD Opteron(tm) MP 8xx"); 5063 default: 5064 return ("AMD Opteron(tm)"); 5065 } 5066 } 5067 5068 return (NULL); 5069 } 5070 5071 static const char * 5072 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type) 5073 { 5074 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 5075 5076 switch (type) { 5077 case X86_TYPE_CYRIX_6x86: 5078 return ("Cyrix 6x86"); 5079 case X86_TYPE_CYRIX_6x86L: 5080 return ("Cyrix 6x86L"); 5081 case X86_TYPE_CYRIX_6x86MX: 5082 return ("Cyrix 6x86MX"); 5083 case X86_TYPE_CYRIX_GXm: 5084 return ("Cyrix GXm"); 5085 case X86_TYPE_CYRIX_MediaGX: 5086 return ("Cyrix MediaGX"); 5087 case X86_TYPE_CYRIX_MII: 5088 return ("Cyrix M2"); 5089 case X86_TYPE_VIA_CYRIX_III: 5090 return ("VIA Cyrix M3"); 5091 default: 5092 /* 5093 * Have another wild guess .. 5094 */ 5095 if (cpi->cpi_family == 4 && cpi->cpi_model == 9) 5096 return ("Cyrix 5x86"); 5097 else if (cpi->cpi_family == 5) { 5098 switch (cpi->cpi_model) { 5099 case 2: 5100 return ("Cyrix 6x86"); /* Cyrix M1 */ 5101 case 4: 5102 return ("Cyrix MediaGX"); 5103 default: 5104 break; 5105 } 5106 } else if (cpi->cpi_family == 6) { 5107 switch (cpi->cpi_model) { 5108 case 0: 5109 return ("Cyrix 6x86MX"); /* Cyrix M2? */ 5110 case 5: 5111 case 6: 5112 case 7: 5113 case 8: 5114 case 9: 5115 return ("VIA C3"); 5116 default: 5117 break; 5118 } 5119 } 5120 break; 5121 } 5122 return (NULL); 5123 } 5124 5125 /* 5126 * This only gets called in the case that the CPU extended 5127 * feature brand string (0x80000002, 0x80000003, 0x80000004) 5128 * aren't available, or contain null bytes for some reason. 5129 */ 5130 static void 5131 fabricate_brandstr(struct cpuid_info *cpi) 5132 { 5133 const char *brand = NULL; 5134 5135 switch (cpi->cpi_vendor) { 5136 case X86_VENDOR_Intel: 5137 brand = intel_cpubrand(cpi); 5138 break; 5139 case X86_VENDOR_AMD: 5140 brand = amd_cpubrand(cpi); 5141 break; 5142 case X86_VENDOR_Cyrix: 5143 brand = cyrix_cpubrand(cpi, x86_type); 5144 break; 5145 case X86_VENDOR_NexGen: 5146 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 5147 brand = "NexGen Nx586"; 5148 break; 5149 case X86_VENDOR_Centaur: 5150 if (cpi->cpi_family == 5) 5151 switch (cpi->cpi_model) { 5152 case 4: 5153 brand = "Centaur C6"; 5154 break; 5155 case 8: 5156 brand = "Centaur C2"; 5157 break; 5158 case 9: 5159 brand = "Centaur C3"; 5160 break; 5161 default: 5162 break; 5163 } 5164 break; 5165 case X86_VENDOR_Rise: 5166 if (cpi->cpi_family == 5 && 5167 (cpi->cpi_model == 0 || cpi->cpi_model == 2)) 5168 brand = "Rise mP6"; 5169 break; 5170 case X86_VENDOR_SiS: 5171 if (cpi->cpi_family == 5 && cpi->cpi_model == 0) 5172 brand = "SiS 55x"; 5173 break; 5174 case X86_VENDOR_TM: 5175 if (cpi->cpi_family == 5 && cpi->cpi_model == 4) 5176 brand = "Transmeta Crusoe TM3x00 or TM5x00"; 5177 break; 5178 case X86_VENDOR_NSC: 5179 case X86_VENDOR_UMC: 5180 default: 5181 break; 5182 } 5183 if (brand) { 5184 (void) strcpy((char *)cpi->cpi_brandstr, brand); 5185 return; 5186 } 5187 5188 /* 5189 * If all else fails ... 5190 */ 5191 (void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr), 5192 "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family, 5193 cpi->cpi_model, cpi->cpi_step); 5194 } 5195 5196 /* 5197 * This routine is called just after kernel memory allocation 5198 * becomes available on cpu0, and as part of mp_startup() on 5199 * the other cpus. 5200 * 5201 * Fixup the brand string, and collect any information from cpuid 5202 * that requires dynamically allocated storage to represent. 5203 */ 5204 5205 static void 5206 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused) 5207 { 5208 int i, max, shft, level, size; 5209 struct cpuid_regs regs; 5210 struct cpuid_regs *cp; 5211 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5212 5213 /* 5214 * Deterministic cache parameters 5215 * 5216 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The 5217 * values that are present are currently defined to be the same. This 5218 * means we can use the same logic to parse it as long as we use the 5219 * appropriate leaf to get the data. If you're updating this, make sure 5220 * you're careful about which vendor supports which aspect. 5221 * 5222 * Take this opportunity to detect the number of threads sharing the 5223 * last level cache, and construct a corresponding cache id. The 5224 * respective cpuid_info members are initialized to the default case of 5225 * "no last level cache sharing". 5226 */ 5227 cpi->cpi_ncpu_shr_last_cache = 1; 5228 cpi->cpi_last_lvl_cacheid = cpu->cpu_id; 5229 5230 if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) || 5231 ((cpi->cpi_vendor == X86_VENDOR_AMD || 5232 cpi->cpi_vendor == X86_VENDOR_HYGON) && 5233 cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d && 5234 is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) { 5235 uint32_t leaf; 5236 5237 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 5238 leaf = 4; 5239 } else { 5240 leaf = CPUID_LEAF_EXT_1d; 5241 } 5242 5243 /* 5244 * Find the # of elements (size) returned by the leaf and along 5245 * the way detect last level cache sharing details. 5246 */ 5247 bzero(®s, sizeof (regs)); 5248 cp = ®s; 5249 for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) { 5250 cp->cp_eax = leaf; 5251 cp->cp_ecx = i; 5252 5253 (void) __cpuid_insn(cp); 5254 5255 if (CPI_CACHE_TYPE(cp) == 0) 5256 break; 5257 level = CPI_CACHE_LVL(cp); 5258 if (level > max) { 5259 max = level; 5260 cpi->cpi_ncpu_shr_last_cache = 5261 CPI_NTHR_SHR_CACHE(cp) + 1; 5262 } 5263 } 5264 cpi->cpi_cache_leaf_size = size = i; 5265 5266 /* 5267 * Allocate the cpi_cache_leaves array. The first element 5268 * references the regs for the corresponding leaf with %ecx set 5269 * to 0. This was gathered in cpuid_pass_extended(). 5270 */ 5271 if (size > 0) { 5272 cpi->cpi_cache_leaves = 5273 kmem_alloc(size * sizeof (cp), KM_SLEEP); 5274 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 5275 cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4]; 5276 } else { 5277 cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d]; 5278 } 5279 5280 /* 5281 * Allocate storage to hold the additional regs 5282 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size. 5283 * 5284 * The regs for the leaf, %ecx == 0 has already 5285 * been allocated as indicated above. 5286 */ 5287 for (i = 1; i < size; i++) { 5288 cp = cpi->cpi_cache_leaves[i] = 5289 kmem_zalloc(sizeof (regs), KM_SLEEP); 5290 cp->cp_eax = leaf; 5291 cp->cp_ecx = i; 5292 5293 (void) __cpuid_insn(cp); 5294 } 5295 } 5296 /* 5297 * Determine the number of bits needed to represent 5298 * the number of CPUs sharing the last level cache. 5299 * 5300 * Shift off that number of bits from the APIC id to 5301 * derive the cache id. 5302 */ 5303 shft = 0; 5304 for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1) 5305 shft++; 5306 cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft; 5307 } 5308 5309 /* 5310 * Now fixup the brand string 5311 */ 5312 if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) { 5313 fabricate_brandstr(cpi); 5314 } else { 5315 5316 /* 5317 * If we successfully extracted a brand string from the cpuid 5318 * instruction, clean it up by removing leading spaces and 5319 * similar junk. 5320 */ 5321 if (cpi->cpi_brandstr[0]) { 5322 size_t maxlen = sizeof (cpi->cpi_brandstr); 5323 char *src, *dst; 5324 5325 dst = src = (char *)cpi->cpi_brandstr; 5326 src[maxlen - 1] = '\0'; 5327 /* 5328 * strip leading spaces 5329 */ 5330 while (*src == ' ') 5331 src++; 5332 /* 5333 * Remove any 'Genuine' or "Authentic" prefixes 5334 */ 5335 if (strncmp(src, "Genuine ", 8) == 0) 5336 src += 8; 5337 if (strncmp(src, "Authentic ", 10) == 0) 5338 src += 10; 5339 5340 /* 5341 * Now do an in-place copy. 5342 * Map (R) to (r) and (TM) to (tm). 5343 * The era of teletypes is long gone, and there's 5344 * -really- no need to shout. 5345 */ 5346 while (*src != '\0') { 5347 if (src[0] == '(') { 5348 if (strncmp(src + 1, "R)", 2) == 0) { 5349 (void) strncpy(dst, "(r)", 3); 5350 src += 3; 5351 dst += 3; 5352 continue; 5353 } 5354 if (strncmp(src + 1, "TM)", 3) == 0) { 5355 (void) strncpy(dst, "(tm)", 4); 5356 src += 4; 5357 dst += 4; 5358 continue; 5359 } 5360 } 5361 *dst++ = *src++; 5362 } 5363 *dst = '\0'; 5364 5365 /* 5366 * Finally, remove any trailing spaces 5367 */ 5368 while (--dst > cpi->cpi_brandstr) 5369 if (*dst == ' ') 5370 *dst = '\0'; 5371 else 5372 break; 5373 } else 5374 fabricate_brandstr(cpi); 5375 } 5376 } 5377 5378 /* 5379 * This routine is called out of bind_hwcap() much later in the life 5380 * of the kernel (post_startup()). The job of this routine is to resolve 5381 * the hardware feature support and kernel support for those features into 5382 * what we're actually going to tell applications via the aux vector. 5383 */ 5384 5385 static void 5386 cpuid_pass_resolve(cpu_t *cpu, void *arg) 5387 { 5388 uint_t *hwcap_out = (uint_t *)arg; 5389 struct cpuid_info *cpi; 5390 uint_t hwcap_flags = 0, hwcap_flags_2 = 0; 5391 5392 cpi = cpu->cpu_m.mcpu_cpi; 5393 5394 if (cpi->cpi_maxeax >= 1) { 5395 uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES]; 5396 uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES]; 5397 uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES]; 5398 5399 *edx = CPI_FEATURES_EDX(cpi); 5400 *ecx = CPI_FEATURES_ECX(cpi); 5401 *ebx = CPI_FEATURES_7_0_EBX(cpi); 5402 5403 /* 5404 * [these require explicit kernel support] 5405 */ 5406 if (!is_x86_feature(x86_featureset, X86FSET_SEP)) 5407 *edx &= ~CPUID_INTC_EDX_SEP; 5408 5409 if (!is_x86_feature(x86_featureset, X86FSET_SSE)) 5410 *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE); 5411 if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) 5412 *edx &= ~CPUID_INTC_EDX_SSE2; 5413 5414 if (!is_x86_feature(x86_featureset, X86FSET_HTT)) 5415 *edx &= ~CPUID_INTC_EDX_HTT; 5416 5417 if (!is_x86_feature(x86_featureset, X86FSET_SSE3)) 5418 *ecx &= ~CPUID_INTC_ECX_SSE3; 5419 5420 if (!is_x86_feature(x86_featureset, X86FSET_SSSE3)) 5421 *ecx &= ~CPUID_INTC_ECX_SSSE3; 5422 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1)) 5423 *ecx &= ~CPUID_INTC_ECX_SSE4_1; 5424 if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2)) 5425 *ecx &= ~CPUID_INTC_ECX_SSE4_2; 5426 if (!is_x86_feature(x86_featureset, X86FSET_AES)) 5427 *ecx &= ~CPUID_INTC_ECX_AES; 5428 if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ)) 5429 *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ; 5430 if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) 5431 *ecx &= ~(CPUID_INTC_ECX_XSAVE | 5432 CPUID_INTC_ECX_OSXSAVE); 5433 if (!is_x86_feature(x86_featureset, X86FSET_AVX)) 5434 *ecx &= ~CPUID_INTC_ECX_AVX; 5435 if (!is_x86_feature(x86_featureset, X86FSET_F16C)) 5436 *ecx &= ~CPUID_INTC_ECX_F16C; 5437 if (!is_x86_feature(x86_featureset, X86FSET_FMA)) 5438 *ecx &= ~CPUID_INTC_ECX_FMA; 5439 if (!is_x86_feature(x86_featureset, X86FSET_BMI1)) 5440 *ebx &= ~CPUID_INTC_EBX_7_0_BMI1; 5441 if (!is_x86_feature(x86_featureset, X86FSET_BMI2)) 5442 *ebx &= ~CPUID_INTC_EBX_7_0_BMI2; 5443 if (!is_x86_feature(x86_featureset, X86FSET_AVX2)) 5444 *ebx &= ~CPUID_INTC_EBX_7_0_AVX2; 5445 if (!is_x86_feature(x86_featureset, X86FSET_RDSEED)) 5446 *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED; 5447 if (!is_x86_feature(x86_featureset, X86FSET_ADX)) 5448 *ebx &= ~CPUID_INTC_EBX_7_0_ADX; 5449 5450 /* 5451 * [no explicit support required beyond x87 fp context] 5452 */ 5453 if (!fpu_exists) 5454 *edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX); 5455 5456 /* 5457 * Now map the supported feature vector to things that we 5458 * think userland will care about. 5459 */ 5460 if (*edx & CPUID_INTC_EDX_SEP) 5461 hwcap_flags |= AV_386_SEP; 5462 if (*edx & CPUID_INTC_EDX_SSE) 5463 hwcap_flags |= AV_386_FXSR | AV_386_SSE; 5464 if (*edx & CPUID_INTC_EDX_SSE2) 5465 hwcap_flags |= AV_386_SSE2; 5466 if (*ecx & CPUID_INTC_ECX_SSE3) 5467 hwcap_flags |= AV_386_SSE3; 5468 if (*ecx & CPUID_INTC_ECX_SSSE3) 5469 hwcap_flags |= AV_386_SSSE3; 5470 if (*ecx & CPUID_INTC_ECX_SSE4_1) 5471 hwcap_flags |= AV_386_SSE4_1; 5472 if (*ecx & CPUID_INTC_ECX_SSE4_2) 5473 hwcap_flags |= AV_386_SSE4_2; 5474 if (*ecx & CPUID_INTC_ECX_MOVBE) 5475 hwcap_flags |= AV_386_MOVBE; 5476 if (*ecx & CPUID_INTC_ECX_AES) 5477 hwcap_flags |= AV_386_AES; 5478 if (*ecx & CPUID_INTC_ECX_PCLMULQDQ) 5479 hwcap_flags |= AV_386_PCLMULQDQ; 5480 if ((*ecx & CPUID_INTC_ECX_XSAVE) && 5481 (*ecx & CPUID_INTC_ECX_OSXSAVE)) { 5482 hwcap_flags |= AV_386_XSAVE; 5483 5484 if (*ecx & CPUID_INTC_ECX_AVX) { 5485 uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi); 5486 uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi); 5487 5488 hwcap_flags |= AV_386_AVX; 5489 if (*ecx & CPUID_INTC_ECX_F16C) 5490 hwcap_flags_2 |= AV_386_2_F16C; 5491 if (*ecx & CPUID_INTC_ECX_FMA) 5492 hwcap_flags_2 |= AV_386_2_FMA; 5493 5494 if (*ebx & CPUID_INTC_EBX_7_0_BMI1) 5495 hwcap_flags_2 |= AV_386_2_BMI1; 5496 if (*ebx & CPUID_INTC_EBX_7_0_BMI2) 5497 hwcap_flags_2 |= AV_386_2_BMI2; 5498 if (*ebx & CPUID_INTC_EBX_7_0_AVX2) 5499 hwcap_flags_2 |= AV_386_2_AVX2; 5500 if (*ebx & CPUID_INTC_EBX_7_0_AVX512F) 5501 hwcap_flags_2 |= AV_386_2_AVX512F; 5502 if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ) 5503 hwcap_flags_2 |= AV_386_2_AVX512DQ; 5504 if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA) 5505 hwcap_flags_2 |= AV_386_2_AVX512IFMA; 5506 if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF) 5507 hwcap_flags_2 |= AV_386_2_AVX512PF; 5508 if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER) 5509 hwcap_flags_2 |= AV_386_2_AVX512ER; 5510 if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD) 5511 hwcap_flags_2 |= AV_386_2_AVX512CD; 5512 if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW) 5513 hwcap_flags_2 |= AV_386_2_AVX512BW; 5514 if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL) 5515 hwcap_flags_2 |= AV_386_2_AVX512VL; 5516 5517 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI) 5518 hwcap_flags_2 |= AV_386_2_AVX512VBMI; 5519 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI) 5520 hwcap_flags_2 |= AV_386_2_AVX512_VNNI; 5521 if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ) 5522 hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ; 5523 if (*ecx_7 & CPUID_INTC_ECX_7_0_VAES) 5524 hwcap_flags_2 |= AV_386_2_VAES; 5525 if (*ecx_7 & CPUID_INTC_ECX_7_0_VPCLMULQDQ) 5526 hwcap_flags_2 |= AV_386_2_VPCLMULQDQ; 5527 5528 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW) 5529 hwcap_flags_2 |= AV_386_2_AVX512_4NNIW; 5530 if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS) 5531 hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS; 5532 } 5533 } 5534 if (*ecx & CPUID_INTC_ECX_VMX) 5535 hwcap_flags |= AV_386_VMX; 5536 if (*ecx & CPUID_INTC_ECX_POPCNT) 5537 hwcap_flags |= AV_386_POPCNT; 5538 if (*edx & CPUID_INTC_EDX_FPU) 5539 hwcap_flags |= AV_386_FPU; 5540 if (*edx & CPUID_INTC_EDX_MMX) 5541 hwcap_flags |= AV_386_MMX; 5542 5543 if (*edx & CPUID_INTC_EDX_TSC) 5544 hwcap_flags |= AV_386_TSC; 5545 if (*edx & CPUID_INTC_EDX_CX8) 5546 hwcap_flags |= AV_386_CX8; 5547 if (*edx & CPUID_INTC_EDX_CMOV) 5548 hwcap_flags |= AV_386_CMOV; 5549 if (*ecx & CPUID_INTC_ECX_CX16) 5550 hwcap_flags |= AV_386_CX16; 5551 5552 if (*ecx & CPUID_INTC_ECX_RDRAND) 5553 hwcap_flags_2 |= AV_386_2_RDRAND; 5554 if (*ebx & CPUID_INTC_EBX_7_0_ADX) 5555 hwcap_flags_2 |= AV_386_2_ADX; 5556 if (*ebx & CPUID_INTC_EBX_7_0_RDSEED) 5557 hwcap_flags_2 |= AV_386_2_RDSEED; 5558 if (*ebx & CPUID_INTC_EBX_7_0_SHA) 5559 hwcap_flags_2 |= AV_386_2_SHA; 5560 if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE) 5561 hwcap_flags_2 |= AV_386_2_FSGSBASE; 5562 if (*ebx & CPUID_INTC_EBX_7_0_CLWB) 5563 hwcap_flags_2 |= AV_386_2_CLWB; 5564 if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT) 5565 hwcap_flags_2 |= AV_386_2_CLFLUSHOPT; 5566 5567 } 5568 /* 5569 * Check a few miscilaneous features. 5570 */ 5571 if (is_x86_feature(x86_featureset, X86FSET_CLZERO)) 5572 hwcap_flags_2 |= AV_386_2_CLZERO; 5573 5574 if (cpi->cpi_xmaxeax < 0x80000001) 5575 goto resolve_done; 5576 5577 switch (cpi->cpi_vendor) { 5578 struct cpuid_regs cp; 5579 uint32_t *edx, *ecx; 5580 5581 case X86_VENDOR_Intel: 5582 /* 5583 * Seems like Intel duplicated what we necessary 5584 * here to make the initial crop of 64-bit OS's work. 5585 * Hopefully, those are the only "extended" bits 5586 * they'll add. 5587 */ 5588 /*FALLTHROUGH*/ 5589 5590 case X86_VENDOR_AMD: 5591 case X86_VENDOR_HYGON: 5592 edx = &cpi->cpi_support[AMD_EDX_FEATURES]; 5593 ecx = &cpi->cpi_support[AMD_ECX_FEATURES]; 5594 5595 *edx = CPI_FEATURES_XTD_EDX(cpi); 5596 *ecx = CPI_FEATURES_XTD_ECX(cpi); 5597 5598 /* 5599 * [these features require explicit kernel support] 5600 */ 5601 switch (cpi->cpi_vendor) { 5602 case X86_VENDOR_Intel: 5603 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5604 *edx &= ~CPUID_AMD_EDX_TSCP; 5605 break; 5606 5607 case X86_VENDOR_AMD: 5608 case X86_VENDOR_HYGON: 5609 if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) 5610 *edx &= ~CPUID_AMD_EDX_TSCP; 5611 if (!is_x86_feature(x86_featureset, X86FSET_SSE4A)) 5612 *ecx &= ~CPUID_AMD_ECX_SSE4A; 5613 break; 5614 5615 default: 5616 break; 5617 } 5618 5619 /* 5620 * [no explicit support required beyond 5621 * x87 fp context and exception handlers] 5622 */ 5623 if (!fpu_exists) 5624 *edx &= ~(CPUID_AMD_EDX_MMXamd | 5625 CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx); 5626 5627 if (!is_x86_feature(x86_featureset, X86FSET_NX)) 5628 *edx &= ~CPUID_AMD_EDX_NX; 5629 /* 5630 * Now map the supported feature vector to 5631 * things that we think userland will care about. 5632 */ 5633 if (*edx & CPUID_AMD_EDX_SYSC) 5634 hwcap_flags |= AV_386_AMD_SYSC; 5635 if (*edx & CPUID_AMD_EDX_MMXamd) 5636 hwcap_flags |= AV_386_AMD_MMX; 5637 if (*edx & CPUID_AMD_EDX_3DNow) 5638 hwcap_flags |= AV_386_AMD_3DNow; 5639 if (*edx & CPUID_AMD_EDX_3DNowx) 5640 hwcap_flags |= AV_386_AMD_3DNowx; 5641 if (*ecx & CPUID_AMD_ECX_SVM) 5642 hwcap_flags |= AV_386_AMD_SVM; 5643 5644 switch (cpi->cpi_vendor) { 5645 case X86_VENDOR_AMD: 5646 case X86_VENDOR_HYGON: 5647 if (*edx & CPUID_AMD_EDX_TSCP) 5648 hwcap_flags |= AV_386_TSCP; 5649 if (*ecx & CPUID_AMD_ECX_AHF64) 5650 hwcap_flags |= AV_386_AHF; 5651 if (*ecx & CPUID_AMD_ECX_SSE4A) 5652 hwcap_flags |= AV_386_AMD_SSE4A; 5653 if (*ecx & CPUID_AMD_ECX_LZCNT) 5654 hwcap_flags |= AV_386_AMD_LZCNT; 5655 if (*ecx & CPUID_AMD_ECX_MONITORX) 5656 hwcap_flags_2 |= AV_386_2_MONITORX; 5657 break; 5658 5659 case X86_VENDOR_Intel: 5660 if (*edx & CPUID_AMD_EDX_TSCP) 5661 hwcap_flags |= AV_386_TSCP; 5662 if (*ecx & CPUID_AMD_ECX_LZCNT) 5663 hwcap_flags |= AV_386_AMD_LZCNT; 5664 /* 5665 * Aarrgh. 5666 * Intel uses a different bit in the same word. 5667 */ 5668 if (*ecx & CPUID_INTC_ECX_AHF64) 5669 hwcap_flags |= AV_386_AHF; 5670 break; 5671 5672 default: 5673 break; 5674 } 5675 break; 5676 5677 case X86_VENDOR_TM: 5678 cp.cp_eax = 0x80860001; 5679 (void) __cpuid_insn(&cp); 5680 cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx; 5681 break; 5682 5683 default: 5684 break; 5685 } 5686 5687 resolve_done: 5688 if (hwcap_out != NULL) { 5689 hwcap_out[0] = hwcap_flags; 5690 hwcap_out[1] = hwcap_flags_2; 5691 } 5692 } 5693 5694 5695 /* 5696 * Simulate the cpuid instruction using the data we previously 5697 * captured about this CPU. We try our best to return the truth 5698 * about the hardware, independently of kernel support. 5699 */ 5700 uint32_t 5701 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp) 5702 { 5703 struct cpuid_info *cpi; 5704 struct cpuid_regs *xcp; 5705 5706 if (cpu == NULL) 5707 cpu = CPU; 5708 cpi = cpu->cpu_m.mcpu_cpi; 5709 5710 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC)); 5711 5712 /* 5713 * CPUID data is cached in two separate places: cpi_std for standard 5714 * CPUID leaves , and cpi_extd for extended CPUID leaves. 5715 */ 5716 if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) { 5717 xcp = &cpi->cpi_std[cp->cp_eax]; 5718 } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 && 5719 cp->cp_eax <= cpi->cpi_xmaxeax && 5720 cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) { 5721 xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0]; 5722 } else { 5723 /* 5724 * The caller is asking for data from an input parameter which 5725 * the kernel has not cached. In this case we go fetch from 5726 * the hardware and return the data directly to the user. 5727 */ 5728 return (__cpuid_insn(cp)); 5729 } 5730 5731 cp->cp_eax = xcp->cp_eax; 5732 cp->cp_ebx = xcp->cp_ebx; 5733 cp->cp_ecx = xcp->cp_ecx; 5734 cp->cp_edx = xcp->cp_edx; 5735 return (cp->cp_eax); 5736 } 5737 5738 boolean_t 5739 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass) 5740 { 5741 return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL && 5742 cpu->cpu_m.mcpu_cpi->cpi_pass >= pass); 5743 } 5744 5745 int 5746 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n) 5747 { 5748 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC)); 5749 5750 return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr)); 5751 } 5752 5753 int 5754 cpuid_is_cmt(cpu_t *cpu) 5755 { 5756 if (cpu == NULL) 5757 cpu = CPU; 5758 5759 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5760 5761 return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0); 5762 } 5763 5764 /* 5765 * AMD and Intel both implement the 64-bit variant of the syscall 5766 * instruction (syscallq), so if there's -any- support for syscall, 5767 * cpuid currently says "yes, we support this". 5768 * 5769 * However, Intel decided to -not- implement the 32-bit variant of the 5770 * syscall instruction, so we provide a predicate to allow our caller 5771 * to test that subtlety here. 5772 * 5773 * XXPV Currently, 32-bit syscall instructions don't work via the hypervisor, 5774 * even in the case where the hardware would in fact support it. 5775 */ 5776 /*ARGSUSED*/ 5777 int 5778 cpuid_syscall32_insn(cpu_t *cpu) 5779 { 5780 ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC)); 5781 5782 #if !defined(__xpv) 5783 if (cpu == NULL) 5784 cpu = CPU; 5785 5786 /*CSTYLED*/ 5787 { 5788 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5789 5790 if ((cpi->cpi_vendor == X86_VENDOR_AMD || 5791 cpi->cpi_vendor == X86_VENDOR_HYGON) && 5792 cpi->cpi_xmaxeax >= 0x80000001 && 5793 (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC)) 5794 return (1); 5795 } 5796 #endif 5797 return (0); 5798 } 5799 5800 int 5801 cpuid_getidstr(cpu_t *cpu, char *s, size_t n) 5802 { 5803 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 5804 5805 static const char fmt[] = 5806 "x86 (%s %X family %d model %d step %d clock %d MHz)"; 5807 static const char fmt_ht[] = 5808 "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)"; 5809 5810 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5811 5812 if (cpuid_is_cmt(cpu)) 5813 return (snprintf(s, n, fmt_ht, cpi->cpi_chipid, 5814 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5815 cpi->cpi_family, cpi->cpi_model, 5816 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5817 return (snprintf(s, n, fmt, 5818 cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax, 5819 cpi->cpi_family, cpi->cpi_model, 5820 cpi->cpi_step, cpu->cpu_type_info.pi_clock)); 5821 } 5822 5823 const char * 5824 cpuid_getvendorstr(cpu_t *cpu) 5825 { 5826 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5827 return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr); 5828 } 5829 5830 uint_t 5831 cpuid_getvendor(cpu_t *cpu) 5832 { 5833 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5834 return (cpu->cpu_m.mcpu_cpi->cpi_vendor); 5835 } 5836 5837 uint_t 5838 cpuid_getfamily(cpu_t *cpu) 5839 { 5840 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5841 return (cpu->cpu_m.mcpu_cpi->cpi_family); 5842 } 5843 5844 uint_t 5845 cpuid_getmodel(cpu_t *cpu) 5846 { 5847 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5848 return (cpu->cpu_m.mcpu_cpi->cpi_model); 5849 } 5850 5851 uint_t 5852 cpuid_get_ncpu_per_chip(cpu_t *cpu) 5853 { 5854 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5855 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip); 5856 } 5857 5858 uint_t 5859 cpuid_get_ncore_per_chip(cpu_t *cpu) 5860 { 5861 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5862 return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip); 5863 } 5864 5865 uint_t 5866 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu) 5867 { 5868 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED)); 5869 return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache); 5870 } 5871 5872 id_t 5873 cpuid_get_last_lvl_cacheid(cpu_t *cpu) 5874 { 5875 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED)); 5876 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5877 } 5878 5879 uint_t 5880 cpuid_getstep(cpu_t *cpu) 5881 { 5882 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5883 return (cpu->cpu_m.mcpu_cpi->cpi_step); 5884 } 5885 5886 uint_t 5887 cpuid_getsig(struct cpu *cpu) 5888 { 5889 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5890 return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax); 5891 } 5892 5893 uint32_t 5894 cpuid_getchiprev(struct cpu *cpu) 5895 { 5896 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5897 return (cpu->cpu_m.mcpu_cpi->cpi_chiprev); 5898 } 5899 5900 const char * 5901 cpuid_getchiprevstr(struct cpu *cpu) 5902 { 5903 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5904 return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr); 5905 } 5906 5907 uint32_t 5908 cpuid_getsockettype(struct cpu *cpu) 5909 { 5910 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5911 return (cpu->cpu_m.mcpu_cpi->cpi_socket); 5912 } 5913 5914 const char * 5915 cpuid_getsocketstr(cpu_t *cpu) 5916 { 5917 static const char *socketstr = NULL; 5918 struct cpuid_info *cpi; 5919 5920 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT)); 5921 cpi = cpu->cpu_m.mcpu_cpi; 5922 5923 /* Assume that socket types are the same across the system */ 5924 if (socketstr == NULL) 5925 socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family, 5926 cpi->cpi_model, cpi->cpi_step); 5927 5928 5929 return (socketstr); 5930 } 5931 5932 int 5933 cpuid_get_chipid(cpu_t *cpu) 5934 { 5935 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5936 5937 if (cpuid_is_cmt(cpu)) 5938 return (cpu->cpu_m.mcpu_cpi->cpi_chipid); 5939 return (cpu->cpu_id); 5940 } 5941 5942 id_t 5943 cpuid_get_coreid(cpu_t *cpu) 5944 { 5945 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5946 return (cpu->cpu_m.mcpu_cpi->cpi_coreid); 5947 } 5948 5949 int 5950 cpuid_get_pkgcoreid(cpu_t *cpu) 5951 { 5952 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5953 return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid); 5954 } 5955 5956 int 5957 cpuid_get_clogid(cpu_t *cpu) 5958 { 5959 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5960 return (cpu->cpu_m.mcpu_cpi->cpi_clogid); 5961 } 5962 5963 int 5964 cpuid_get_cacheid(cpu_t *cpu) 5965 { 5966 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5967 return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid); 5968 } 5969 5970 uint_t 5971 cpuid_get_procnodeid(cpu_t *cpu) 5972 { 5973 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5974 return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid); 5975 } 5976 5977 uint_t 5978 cpuid_get_procnodes_per_pkg(cpu_t *cpu) 5979 { 5980 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5981 return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg); 5982 } 5983 5984 uint_t 5985 cpuid_get_compunitid(cpu_t *cpu) 5986 { 5987 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5988 return (cpu->cpu_m.mcpu_cpi->cpi_compunitid); 5989 } 5990 5991 uint_t 5992 cpuid_get_cores_per_compunit(cpu_t *cpu) 5993 { 5994 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 5995 return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit); 5996 } 5997 5998 uint32_t 5999 cpuid_get_apicid(cpu_t *cpu) 6000 { 6001 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 6002 if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) { 6003 return (UINT32_MAX); 6004 } else { 6005 return (cpu->cpu_m.mcpu_cpi->cpi_apicid); 6006 } 6007 } 6008 6009 void 6010 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits) 6011 { 6012 struct cpuid_info *cpi; 6013 6014 if (cpu == NULL) 6015 cpu = CPU; 6016 cpi = cpu->cpu_m.mcpu_cpi; 6017 6018 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 6019 6020 if (pabits) 6021 *pabits = cpi->cpi_pabits; 6022 if (vabits) 6023 *vabits = cpi->cpi_vabits; 6024 } 6025 6026 size_t 6027 cpuid_get_xsave_size() 6028 { 6029 return (MAX(cpuid_info0.cpi_xsave.xsav_max_size, 6030 sizeof (struct xsave_state))); 6031 } 6032 6033 /* 6034 * Return true if the CPUs on this system require 'pointer clearing' for the 6035 * floating point error pointer exception handling. In the past, this has been 6036 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to 6037 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO 6038 * feature bit and is reflected in the cpi_fp_amd_save member. 6039 */ 6040 boolean_t 6041 cpuid_need_fp_excp_handling() 6042 { 6043 return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD && 6044 cpuid_info0.cpi_fp_amd_save != 0); 6045 } 6046 6047 /* 6048 * Returns the number of data TLB entries for a corresponding 6049 * pagesize. If it can't be computed, or isn't known, the 6050 * routine returns zero. If you ask about an architecturally 6051 * impossible pagesize, the routine will panic (so that the 6052 * hat implementor knows that things are inconsistent.) 6053 */ 6054 uint_t 6055 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize) 6056 { 6057 struct cpuid_info *cpi; 6058 uint_t dtlb_nent = 0; 6059 6060 if (cpu == NULL) 6061 cpu = CPU; 6062 cpi = cpu->cpu_m.mcpu_cpi; 6063 6064 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 6065 6066 /* 6067 * Check the L2 TLB info 6068 */ 6069 if (cpi->cpi_xmaxeax >= 0x80000006) { 6070 struct cpuid_regs *cp = &cpi->cpi_extd[6]; 6071 6072 switch (pagesize) { 6073 6074 case 4 * 1024: 6075 /* 6076 * All zero in the top 16 bits of the register 6077 * indicates a unified TLB. Size is in low 16 bits. 6078 */ 6079 if ((cp->cp_ebx & 0xffff0000) == 0) 6080 dtlb_nent = cp->cp_ebx & 0x0000ffff; 6081 else 6082 dtlb_nent = BITX(cp->cp_ebx, 27, 16); 6083 break; 6084 6085 case 2 * 1024 * 1024: 6086 if ((cp->cp_eax & 0xffff0000) == 0) 6087 dtlb_nent = cp->cp_eax & 0x0000ffff; 6088 else 6089 dtlb_nent = BITX(cp->cp_eax, 27, 16); 6090 break; 6091 6092 default: 6093 panic("unknown L2 pagesize"); 6094 /*NOTREACHED*/ 6095 } 6096 } 6097 6098 if (dtlb_nent != 0) 6099 return (dtlb_nent); 6100 6101 /* 6102 * No L2 TLB support for this size, try L1. 6103 */ 6104 if (cpi->cpi_xmaxeax >= 0x80000005) { 6105 struct cpuid_regs *cp = &cpi->cpi_extd[5]; 6106 6107 switch (pagesize) { 6108 case 4 * 1024: 6109 dtlb_nent = BITX(cp->cp_ebx, 23, 16); 6110 break; 6111 case 2 * 1024 * 1024: 6112 dtlb_nent = BITX(cp->cp_eax, 23, 16); 6113 break; 6114 default: 6115 panic("unknown L1 d-TLB pagesize"); 6116 /*NOTREACHED*/ 6117 } 6118 } 6119 6120 return (dtlb_nent); 6121 } 6122 6123 /* 6124 * Return 0 if the erratum is not present or not applicable, positive 6125 * if it is, and negative if the status of the erratum is unknown. 6126 * 6127 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm) 6128 * Processors" #25759, Rev 3.57, August 2005 6129 */ 6130 int 6131 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum) 6132 { 6133 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 6134 uint_t eax; 6135 6136 /* 6137 * Bail out if this CPU isn't an AMD CPU, or if it's 6138 * a legacy (32-bit) AMD CPU. 6139 */ 6140 if (cpi->cpi_vendor != X86_VENDOR_AMD || 6141 cpi->cpi_family == 4 || cpi->cpi_family == 5 || 6142 cpi->cpi_family == 6) { 6143 return (0); 6144 } 6145 6146 eax = cpi->cpi_std[1].cp_eax; 6147 6148 #define SH_B0(eax) (eax == 0xf40 || eax == 0xf50) 6149 #define SH_B3(eax) (eax == 0xf51) 6150 #define B(eax) (SH_B0(eax) || SH_B3(eax)) 6151 6152 #define SH_C0(eax) (eax == 0xf48 || eax == 0xf58) 6153 6154 #define SH_CG(eax) (eax == 0xf4a || eax == 0xf5a || eax == 0xf7a) 6155 #define DH_CG(eax) (eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0) 6156 #define CH_CG(eax) (eax == 0xf82 || eax == 0xfb2) 6157 #define CG(eax) (SH_CG(eax) || DH_CG(eax) || CH_CG(eax)) 6158 6159 #define SH_D0(eax) (eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70) 6160 #define DH_D0(eax) (eax == 0x10fc0 || eax == 0x10ff0) 6161 #define CH_D0(eax) (eax == 0x10f80 || eax == 0x10fb0) 6162 #define D0(eax) (SH_D0(eax) || DH_D0(eax) || CH_D0(eax)) 6163 6164 #define SH_E0(eax) (eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70) 6165 #define JH_E1(eax) (eax == 0x20f10) /* JH8_E0 had 0x20f30 */ 6166 #define DH_E3(eax) (eax == 0x20fc0 || eax == 0x20ff0) 6167 #define SH_E4(eax) (eax == 0x20f51 || eax == 0x20f71) 6168 #define BH_E4(eax) (eax == 0x20fb1) 6169 #define SH_E5(eax) (eax == 0x20f42) 6170 #define DH_E6(eax) (eax == 0x20ff2 || eax == 0x20fc2) 6171 #define JH_E6(eax) (eax == 0x20f12 || eax == 0x20f32) 6172 #define EX(eax) (SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \ 6173 SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \ 6174 DH_E6(eax) || JH_E6(eax)) 6175 6176 #define DR_AX(eax) (eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02) 6177 #define DR_B0(eax) (eax == 0x100f20) 6178 #define DR_B1(eax) (eax == 0x100f21) 6179 #define DR_BA(eax) (eax == 0x100f2a) 6180 #define DR_B2(eax) (eax == 0x100f22) 6181 #define DR_B3(eax) (eax == 0x100f23) 6182 #define RB_C0(eax) (eax == 0x100f40) 6183 6184 switch (erratum) { 6185 case 1: 6186 return (cpi->cpi_family < 0x10); 6187 case 51: /* what does the asterisk mean? */ 6188 return (B(eax) || SH_C0(eax) || CG(eax)); 6189 case 52: 6190 return (B(eax)); 6191 case 57: 6192 return (cpi->cpi_family <= 0x11); 6193 case 58: 6194 return (B(eax)); 6195 case 60: 6196 return (cpi->cpi_family <= 0x11); 6197 case 61: 6198 case 62: 6199 case 63: 6200 case 64: 6201 case 65: 6202 case 66: 6203 case 68: 6204 case 69: 6205 case 70: 6206 case 71: 6207 return (B(eax)); 6208 case 72: 6209 return (SH_B0(eax)); 6210 case 74: 6211 return (B(eax)); 6212 case 75: 6213 return (cpi->cpi_family < 0x10); 6214 case 76: 6215 return (B(eax)); 6216 case 77: 6217 return (cpi->cpi_family <= 0x11); 6218 case 78: 6219 return (B(eax) || SH_C0(eax)); 6220 case 79: 6221 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6222 case 80: 6223 case 81: 6224 case 82: 6225 return (B(eax)); 6226 case 83: 6227 return (B(eax) || SH_C0(eax) || CG(eax)); 6228 case 85: 6229 return (cpi->cpi_family < 0x10); 6230 case 86: 6231 return (SH_C0(eax) || CG(eax)); 6232 case 88: 6233 return (B(eax) || SH_C0(eax)); 6234 case 89: 6235 return (cpi->cpi_family < 0x10); 6236 case 90: 6237 return (B(eax) || SH_C0(eax) || CG(eax)); 6238 case 91: 6239 case 92: 6240 return (B(eax) || SH_C0(eax)); 6241 case 93: 6242 return (SH_C0(eax)); 6243 case 94: 6244 return (B(eax) || SH_C0(eax) || CG(eax)); 6245 case 95: 6246 return (B(eax) || SH_C0(eax)); 6247 case 96: 6248 return (B(eax) || SH_C0(eax) || CG(eax)); 6249 case 97: 6250 case 98: 6251 return (SH_C0(eax) || CG(eax)); 6252 case 99: 6253 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6254 case 100: 6255 return (B(eax) || SH_C0(eax)); 6256 case 101: 6257 case 103: 6258 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6259 case 104: 6260 return (SH_C0(eax) || CG(eax) || D0(eax)); 6261 case 105: 6262 case 106: 6263 case 107: 6264 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6265 case 108: 6266 return (DH_CG(eax)); 6267 case 109: 6268 return (SH_C0(eax) || CG(eax) || D0(eax)); 6269 case 110: 6270 return (D0(eax) || EX(eax)); 6271 case 111: 6272 return (CG(eax)); 6273 case 112: 6274 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6275 case 113: 6276 return (eax == 0x20fc0); 6277 case 114: 6278 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 6279 case 115: 6280 return (SH_E0(eax) || JH_E1(eax)); 6281 case 116: 6282 return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax)); 6283 case 117: 6284 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax)); 6285 case 118: 6286 return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) || 6287 JH_E6(eax)); 6288 case 121: 6289 return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax)); 6290 case 122: 6291 return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11); 6292 case 123: 6293 return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax)); 6294 case 131: 6295 return (cpi->cpi_family < 0x10); 6296 case 6336786: 6297 6298 /* 6299 * Test for AdvPowerMgmtInfo.TscPStateInvariant 6300 * if this is a K8 family or newer processor. We're testing for 6301 * this 'erratum' to determine whether or not we have a constant 6302 * TSC. 6303 * 6304 * Our current fix for this is to disable the C1-Clock ramping. 6305 * However, this doesn't work on newer processor families nor 6306 * does it work when virtualized as those devices don't exist. 6307 */ 6308 if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) { 6309 return (0); 6310 } 6311 6312 if (CPI_FAMILY(cpi) == 0xf) { 6313 struct cpuid_regs regs; 6314 regs.cp_eax = 0x80000007; 6315 (void) __cpuid_insn(®s); 6316 return (!(regs.cp_edx & 0x100)); 6317 } 6318 return (0); 6319 case 147: 6320 /* 6321 * This erratum (K8 #147) is not present on family 10 and newer. 6322 */ 6323 if (cpi->cpi_family >= 0x10) { 6324 return (0); 6325 } 6326 return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) | 6327 (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40); 6328 6329 case 6671130: 6330 /* 6331 * check for processors (pre-Shanghai) that do not provide 6332 * optimal management of 1gb ptes in its tlb. 6333 */ 6334 return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4); 6335 6336 case 298: 6337 return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) || 6338 DR_B2(eax) || RB_C0(eax)); 6339 6340 case 721: 6341 return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12); 6342 6343 default: 6344 return (-1); 6345 6346 } 6347 } 6348 6349 /* 6350 * Determine if specified erratum is present via OSVW (OS Visible Workaround). 6351 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate. 6352 */ 6353 int 6354 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum) 6355 { 6356 struct cpuid_info *cpi; 6357 uint_t osvwid; 6358 static int osvwfeature = -1; 6359 uint64_t osvwlength; 6360 6361 6362 cpi = cpu->cpu_m.mcpu_cpi; 6363 6364 /* confirm OSVW supported */ 6365 if (osvwfeature == -1) { 6366 osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW; 6367 } else { 6368 /* assert that osvw feature setting is consistent on all cpus */ 6369 ASSERT(osvwfeature == 6370 (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW)); 6371 } 6372 if (!osvwfeature) 6373 return (-1); 6374 6375 osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK; 6376 6377 switch (erratum) { 6378 case 298: /* osvwid is 0 */ 6379 osvwid = 0; 6380 if (osvwlength <= (uint64_t)osvwid) { 6381 /* osvwid 0 is unknown */ 6382 return (-1); 6383 } 6384 6385 /* 6386 * Check the OSVW STATUS MSR to determine the state 6387 * of the erratum where: 6388 * 0 - fixed by HW 6389 * 1 - BIOS has applied the workaround when BIOS 6390 * workaround is available. (Or for other errata, 6391 * OS workaround is required.) 6392 * For a value of 1, caller will confirm that the 6393 * erratum 298 workaround has indeed been applied by BIOS. 6394 * 6395 * A 1 may be set in cpus that have a HW fix 6396 * in a mixed cpu system. Regarding erratum 298: 6397 * In a multiprocessor platform, the workaround above 6398 * should be applied to all processors regardless of 6399 * silicon revision when an affected processor is 6400 * present. 6401 */ 6402 6403 return (rdmsr(MSR_AMD_OSVW_STATUS + 6404 (osvwid / OSVW_ID_CNT_PER_MSR)) & 6405 (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR))); 6406 6407 default: 6408 return (-1); 6409 } 6410 } 6411 6412 static const char assoc_str[] = "associativity"; 6413 static const char line_str[] = "line-size"; 6414 static const char size_str[] = "size"; 6415 6416 static void 6417 add_cache_prop(dev_info_t *devi, const char *label, const char *type, 6418 uint32_t val) 6419 { 6420 char buf[128]; 6421 6422 /* 6423 * ndi_prop_update_int() is used because it is desirable for 6424 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set. 6425 */ 6426 if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf)) 6427 (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val); 6428 } 6429 6430 /* 6431 * Intel-style cache/tlb description 6432 * 6433 * Standard cpuid level 2 gives a randomly ordered 6434 * selection of tags that index into a table that describes 6435 * cache and tlb properties. 6436 */ 6437 6438 static const char l1_icache_str[] = "l1-icache"; 6439 static const char l1_dcache_str[] = "l1-dcache"; 6440 static const char l2_cache_str[] = "l2-cache"; 6441 static const char l3_cache_str[] = "l3-cache"; 6442 static const char itlb4k_str[] = "itlb-4K"; 6443 static const char dtlb4k_str[] = "dtlb-4K"; 6444 static const char itlb2M_str[] = "itlb-2M"; 6445 static const char itlb4M_str[] = "itlb-4M"; 6446 static const char dtlb4M_str[] = "dtlb-4M"; 6447 static const char dtlb24_str[] = "dtlb0-2M-4M"; 6448 static const char itlb424_str[] = "itlb-4K-2M-4M"; 6449 static const char itlb24_str[] = "itlb-2M-4M"; 6450 static const char dtlb44_str[] = "dtlb-4K-4M"; 6451 static const char sl1_dcache_str[] = "sectored-l1-dcache"; 6452 static const char sl2_cache_str[] = "sectored-l2-cache"; 6453 static const char itrace_str[] = "itrace-cache"; 6454 static const char sl3_cache_str[] = "sectored-l3-cache"; 6455 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k"; 6456 6457 static const struct cachetab { 6458 uint8_t ct_code; 6459 uint8_t ct_assoc; 6460 uint16_t ct_line_size; 6461 size_t ct_size; 6462 const char *ct_label; 6463 } intel_ctab[] = { 6464 /* 6465 * maintain descending order! 6466 * 6467 * Codes ignored - Reason 6468 * ---------------------- 6469 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache 6470 * f0H/f1H - Currently we do not interpret prefetch size by design 6471 */ 6472 { 0xe4, 16, 64, 8*1024*1024, l3_cache_str}, 6473 { 0xe3, 16, 64, 4*1024*1024, l3_cache_str}, 6474 { 0xe2, 16, 64, 2*1024*1024, l3_cache_str}, 6475 { 0xde, 12, 64, 6*1024*1024, l3_cache_str}, 6476 { 0xdd, 12, 64, 3*1024*1024, l3_cache_str}, 6477 { 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str}, 6478 { 0xd8, 8, 64, 4*1024*1024, l3_cache_str}, 6479 { 0xd7, 8, 64, 2*1024*1024, l3_cache_str}, 6480 { 0xd6, 8, 64, 1*1024*1024, l3_cache_str}, 6481 { 0xd2, 4, 64, 2*1024*1024, l3_cache_str}, 6482 { 0xd1, 4, 64, 1*1024*1024, l3_cache_str}, 6483 { 0xd0, 4, 64, 512*1024, l3_cache_str}, 6484 { 0xca, 4, 0, 512, sh_l2_tlb4k_str}, 6485 { 0xc0, 4, 0, 8, dtlb44_str }, 6486 { 0xba, 4, 0, 64, dtlb4k_str }, 6487 { 0xb4, 4, 0, 256, dtlb4k_str }, 6488 { 0xb3, 4, 0, 128, dtlb4k_str }, 6489 { 0xb2, 4, 0, 64, itlb4k_str }, 6490 { 0xb0, 4, 0, 128, itlb4k_str }, 6491 { 0x87, 8, 64, 1024*1024, l2_cache_str}, 6492 { 0x86, 4, 64, 512*1024, l2_cache_str}, 6493 { 0x85, 8, 32, 2*1024*1024, l2_cache_str}, 6494 { 0x84, 8, 32, 1024*1024, l2_cache_str}, 6495 { 0x83, 8, 32, 512*1024, l2_cache_str}, 6496 { 0x82, 8, 32, 256*1024, l2_cache_str}, 6497 { 0x80, 8, 64, 512*1024, l2_cache_str}, 6498 { 0x7f, 2, 64, 512*1024, l2_cache_str}, 6499 { 0x7d, 8, 64, 2*1024*1024, sl2_cache_str}, 6500 { 0x7c, 8, 64, 1024*1024, sl2_cache_str}, 6501 { 0x7b, 8, 64, 512*1024, sl2_cache_str}, 6502 { 0x7a, 8, 64, 256*1024, sl2_cache_str}, 6503 { 0x79, 8, 64, 128*1024, sl2_cache_str}, 6504 { 0x78, 8, 64, 1024*1024, l2_cache_str}, 6505 { 0x73, 8, 0, 64*1024, itrace_str}, 6506 { 0x72, 8, 0, 32*1024, itrace_str}, 6507 { 0x71, 8, 0, 16*1024, itrace_str}, 6508 { 0x70, 8, 0, 12*1024, itrace_str}, 6509 { 0x68, 4, 64, 32*1024, sl1_dcache_str}, 6510 { 0x67, 4, 64, 16*1024, sl1_dcache_str}, 6511 { 0x66, 4, 64, 8*1024, sl1_dcache_str}, 6512 { 0x60, 8, 64, 16*1024, sl1_dcache_str}, 6513 { 0x5d, 0, 0, 256, dtlb44_str}, 6514 { 0x5c, 0, 0, 128, dtlb44_str}, 6515 { 0x5b, 0, 0, 64, dtlb44_str}, 6516 { 0x5a, 4, 0, 32, dtlb24_str}, 6517 { 0x59, 0, 0, 16, dtlb4k_str}, 6518 { 0x57, 4, 0, 16, dtlb4k_str}, 6519 { 0x56, 4, 0, 16, dtlb4M_str}, 6520 { 0x55, 0, 0, 7, itlb24_str}, 6521 { 0x52, 0, 0, 256, itlb424_str}, 6522 { 0x51, 0, 0, 128, itlb424_str}, 6523 { 0x50, 0, 0, 64, itlb424_str}, 6524 { 0x4f, 0, 0, 32, itlb4k_str}, 6525 { 0x4e, 24, 64, 6*1024*1024, l2_cache_str}, 6526 { 0x4d, 16, 64, 16*1024*1024, l3_cache_str}, 6527 { 0x4c, 12, 64, 12*1024*1024, l3_cache_str}, 6528 { 0x4b, 16, 64, 8*1024*1024, l3_cache_str}, 6529 { 0x4a, 12, 64, 6*1024*1024, l3_cache_str}, 6530 { 0x49, 16, 64, 4*1024*1024, l3_cache_str}, 6531 { 0x48, 12, 64, 3*1024*1024, l2_cache_str}, 6532 { 0x47, 8, 64, 8*1024*1024, l3_cache_str}, 6533 { 0x46, 4, 64, 4*1024*1024, l3_cache_str}, 6534 { 0x45, 4, 32, 2*1024*1024, l2_cache_str}, 6535 { 0x44, 4, 32, 1024*1024, l2_cache_str}, 6536 { 0x43, 4, 32, 512*1024, l2_cache_str}, 6537 { 0x42, 4, 32, 256*1024, l2_cache_str}, 6538 { 0x41, 4, 32, 128*1024, l2_cache_str}, 6539 { 0x3e, 4, 64, 512*1024, sl2_cache_str}, 6540 { 0x3d, 6, 64, 384*1024, sl2_cache_str}, 6541 { 0x3c, 4, 64, 256*1024, sl2_cache_str}, 6542 { 0x3b, 2, 64, 128*1024, sl2_cache_str}, 6543 { 0x3a, 6, 64, 192*1024, sl2_cache_str}, 6544 { 0x39, 4, 64, 128*1024, sl2_cache_str}, 6545 { 0x30, 8, 64, 32*1024, l1_icache_str}, 6546 { 0x2c, 8, 64, 32*1024, l1_dcache_str}, 6547 { 0x29, 8, 64, 4096*1024, sl3_cache_str}, 6548 { 0x25, 8, 64, 2048*1024, sl3_cache_str}, 6549 { 0x23, 8, 64, 1024*1024, sl3_cache_str}, 6550 { 0x22, 4, 64, 512*1024, sl3_cache_str}, 6551 { 0x0e, 6, 64, 24*1024, l1_dcache_str}, 6552 { 0x0d, 4, 32, 16*1024, l1_dcache_str}, 6553 { 0x0c, 4, 32, 16*1024, l1_dcache_str}, 6554 { 0x0b, 4, 0, 4, itlb4M_str}, 6555 { 0x0a, 2, 32, 8*1024, l1_dcache_str}, 6556 { 0x08, 4, 32, 16*1024, l1_icache_str}, 6557 { 0x06, 4, 32, 8*1024, l1_icache_str}, 6558 { 0x05, 4, 0, 32, dtlb4M_str}, 6559 { 0x04, 4, 0, 8, dtlb4M_str}, 6560 { 0x03, 4, 0, 64, dtlb4k_str}, 6561 { 0x02, 4, 0, 2, itlb4M_str}, 6562 { 0x01, 4, 0, 32, itlb4k_str}, 6563 { 0 } 6564 }; 6565 6566 static const struct cachetab cyrix_ctab[] = { 6567 { 0x70, 4, 0, 32, "tlb-4K" }, 6568 { 0x80, 4, 16, 16*1024, "l1-cache" }, 6569 { 0 } 6570 }; 6571 6572 /* 6573 * Search a cache table for a matching entry 6574 */ 6575 static const struct cachetab * 6576 find_cacheent(const struct cachetab *ct, uint_t code) 6577 { 6578 if (code != 0) { 6579 for (; ct->ct_code != 0; ct++) 6580 if (ct->ct_code <= code) 6581 break; 6582 if (ct->ct_code == code) 6583 return (ct); 6584 } 6585 return (NULL); 6586 } 6587 6588 /* 6589 * Populate cachetab entry with L2 or L3 cache-information using 6590 * cpuid function 4. This function is called from intel_walk_cacheinfo() 6591 * when descriptor 0x49 is encountered. It returns 0 if no such cache 6592 * information is found. 6593 */ 6594 static int 6595 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi) 6596 { 6597 uint32_t level, i; 6598 int ret = 0; 6599 6600 for (i = 0; i < cpi->cpi_cache_leaf_size; i++) { 6601 level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]); 6602 6603 if (level == 2 || level == 3) { 6604 ct->ct_assoc = 6605 CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1; 6606 ct->ct_line_size = 6607 CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1; 6608 ct->ct_size = ct->ct_assoc * 6609 (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) * 6610 ct->ct_line_size * 6611 (cpi->cpi_cache_leaves[i]->cp_ecx + 1); 6612 6613 if (level == 2) { 6614 ct->ct_label = l2_cache_str; 6615 } else if (level == 3) { 6616 ct->ct_label = l3_cache_str; 6617 } 6618 ret = 1; 6619 } 6620 } 6621 6622 return (ret); 6623 } 6624 6625 /* 6626 * Walk the cacheinfo descriptor, applying 'func' to every valid element 6627 * The walk is terminated if the walker returns non-zero. 6628 */ 6629 static void 6630 intel_walk_cacheinfo(struct cpuid_info *cpi, 6631 void *arg, int (*func)(void *, const struct cachetab *)) 6632 { 6633 const struct cachetab *ct; 6634 struct cachetab des_49_ct, des_b1_ct; 6635 uint8_t *dp; 6636 int i; 6637 6638 if ((dp = cpi->cpi_cacheinfo) == NULL) 6639 return; 6640 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6641 /* 6642 * For overloaded descriptor 0x49 we use cpuid function 4 6643 * if supported by the current processor, to create 6644 * cache information. 6645 * For overloaded descriptor 0xb1 we use X86_PAE flag 6646 * to disambiguate the cache information. 6647 */ 6648 if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 && 6649 intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) { 6650 ct = &des_49_ct; 6651 } else if (*dp == 0xb1) { 6652 des_b1_ct.ct_code = 0xb1; 6653 des_b1_ct.ct_assoc = 4; 6654 des_b1_ct.ct_line_size = 0; 6655 if (is_x86_feature(x86_featureset, X86FSET_PAE)) { 6656 des_b1_ct.ct_size = 8; 6657 des_b1_ct.ct_label = itlb2M_str; 6658 } else { 6659 des_b1_ct.ct_size = 4; 6660 des_b1_ct.ct_label = itlb4M_str; 6661 } 6662 ct = &des_b1_ct; 6663 } else { 6664 if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) { 6665 continue; 6666 } 6667 } 6668 6669 if (func(arg, ct) != 0) { 6670 break; 6671 } 6672 } 6673 } 6674 6675 /* 6676 * (Like the Intel one, except for Cyrix CPUs) 6677 */ 6678 static void 6679 cyrix_walk_cacheinfo(struct cpuid_info *cpi, 6680 void *arg, int (*func)(void *, const struct cachetab *)) 6681 { 6682 const struct cachetab *ct; 6683 uint8_t *dp; 6684 int i; 6685 6686 if ((dp = cpi->cpi_cacheinfo) == NULL) 6687 return; 6688 for (i = 0; i < cpi->cpi_ncache; i++, dp++) { 6689 /* 6690 * Search Cyrix-specific descriptor table first .. 6691 */ 6692 if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) { 6693 if (func(arg, ct) != 0) 6694 break; 6695 continue; 6696 } 6697 /* 6698 * .. else fall back to the Intel one 6699 */ 6700 if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) { 6701 if (func(arg, ct) != 0) 6702 break; 6703 continue; 6704 } 6705 } 6706 } 6707 6708 /* 6709 * A cacheinfo walker that adds associativity, line-size, and size properties 6710 * to the devinfo node it is passed as an argument. 6711 */ 6712 static int 6713 add_cacheent_props(void *arg, const struct cachetab *ct) 6714 { 6715 dev_info_t *devi = arg; 6716 6717 add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc); 6718 if (ct->ct_line_size != 0) 6719 add_cache_prop(devi, ct->ct_label, line_str, 6720 ct->ct_line_size); 6721 add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size); 6722 return (0); 6723 } 6724 6725 6726 static const char fully_assoc[] = "fully-associative?"; 6727 6728 /* 6729 * AMD style cache/tlb description 6730 * 6731 * Extended functions 5 and 6 directly describe properties of 6732 * tlbs and various cache levels. 6733 */ 6734 static void 6735 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6736 { 6737 switch (assoc) { 6738 case 0: /* reserved; ignore */ 6739 break; 6740 default: 6741 add_cache_prop(devi, label, assoc_str, assoc); 6742 break; 6743 case 0xff: 6744 add_cache_prop(devi, label, fully_assoc, 1); 6745 break; 6746 } 6747 } 6748 6749 static void 6750 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6751 { 6752 if (size == 0) 6753 return; 6754 add_cache_prop(devi, label, size_str, size); 6755 add_amd_assoc(devi, label, assoc); 6756 } 6757 6758 static void 6759 add_amd_cache(dev_info_t *devi, const char *label, 6760 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6761 { 6762 if (size == 0 || line_size == 0) 6763 return; 6764 add_amd_assoc(devi, label, assoc); 6765 /* 6766 * Most AMD parts have a sectored cache. Multiple cache lines are 6767 * associated with each tag. A sector consists of all cache lines 6768 * associated with a tag. For example, the AMD K6-III has a sector 6769 * size of 2 cache lines per tag. 6770 */ 6771 if (lines_per_tag != 0) 6772 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6773 add_cache_prop(devi, label, line_str, line_size); 6774 add_cache_prop(devi, label, size_str, size * 1024); 6775 } 6776 6777 static void 6778 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc) 6779 { 6780 switch (assoc) { 6781 case 0: /* off */ 6782 break; 6783 case 1: 6784 case 2: 6785 case 4: 6786 add_cache_prop(devi, label, assoc_str, assoc); 6787 break; 6788 case 6: 6789 add_cache_prop(devi, label, assoc_str, 8); 6790 break; 6791 case 8: 6792 add_cache_prop(devi, label, assoc_str, 16); 6793 break; 6794 case 0xf: 6795 add_cache_prop(devi, label, fully_assoc, 1); 6796 break; 6797 default: /* reserved; ignore */ 6798 break; 6799 } 6800 } 6801 6802 static void 6803 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size) 6804 { 6805 if (size == 0 || assoc == 0) 6806 return; 6807 add_amd_l2_assoc(devi, label, assoc); 6808 add_cache_prop(devi, label, size_str, size); 6809 } 6810 6811 static void 6812 add_amd_l2_cache(dev_info_t *devi, const char *label, 6813 uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size) 6814 { 6815 if (size == 0 || assoc == 0 || line_size == 0) 6816 return; 6817 add_amd_l2_assoc(devi, label, assoc); 6818 if (lines_per_tag != 0) 6819 add_cache_prop(devi, label, "lines-per-tag", lines_per_tag); 6820 add_cache_prop(devi, label, line_str, line_size); 6821 add_cache_prop(devi, label, size_str, size * 1024); 6822 } 6823 6824 static void 6825 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi) 6826 { 6827 struct cpuid_regs *cp; 6828 6829 if (cpi->cpi_xmaxeax < 0x80000005) 6830 return; 6831 cp = &cpi->cpi_extd[5]; 6832 6833 /* 6834 * 4M/2M L1 TLB configuration 6835 * 6836 * We report the size for 2M pages because AMD uses two 6837 * TLB entries for one 4M page. 6838 */ 6839 add_amd_tlb(devi, "dtlb-2M", 6840 BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16)); 6841 add_amd_tlb(devi, "itlb-2M", 6842 BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0)); 6843 6844 /* 6845 * 4K L1 TLB configuration 6846 */ 6847 6848 switch (cpi->cpi_vendor) { 6849 uint_t nentries; 6850 case X86_VENDOR_TM: 6851 if (cpi->cpi_family >= 5) { 6852 /* 6853 * Crusoe processors have 256 TLB entries, but 6854 * cpuid data format constrains them to only 6855 * reporting 255 of them. 6856 */ 6857 if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255) 6858 nentries = 256; 6859 /* 6860 * Crusoe processors also have a unified TLB 6861 */ 6862 add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24), 6863 nentries); 6864 break; 6865 } 6866 /*FALLTHROUGH*/ 6867 default: 6868 add_amd_tlb(devi, itlb4k_str, 6869 BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16)); 6870 add_amd_tlb(devi, dtlb4k_str, 6871 BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0)); 6872 break; 6873 } 6874 6875 /* 6876 * data L1 cache configuration 6877 */ 6878 6879 add_amd_cache(devi, l1_dcache_str, 6880 BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16), 6881 BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0)); 6882 6883 /* 6884 * code L1 cache configuration 6885 */ 6886 6887 add_amd_cache(devi, l1_icache_str, 6888 BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16), 6889 BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0)); 6890 6891 if (cpi->cpi_xmaxeax < 0x80000006) 6892 return; 6893 cp = &cpi->cpi_extd[6]; 6894 6895 /* Check for a unified L2 TLB for large pages */ 6896 6897 if (BITX(cp->cp_eax, 31, 16) == 0) 6898 add_amd_l2_tlb(devi, "l2-tlb-2M", 6899 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6900 else { 6901 add_amd_l2_tlb(devi, "l2-dtlb-2M", 6902 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6903 add_amd_l2_tlb(devi, "l2-itlb-2M", 6904 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6905 } 6906 6907 /* Check for a unified L2 TLB for 4K pages */ 6908 6909 if (BITX(cp->cp_ebx, 31, 16) == 0) { 6910 add_amd_l2_tlb(devi, "l2-tlb-4K", 6911 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6912 } else { 6913 add_amd_l2_tlb(devi, "l2-dtlb-4K", 6914 BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16)); 6915 add_amd_l2_tlb(devi, "l2-itlb-4K", 6916 BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0)); 6917 } 6918 6919 add_amd_l2_cache(devi, l2_cache_str, 6920 BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12), 6921 BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0)); 6922 } 6923 6924 /* 6925 * There are two basic ways that the x86 world describes it cache 6926 * and tlb architecture - Intel's way and AMD's way. 6927 * 6928 * Return which flavor of cache architecture we should use 6929 */ 6930 static int 6931 x86_which_cacheinfo(struct cpuid_info *cpi) 6932 { 6933 switch (cpi->cpi_vendor) { 6934 case X86_VENDOR_Intel: 6935 if (cpi->cpi_maxeax >= 2) 6936 return (X86_VENDOR_Intel); 6937 break; 6938 case X86_VENDOR_AMD: 6939 /* 6940 * The K5 model 1 was the first part from AMD that reported 6941 * cache sizes via extended cpuid functions. 6942 */ 6943 if (cpi->cpi_family > 5 || 6944 (cpi->cpi_family == 5 && cpi->cpi_model >= 1)) 6945 return (X86_VENDOR_AMD); 6946 break; 6947 case X86_VENDOR_HYGON: 6948 return (X86_VENDOR_AMD); 6949 case X86_VENDOR_TM: 6950 if (cpi->cpi_family >= 5) 6951 return (X86_VENDOR_AMD); 6952 /*FALLTHROUGH*/ 6953 default: 6954 /* 6955 * If they have extended CPU data for 0x80000005 6956 * then we assume they have AMD-format cache 6957 * information. 6958 * 6959 * If not, and the vendor happens to be Cyrix, 6960 * then try our-Cyrix specific handler. 6961 * 6962 * If we're not Cyrix, then assume we're using Intel's 6963 * table-driven format instead. 6964 */ 6965 if (cpi->cpi_xmaxeax >= 0x80000005) 6966 return (X86_VENDOR_AMD); 6967 else if (cpi->cpi_vendor == X86_VENDOR_Cyrix) 6968 return (X86_VENDOR_Cyrix); 6969 else if (cpi->cpi_maxeax >= 2) 6970 return (X86_VENDOR_Intel); 6971 break; 6972 } 6973 return (-1); 6974 } 6975 6976 void 6977 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id, 6978 struct cpuid_info *cpi) 6979 { 6980 dev_info_t *cpu_devi; 6981 int create; 6982 6983 cpu_devi = (dev_info_t *)dip; 6984 6985 /* device_type */ 6986 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 6987 "device_type", "cpu"); 6988 6989 /* reg */ 6990 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6991 "reg", cpu_id); 6992 6993 /* cpu-mhz, and clock-frequency */ 6994 if (cpu_freq > 0) { 6995 long long mul; 6996 6997 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 6998 "cpu-mhz", cpu_freq); 6999 if ((mul = cpu_freq * 1000000LL) <= INT_MAX) 7000 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7001 "clock-frequency", (int)mul); 7002 } 7003 7004 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 7005 7006 /* vendor-id */ 7007 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 7008 "vendor-id", cpi->cpi_vendorstr); 7009 7010 if (cpi->cpi_maxeax == 0) { 7011 return; 7012 } 7013 7014 /* 7015 * family, model, and step 7016 */ 7017 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7018 "family", CPI_FAMILY(cpi)); 7019 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7020 "cpu-model", CPI_MODEL(cpi)); 7021 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7022 "stepping-id", CPI_STEP(cpi)); 7023 7024 /* type */ 7025 switch (cpi->cpi_vendor) { 7026 case X86_VENDOR_Intel: 7027 create = 1; 7028 break; 7029 default: 7030 create = 0; 7031 break; 7032 } 7033 if (create) 7034 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7035 "type", CPI_TYPE(cpi)); 7036 7037 /* ext-family */ 7038 switch (cpi->cpi_vendor) { 7039 case X86_VENDOR_Intel: 7040 case X86_VENDOR_AMD: 7041 create = cpi->cpi_family >= 0xf; 7042 break; 7043 case X86_VENDOR_HYGON: 7044 create = 1; 7045 break; 7046 default: 7047 create = 0; 7048 break; 7049 } 7050 if (create) 7051 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7052 "ext-family", CPI_FAMILY_XTD(cpi)); 7053 7054 /* ext-model */ 7055 switch (cpi->cpi_vendor) { 7056 case X86_VENDOR_Intel: 7057 create = IS_EXTENDED_MODEL_INTEL(cpi); 7058 break; 7059 case X86_VENDOR_AMD: 7060 create = CPI_FAMILY(cpi) == 0xf; 7061 break; 7062 case X86_VENDOR_HYGON: 7063 create = 1; 7064 break; 7065 default: 7066 create = 0; 7067 break; 7068 } 7069 if (create) 7070 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7071 "ext-model", CPI_MODEL_XTD(cpi)); 7072 7073 /* generation */ 7074 switch (cpi->cpi_vendor) { 7075 case X86_VENDOR_AMD: 7076 case X86_VENDOR_HYGON: 7077 /* 7078 * AMD K5 model 1 was the first part to support this 7079 */ 7080 create = cpi->cpi_xmaxeax >= 0x80000001; 7081 break; 7082 default: 7083 create = 0; 7084 break; 7085 } 7086 if (create) 7087 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7088 "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8)); 7089 7090 /* brand-id */ 7091 switch (cpi->cpi_vendor) { 7092 case X86_VENDOR_Intel: 7093 /* 7094 * brand id first appeared on Pentium III Xeon model 8, 7095 * and Celeron model 8 processors and Opteron 7096 */ 7097 create = cpi->cpi_family > 6 || 7098 (cpi->cpi_family == 6 && cpi->cpi_model >= 8); 7099 break; 7100 case X86_VENDOR_AMD: 7101 create = cpi->cpi_family >= 0xf; 7102 break; 7103 case X86_VENDOR_HYGON: 7104 create = 1; 7105 break; 7106 default: 7107 create = 0; 7108 break; 7109 } 7110 if (create && cpi->cpi_brandid != 0) { 7111 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7112 "brand-id", cpi->cpi_brandid); 7113 } 7114 7115 /* chunks, and apic-id */ 7116 switch (cpi->cpi_vendor) { 7117 /* 7118 * first available on Pentium IV and Opteron (K8) 7119 */ 7120 case X86_VENDOR_Intel: 7121 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 7122 break; 7123 case X86_VENDOR_AMD: 7124 create = cpi->cpi_family >= 0xf; 7125 break; 7126 case X86_VENDOR_HYGON: 7127 create = 1; 7128 break; 7129 default: 7130 create = 0; 7131 break; 7132 } 7133 if (create) { 7134 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7135 "chunks", CPI_CHUNKS(cpi)); 7136 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7137 "apic-id", cpi->cpi_apicid); 7138 if (cpi->cpi_chipid >= 0) { 7139 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7140 "chip#", cpi->cpi_chipid); 7141 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7142 "clog#", cpi->cpi_clogid); 7143 } 7144 } 7145 7146 /* cpuid-features */ 7147 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7148 "cpuid-features", CPI_FEATURES_EDX(cpi)); 7149 7150 7151 /* cpuid-features-ecx */ 7152 switch (cpi->cpi_vendor) { 7153 case X86_VENDOR_Intel: 7154 create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf; 7155 break; 7156 case X86_VENDOR_AMD: 7157 create = cpi->cpi_family >= 0xf; 7158 break; 7159 case X86_VENDOR_HYGON: 7160 create = 1; 7161 break; 7162 default: 7163 create = 0; 7164 break; 7165 } 7166 if (create) 7167 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7168 "cpuid-features-ecx", CPI_FEATURES_ECX(cpi)); 7169 7170 /* ext-cpuid-features */ 7171 switch (cpi->cpi_vendor) { 7172 case X86_VENDOR_Intel: 7173 case X86_VENDOR_AMD: 7174 case X86_VENDOR_HYGON: 7175 case X86_VENDOR_Cyrix: 7176 case X86_VENDOR_TM: 7177 case X86_VENDOR_Centaur: 7178 create = cpi->cpi_xmaxeax >= 0x80000001; 7179 break; 7180 default: 7181 create = 0; 7182 break; 7183 } 7184 if (create) { 7185 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7186 "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi)); 7187 (void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi, 7188 "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi)); 7189 } 7190 7191 /* 7192 * Brand String first appeared in Intel Pentium IV, AMD K5 7193 * model 1, and Cyrix GXm. On earlier models we try and 7194 * simulate something similar .. so this string should always 7195 * same -something- about the processor, however lame. 7196 */ 7197 (void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi, 7198 "brand-string", cpi->cpi_brandstr); 7199 7200 /* 7201 * Finally, cache and tlb information 7202 */ 7203 switch (x86_which_cacheinfo(cpi)) { 7204 case X86_VENDOR_Intel: 7205 intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props); 7206 break; 7207 case X86_VENDOR_Cyrix: 7208 cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props); 7209 break; 7210 case X86_VENDOR_AMD: 7211 amd_cache_info(cpi, cpu_devi); 7212 break; 7213 default: 7214 break; 7215 } 7216 } 7217 7218 struct l2info { 7219 int *l2i_csz; 7220 int *l2i_lsz; 7221 int *l2i_assoc; 7222 int l2i_ret; 7223 }; 7224 7225 /* 7226 * A cacheinfo walker that fetches the size, line-size and associativity 7227 * of the L2 cache 7228 */ 7229 static int 7230 intel_l2cinfo(void *arg, const struct cachetab *ct) 7231 { 7232 struct l2info *l2i = arg; 7233 int *ip; 7234 7235 if (ct->ct_label != l2_cache_str && 7236 ct->ct_label != sl2_cache_str) 7237 return (0); /* not an L2 -- keep walking */ 7238 7239 if ((ip = l2i->l2i_csz) != NULL) 7240 *ip = ct->ct_size; 7241 if ((ip = l2i->l2i_lsz) != NULL) 7242 *ip = ct->ct_line_size; 7243 if ((ip = l2i->l2i_assoc) != NULL) 7244 *ip = ct->ct_assoc; 7245 l2i->l2i_ret = ct->ct_size; 7246 return (1); /* was an L2 -- terminate walk */ 7247 } 7248 7249 /* 7250 * AMD L2/L3 Cache and TLB Associativity Field Definition: 7251 * 7252 * Unlike the associativity for the L1 cache and tlb where the 8 bit 7253 * value is the associativity, the associativity for the L2 cache and 7254 * tlb is encoded in the following table. The 4 bit L2 value serves as 7255 * an index into the amd_afd[] array to determine the associativity. 7256 * -1 is undefined. 0 is fully associative. 7257 */ 7258 7259 static int amd_afd[] = 7260 {-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0}; 7261 7262 static void 7263 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i) 7264 { 7265 struct cpuid_regs *cp; 7266 uint_t size, assoc; 7267 int i; 7268 int *ip; 7269 7270 if (cpi->cpi_xmaxeax < 0x80000006) 7271 return; 7272 cp = &cpi->cpi_extd[6]; 7273 7274 if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 && 7275 (size = BITX(cp->cp_ecx, 31, 16)) != 0) { 7276 uint_t cachesz = size * 1024; 7277 assoc = amd_afd[i]; 7278 7279 ASSERT(assoc != -1); 7280 7281 if ((ip = l2i->l2i_csz) != NULL) 7282 *ip = cachesz; 7283 if ((ip = l2i->l2i_lsz) != NULL) 7284 *ip = BITX(cp->cp_ecx, 7, 0); 7285 if ((ip = l2i->l2i_assoc) != NULL) 7286 *ip = assoc; 7287 l2i->l2i_ret = cachesz; 7288 } 7289 } 7290 7291 int 7292 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc) 7293 { 7294 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 7295 struct l2info __l2info, *l2i = &__l2info; 7296 7297 l2i->l2i_csz = csz; 7298 l2i->l2i_lsz = lsz; 7299 l2i->l2i_assoc = assoc; 7300 l2i->l2i_ret = -1; 7301 7302 switch (x86_which_cacheinfo(cpi)) { 7303 case X86_VENDOR_Intel: 7304 intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo); 7305 break; 7306 case X86_VENDOR_Cyrix: 7307 cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo); 7308 break; 7309 case X86_VENDOR_AMD: 7310 amd_l2cacheinfo(cpi, l2i); 7311 break; 7312 default: 7313 break; 7314 } 7315 return (l2i->l2i_ret); 7316 } 7317 7318 #if !defined(__xpv) 7319 7320 uint32_t * 7321 cpuid_mwait_alloc(cpu_t *cpu) 7322 { 7323 uint32_t *ret; 7324 size_t mwait_size; 7325 7326 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED)); 7327 7328 mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max; 7329 if (mwait_size == 0) 7330 return (NULL); 7331 7332 /* 7333 * kmem_alloc() returns cache line size aligned data for mwait_size 7334 * allocations. mwait_size is currently cache line sized. Neither 7335 * of these implementation details are guarantied to be true in the 7336 * future. 7337 * 7338 * First try allocating mwait_size as kmem_alloc() currently returns 7339 * correctly aligned memory. If kmem_alloc() does not return 7340 * mwait_size aligned memory, then use mwait_size ROUNDUP. 7341 * 7342 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we 7343 * decide to free this memory. 7344 */ 7345 ret = kmem_zalloc(mwait_size, KM_SLEEP); 7346 if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) { 7347 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret; 7348 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size; 7349 *ret = MWAIT_RUNNING; 7350 return (ret); 7351 } else { 7352 kmem_free(ret, mwait_size); 7353 ret = kmem_zalloc(mwait_size * 2, KM_SLEEP); 7354 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret; 7355 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2; 7356 ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size); 7357 *ret = MWAIT_RUNNING; 7358 return (ret); 7359 } 7360 } 7361 7362 void 7363 cpuid_mwait_free(cpu_t *cpu) 7364 { 7365 if (cpu->cpu_m.mcpu_cpi == NULL) { 7366 return; 7367 } 7368 7369 if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL && 7370 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) { 7371 kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual, 7372 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual); 7373 } 7374 7375 cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL; 7376 cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0; 7377 } 7378 7379 void 7380 patch_tsc_read(int flag) 7381 { 7382 size_t cnt; 7383 7384 switch (flag) { 7385 case TSC_NONE: 7386 cnt = &_no_rdtsc_end - &_no_rdtsc_start; 7387 (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt); 7388 break; 7389 case TSC_RDTSC_LFENCE: 7390 cnt = &_tsc_lfence_end - &_tsc_lfence_start; 7391 (void) memcpy((void *)tsc_read, 7392 (void *)&_tsc_lfence_start, cnt); 7393 break; 7394 case TSC_TSCP: 7395 cnt = &_tscp_end - &_tscp_start; 7396 (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt); 7397 break; 7398 default: 7399 /* Bail for unexpected TSC types. (TSC_NONE covers 0) */ 7400 cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag); 7401 break; 7402 } 7403 tsc_type = flag; 7404 } 7405 7406 int 7407 cpuid_deep_cstates_supported(void) 7408 { 7409 struct cpuid_info *cpi; 7410 struct cpuid_regs regs; 7411 7412 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC)); 7413 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 7414 7415 cpi = CPU->cpu_m.mcpu_cpi; 7416 7417 switch (cpi->cpi_vendor) { 7418 case X86_VENDOR_Intel: 7419 if (cpi->cpi_xmaxeax < 0x80000007) 7420 return (0); 7421 7422 /* 7423 * Does TSC run at a constant rate in all C-states? 7424 */ 7425 regs.cp_eax = 0x80000007; 7426 (void) __cpuid_insn(®s); 7427 return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE); 7428 7429 default: 7430 return (0); 7431 } 7432 } 7433 7434 #endif /* !__xpv */ 7435 7436 void 7437 post_startup_cpu_fixups(void) 7438 { 7439 #ifndef __xpv 7440 /* 7441 * Some AMD processors support C1E state. Entering this state will 7442 * cause the local APIC timer to stop, which we can't deal with at 7443 * this time. 7444 */ 7445 if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) { 7446 on_trap_data_t otd; 7447 uint64_t reg; 7448 7449 if (!on_trap(&otd, OT_DATA_ACCESS)) { 7450 reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT); 7451 /* Disable C1E state if it is enabled by BIOS */ 7452 if ((reg >> AMD_ACTONCMPHALT_SHIFT) & 7453 AMD_ACTONCMPHALT_MASK) { 7454 reg &= ~(AMD_ACTONCMPHALT_MASK << 7455 AMD_ACTONCMPHALT_SHIFT); 7456 wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg); 7457 } 7458 } 7459 no_trap(); 7460 } 7461 #endif /* !__xpv */ 7462 } 7463 7464 void 7465 enable_pcid(void) 7466 { 7467 if (x86_use_pcid == -1) 7468 x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID); 7469 7470 if (x86_use_invpcid == -1) { 7471 x86_use_invpcid = is_x86_feature(x86_featureset, 7472 X86FSET_INVPCID); 7473 } 7474 7475 if (!x86_use_pcid) 7476 return; 7477 7478 /* 7479 * Intel say that on setting PCIDE, it immediately starts using the PCID 7480 * bits; better make sure there's nothing there. 7481 */ 7482 ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE); 7483 7484 setcr4(getcr4() | CR4_PCIDE); 7485 } 7486 7487 /* 7488 * Setup necessary registers to enable XSAVE feature on this processor. 7489 * This function needs to be called early enough, so that no xsave/xrstor 7490 * ops will execute on the processor before the MSRs are properly set up. 7491 * 7492 * Current implementation has the following assumption: 7493 * - cpuid_pass_basic() is done, so that X86 features are known. 7494 * - fpu_probe() is done, so that fp_save_mech is chosen. 7495 */ 7496 void 7497 xsave_setup_msr(cpu_t *cpu) 7498 { 7499 ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC)); 7500 ASSERT(fp_save_mech == FP_XSAVE); 7501 ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); 7502 7503 /* Enable OSXSAVE in CR4. */ 7504 setcr4(getcr4() | CR4_OSXSAVE); 7505 /* 7506 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report 7507 * correct value. 7508 */ 7509 cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE; 7510 setup_xfem(); 7511 } 7512 7513 /* 7514 * Starting with the Westmere processor the local 7515 * APIC timer will continue running in all C-states, 7516 * including the deepest C-states. 7517 */ 7518 int 7519 cpuid_arat_supported(void) 7520 { 7521 struct cpuid_info *cpi; 7522 struct cpuid_regs regs; 7523 7524 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC)); 7525 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 7526 7527 cpi = CPU->cpu_m.mcpu_cpi; 7528 7529 switch (cpi->cpi_vendor) { 7530 case X86_VENDOR_Intel: 7531 /* 7532 * Always-running Local APIC Timer is 7533 * indicated by CPUID.6.EAX[2]. 7534 */ 7535 if (cpi->cpi_maxeax >= 6) { 7536 regs.cp_eax = 6; 7537 (void) cpuid_insn(NULL, ®s); 7538 return (regs.cp_eax & CPUID_INTC_EAX_ARAT); 7539 } else { 7540 return (0); 7541 } 7542 default: 7543 return (0); 7544 } 7545 } 7546 7547 /* 7548 * Check support for Intel ENERGY_PERF_BIAS feature 7549 */ 7550 int 7551 cpuid_iepb_supported(struct cpu *cp) 7552 { 7553 struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi; 7554 struct cpuid_regs regs; 7555 7556 ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC)); 7557 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 7558 7559 if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) { 7560 return (0); 7561 } 7562 7563 /* 7564 * Intel ENERGY_PERF_BIAS MSR is indicated by 7565 * capability bit CPUID.6.ECX.3 7566 */ 7567 if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6)) 7568 return (0); 7569 7570 regs.cp_eax = 0x6; 7571 (void) cpuid_insn(NULL, ®s); 7572 return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS); 7573 } 7574 7575 /* 7576 * Check support for TSC deadline timer 7577 * 7578 * TSC deadline timer provides a superior software programming 7579 * model over local APIC timer that eliminates "time drifts". 7580 * Instead of specifying a relative time, software specifies an 7581 * absolute time as the target at which the processor should 7582 * generate a timer event. 7583 */ 7584 int 7585 cpuid_deadline_tsc_supported(void) 7586 { 7587 struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi; 7588 struct cpuid_regs regs; 7589 7590 ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC)); 7591 ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); 7592 7593 switch (cpi->cpi_vendor) { 7594 case X86_VENDOR_Intel: 7595 if (cpi->cpi_maxeax >= 1) { 7596 regs.cp_eax = 1; 7597 (void) cpuid_insn(NULL, ®s); 7598 return (regs.cp_ecx & CPUID_DEADLINE_TSC); 7599 } else { 7600 return (0); 7601 } 7602 default: 7603 return (0); 7604 } 7605 } 7606 7607 #if !defined(__xpv) 7608 /* 7609 * Patch in versions of bcopy for high performance Intel Nhm processors 7610 * and later... 7611 */ 7612 void 7613 patch_memops(uint_t vendor) 7614 { 7615 size_t cnt, i; 7616 caddr_t to, from; 7617 7618 if ((vendor == X86_VENDOR_Intel) && 7619 is_x86_feature(x86_featureset, X86FSET_SSE4_2)) { 7620 cnt = &bcopy_patch_end - &bcopy_patch_start; 7621 to = &bcopy_ck_size; 7622 from = &bcopy_patch_start; 7623 for (i = 0; i < cnt; i++) { 7624 *to++ = *from++; 7625 } 7626 } 7627 } 7628 #endif /* !__xpv */ 7629 7630 /* 7631 * We're being asked to tell the system how many bits are required to represent 7632 * the various thread and strand IDs. While it's tempting to derive this based 7633 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite 7634 * correct. Instead, this needs to be based on the number of bits that the APIC 7635 * allows for these different configurations. We only update these to a larger 7636 * value if we find one. 7637 */ 7638 void 7639 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits) 7640 { 7641 struct cpuid_info *cpi; 7642 7643 VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC)); 7644 cpi = cpu->cpu_m.mcpu_cpi; 7645 7646 if (cpi->cpi_ncore_bits > *core_nbits) { 7647 *core_nbits = cpi->cpi_ncore_bits; 7648 } 7649 7650 if (cpi->cpi_nthread_bits > *strand_nbits) { 7651 *strand_nbits = cpi->cpi_nthread_bits; 7652 } 7653 } 7654 7655 void 7656 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset) 7657 { 7658 struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; 7659 struct cpuid_regs cp; 7660 7661 /* 7662 * Reread the CPUID portions that we need for various security 7663 * information. 7664 */ 7665 if (cpi->cpi_vendor == X86_VENDOR_Intel) { 7666 /* 7667 * Check if we now have leaf 7 available to us. 7668 */ 7669 if (cpi->cpi_maxeax < 7) { 7670 bzero(&cp, sizeof (cp)); 7671 cp.cp_eax = 0; 7672 cpi->cpi_maxeax = __cpuid_insn(&cp); 7673 if (cpi->cpi_maxeax < 7) 7674 return; 7675 } 7676 7677 bzero(&cp, sizeof (cp)); 7678 cp.cp_eax = 7; 7679 cp.cp_ecx = 0; 7680 (void) __cpuid_insn(&cp); 7681 cpi->cpi_std[7] = cp; 7682 } else if (cpi->cpi_vendor == X86_VENDOR_AMD || 7683 cpi->cpi_vendor == X86_VENDOR_HYGON) { 7684 /* No xcpuid support */ 7685 if (cpi->cpi_family < 5 || 7686 (cpi->cpi_family == 5 && cpi->cpi_model < 1)) 7687 return; 7688 7689 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) { 7690 bzero(&cp, sizeof (cp)); 7691 cp.cp_eax = CPUID_LEAF_EXT_0; 7692 cpi->cpi_xmaxeax = __cpuid_insn(&cp); 7693 if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) { 7694 return; 7695 } 7696 } 7697 7698 bzero(&cp, sizeof (cp)); 7699 cp.cp_eax = CPUID_LEAF_EXT_8; 7700 (void) __cpuid_insn(&cp); 7701 platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp); 7702 cpi->cpi_extd[8] = cp; 7703 } else { 7704 /* 7705 * Nothing to do here. Return an empty set which has already 7706 * been zeroed for us. 7707 */ 7708 return; 7709 } 7710 cpuid_scan_security(cpu, fset); 7711 } 7712 7713 /* ARGSUSED */ 7714 static int 7715 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2) 7716 { 7717 uchar_t *fset; 7718 boolean_t first_pass = (boolean_t)arg1; 7719 7720 fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id); 7721 if (first_pass && CPU->cpu_id != 0) 7722 return (0); 7723 if (!first_pass && CPU->cpu_id == 0) 7724 return (0); 7725 cpuid_pass_ucode(CPU, fset); 7726 7727 return (0); 7728 } 7729 7730 /* 7731 * After a microcode update where the version has changed, then we need to 7732 * rescan CPUID. To do this we check every CPU to make sure that they have the 7733 * same microcode. Then we perform a cross call to all such CPUs. It's the 7734 * caller's job to make sure that no one else can end up doing an update while 7735 * this is going on. 7736 * 7737 * We assume that the system is microcode capable if we're called. 7738 */ 7739 void 7740 cpuid_post_ucodeadm(void) 7741 { 7742 uint32_t rev; 7743 int i; 7744 struct cpu *cpu; 7745 cpuset_t cpuset; 7746 void *argdata; 7747 uchar_t *f0; 7748 7749 argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP); 7750 7751 mutex_enter(&cpu_lock); 7752 cpu = cpu_get(0); 7753 rev = cpu->cpu_m.mcpu_ucode_info->cui_rev; 7754 CPUSET_ONLY(cpuset, 0); 7755 for (i = 1; i < max_ncpus; i++) { 7756 if ((cpu = cpu_get(i)) == NULL) 7757 continue; 7758 7759 if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) { 7760 panic("post microcode update CPU %d has differing " 7761 "microcode revision (%u) from CPU 0 (%u)", 7762 i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev); 7763 } 7764 CPUSET_ADD(cpuset, i); 7765 } 7766 7767 /* 7768 * We do the cross calls in two passes. The first pass is only for the 7769 * boot CPU. The second pass is for all of the other CPUs. This allows 7770 * the boot CPU to go through and change behavior related to patching or 7771 * whether or not Enhanced IBRS needs to be enabled and then allow all 7772 * other CPUs to follow suit. 7773 */ 7774 kpreempt_disable(); 7775 xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset), 7776 cpuid_post_ucodeadm_xc); 7777 xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset), 7778 cpuid_post_ucodeadm_xc); 7779 kpreempt_enable(); 7780 7781 /* 7782 * OK, now look at each CPU and see if their feature sets are equal. 7783 */ 7784 f0 = argdata; 7785 for (i = 1; i < max_ncpus; i++) { 7786 uchar_t *fset; 7787 if (!CPU_IN_SET(cpuset, i)) 7788 continue; 7789 7790 fset = (uchar_t *)((uintptr_t)argdata + 7791 sizeof (x86_featureset) * i); 7792 7793 if (!compare_x86_featureset(f0, fset)) { 7794 panic("Post microcode update CPU %d has " 7795 "differing security feature (%p) set from CPU 0 " 7796 "(%p), not appending to feature set", i, 7797 (void *)fset, (void *)f0); 7798 } 7799 } 7800 7801 mutex_exit(&cpu_lock); 7802 7803 for (i = 0; i < NUM_X86_FEATURES; i++) { 7804 cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n", 7805 x86_feature_names[i]); 7806 if (is_x86_feature(f0, i)) { 7807 add_x86_feature(x86_featureset, i); 7808 } 7809 } 7810 kmem_free(argdata, sizeof (x86_featureset) * NCPU); 7811 } 7812 7813 typedef void (*cpuid_pass_f)(cpu_t *, void *); 7814 7815 typedef struct cpuid_pass_def { 7816 cpuid_pass_t cpd_pass; 7817 cpuid_pass_f cpd_func; 7818 } cpuid_pass_def_t; 7819 7820 /* 7821 * See block comment at the top; note that cpuid_pass_ucode is not a pass in the 7822 * normal sense and should not appear here. 7823 */ 7824 static const cpuid_pass_def_t cpuid_pass_defs[] = { 7825 { CPUID_PASS_PRELUDE, cpuid_pass_prelude }, 7826 { CPUID_PASS_IDENT, cpuid_pass_ident }, 7827 { CPUID_PASS_BASIC, cpuid_pass_basic }, 7828 { CPUID_PASS_EXTENDED, cpuid_pass_extended }, 7829 { CPUID_PASS_DYNAMIC, cpuid_pass_dynamic }, 7830 { CPUID_PASS_RESOLVE, cpuid_pass_resolve }, 7831 }; 7832 7833 void 7834 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg) 7835 { 7836 VERIFY3S(pass, !=, CPUID_PASS_NONE); 7837 7838 if (cp == NULL) 7839 cp = CPU; 7840 7841 /* 7842 * Space statically allocated for BSP, ensure pointer is set 7843 */ 7844 if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL) 7845 cp->cpu_m.mcpu_cpi = &cpuid_info0; 7846 7847 ASSERT(cpuid_checkpass(cp, pass - 1)); 7848 7849 for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) { 7850 if (cpuid_pass_defs[i].cpd_pass == pass) { 7851 cpuid_pass_defs[i].cpd_func(cp, arg); 7852 cp->cpu_m.mcpu_cpi->cpi_pass = pass; 7853 return; 7854 } 7855 } 7856 7857 panic("unable to execute invalid cpuid pass %d on cpu%d\n", 7858 pass, cp->cpu_id); 7859 } 7860