/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

/*
 * Generic Intel Integrated Memory Controller (IMC) Driver
 *
 * This driver talks to the CPU's IMC to understand the detailed topology of
 * the processor and to determine how to map a physical address to the
 * corresponding DIMM. This driver supports the following generations of Intel
 * chips:
 *
 *  - Sandy Bridge
 *  - Ivy Bridge
 *  - Haswell
 *  - Broadwell
 *  - Skylake / Cascade Lake
 *
 * Memory Decoding
 * ---------------
 *
 * For more detailed summaries of the memory decoding process, please refer to
 * the Intel External Design Specifications for the corresponding processor.
 * What follows is a rough overview of how the memory decoding system works.
 *
 * First, we'd like to define the following concepts:
 *
 * SYSTEM ADDRESS
 *
 *    This is a physical address that the operating system normally uses. This
 *    address may refer to DRAM, it may refer to memory-mapped PCI
 *    configuration space or device registers, or it may refer to other parts
 *    of the system's memory map, such as the extended advanced programmable
 *    interrupt controller (xAPIC), etc.
 *
 * DIMM
 *
 *    Dual inline memory module. This refers to a physical stick of volatile
 *    memory that is inserted into a slot on the motherboard.
 *
 * RANK
 *
 *    A potential subdivision of a DIMM. A DIMM's memory capacity is divided
 *    into a number of equal-sized ranks. For example, an 8 GiB DIMM may have
 *    one 8 GiB rank, two 4 GiB ranks, or four 2 GiB ranks.
 *
 * RANK ADDRESS
 *
 *    An address that exists in the context of a given rank on a DIMM. All
 *    ranks have overlapping addresses, so the address 0x400 exists on all
 *    ranks on a given DIMM.
 *
 * CHANNEL
 *
 *    Multiple DIMMs may be combined into a single channel. The channel
 *    represents the combined memory of all the DIMMs. A given channel only
 *    ever exists on one socket and is bound to a single memory controller.
 *
 * CHANNEL ADDRESS
 *
 *    This is an address that exists logically on a channel. Each address on a
 *    channel maps to a corresponding DIMM that exists on that channel. The
 *    address space on one channel is independent from that on another. This
 *    means that address 0x1000 can exist on each memory channel in the
 *    system.
 *
 * INTERLEAVE
 *
 *    There are several different cases where interleaving occurs on the
 *    system. For example, addresses may be interleaved across sockets,
 *    memory channels, or DIMM ranks. When addresses are interleaved, some
 *    number of bits in an address are used to select which target to go to
 *    (usually through a lookup table). The effect of interleaving is that
 *    addresses that are next to one another may not all go to the same
 *    device. The following diagram shows a non-interleaved case.
 *
 *    0x0fff +-----+             +-----+ 0x7ff
 *           |     |\___________/|     |
 *           |     | __________  | (b) |
 *           |     |/          \ |     |
 *    0x0800 |=====|             +-----+ 0x000   +-----+ 0x7ff
 *           |     | \______________________________/|     |
 *           |     |  _______________________________| (a) |
 *           |     | /                               |     |
 *    0x0000 +-----+                                 +-----+ 0x000
 *
 *    In this example of non-interleaving, addresses 0x0000 through 0x07ff go
 *    to device (a), while addresses 0x0800 through 0x0fff go to device (b);
 *    each half of the range maps contiguously onto a single device.
 *
 *    If instead we interleave, then rather than splitting the range in half,
 *    we might say that if the address has bit 8 set (0x100), it goes to (b);
 *    otherwise it goes to (a). This means that addresses 0x000 to 0x0ff go to
 *    (a), 0x100 to 0x1ff go to (b), 0x200 to 0x2ff go back to (a), 0x300 to
 *    0x3ff go back to (b), and so on. That instead looks something like:
 *
 *    0x0fff +-----+    A: 0x7ff +---------+  B: 0x7ff +---------+
 *           | (b) |             | e00-eff |           | f00-fff |
 *    0x0f00 |-----|       0x700 +---------+     0x700 +---------+
 *           | (a) |             | c00-cff |           | d00-dff |
 *    0x0e00 ~~~~~~~       0x600 +---------+     0x600 +---------+
 *             ***               | a00-aff |           | b00-bff |
 *    0x0400 ~~~~~~~       0x500 +---------+     0x500 +---------+
 *           | (b) |             | 800-8ff |           | 900-9ff |
 *    0x0300 |-----|       0x400 +---------+     0x400 +---------+
 *           | (a) |             | 600-6ff |           | 700-7ff |
 *    0x0200 |-----|       0x300 +---------+     0x300 +---------+
 *           | (b) |             | 400-4ff |           | 500-5ff |
 *    0x0100 |-----|       0x200 +---------+     0x200 +---------+
 *           | (a) |             | 200-2ff |           | 300-3ff |
 *    0x0000 +-----+       0x100 +---------+     0x100 +---------+
 *                               | 000-0ff |           | 100-1ff |
 *                         0x000 +---------+     0x000 +---------+
 *
 *    In this example we've performed two-way interleaving. The number of ways
 *    that something can interleave varies based on what we're interleaving
 *    between.
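 *
 *    As a small illustrative sketch (not actual driver code), the two-way
 *    example above amounts to selecting a target with a single address bit
 *    and forming the target-relative address by deleting that bit:
 *
 *        target = (addr >> 8) & 1;                      0 -> (a), 1 -> (b)
 *        taddr = ((addr >> 9) << 8) | (addr & 0xff);
 *
 *    With this, system addresses 0x000-0x0ff and 0x200-0x2ff both land on
 *    (a), at target addresses 0x000-0x0ff and 0x100-0x1ff respectively,
 *    matching the diagram above.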
 *
 * MEMORY CONTROLLER
 *
 *    A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
 *    memory controllers, usually one or two. Each memory controller supports
 *    a given number of DIMMs, which are divided across multiple channels.
 *
 * TARGET ADDRESS DECODER
 *
 *    The target address decoder (TAD) is responsible for taking a system
 *    address and transforming it into a channel address based on the rules
 *    that are present. Each memory controller has a corresponding TAD. The
 *    TAD is often contained in a device called a 'Home Agent'.
 *
 * SYSTEM ADDRESS DECODER
 *
 *    The system address decoder (SAD) is responsible for taking a system
 *    address and directing it to the right place, whether this be memory or
 *    otherwise. Currently there is a single SAD per socket (see
 *    uts/i86pc/os/cpuid.c), shared between all of the cores on that socket.
 *
 * NODE IDENTIFIER
 *
 *    The node identifier is used to uniquely identify an element in the
 *    various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
 *    definition of 'die'). One can roughly think about this as a unique
 *    identifier for the socket itself. In general, the primary node ID for a
 *    socket should map to the socket APIC ID.
 *
 * Finding Devices
 * ---------------
 *
 * There is a bit of a chicken and egg problem on Intel systems and in the
 * device driver interface. The information that we need in the system is
 * spread out amongst a large number of different PCI devices that the
 * processor exposes. The number of such devices can vary based on the
 * processor generation and the specific SKU. To deal with this, we break the
 * driver into two different components: a stub driver and the full driver.
 *
 * The stub driver has aliases for all known PCI devices that we might attach
 * to in a given generation on the system. This driver is called 'imcstub'.
 * When a stub attaches, it just registers itself with the main driver, upon
 * which it has a module dependency.
 *
 * The main driver, 'imc', is a pseudo-device driver. When it first attaches,
 * it kicks off a scan of the device tree which takes place in a task queue.
 * Once there, it determines the number of devices that it expects to exist by
 * walking the tree and comparing it against the generation-specific table.
 *
 * If all devices are found, we'll go ahead and read through all the devices
 * and build a map of all the information we need to understand the topology
 * of the system and to be able to decode addresses. We do this here, because
 * we can be asked to perform decoding in dangerous contexts (after taking an
 * MCE, panic, etc.) where we don't want to have to rely on the broader kernel
 * functioning at that point in time.
 *
 * Once our topology is built, we'll create minor nodes which are used by the
 * fault management architecture to query for information and register our
 * decoding functionality with the kernel.
 *
 * PCI Numbering
 * -------------
 *
 * For each device that we care about, Intel defines the device and function
 * at which we can expect to find the information and PCI configuration space
 * registers that we care about. However, the PCI bus numbering is not well
 * defined. Devices that are on the same socket use the same set of bus
 * numbers; however, some sockets have multiple bus numbers that they'll use
 * to represent different classes. These bus numbers are programmed by system
 * firmware as part of powering on the system. This means that we need the
 * ability to map together these disparate ranges ourselves.
 *
 * There is a device called a utility box (UBOX), which exists per socket and
 * maps the different sockets together. We use this to determine which devices
 * correspond to which sockets.
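 *
 * As a rough sketch of that mapping (the bus numbers here are hypothetical),
 * a single 32-bit read of the UBOX's CPUBUSNO register yields the buses for
 * one socket, which we then use to claim stubs:
 *
 *     busno = pci_config_get32(ubox_cfgspace, igd_ubox_cpubusno_offset);
 *     bus0 = IMC_UBOX_CPUBUSNO_0(busno);          e.g., 0x3e
 *     bus1 = IMC_UBOX_CPUBUSNO_1(busno);          e.g., 0x3f
 *
 * Any stub that was discovered on bus 0x3e or 0x3f would then be assigned to
 * this socket. See imc_map_buses() below for the actual logic.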
 *
 * Mapping Sockets
 * ---------------
 *
 * Another wrinkle is that the way that the OS sees the numbering of the CPUs
 * is generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
 * information). However, to map to the corresponding socket, we need to look
 * at the socket's node ID. The order of PCI buses in the system is not
 * required to have any relation to the socket ID. Therefore, we have to have
 * yet another indirection table in the imc_t.
 *
 * Exposing Data
 * -------------
 *
 * We expose topology data to FMA using the OS-private memory controller
 * interfaces. By creating minor nodes of type 'ddi_mem_ctrl', there are a
 * number of specific interfaces that we can then implement. The ioctl API
 * asks us for a snapshot of data, which basically has us go through and send
 * an nvlist_t to userland. This nvlist_t is constructed as part of the scan
 * process and uses the version 1 format, which more explicitly encodes the
 * topology in a series of nested nvlists.
 *
 * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
 * decoder and ask it to perform decoding.
 *
 * Decoding Addresses
 * ------------------
 *
 * The decoding logic can be found in common/imc/imc_decode.c. This file is
 * shared between the kernel and userland to allow for easier testing and
 * additional flexibility in operation. The decoding process happens in a few
 * different phases.
 *
 * The first phase is to determine which memory controller on which socket is
 * responsible for this data. To determine this, we use the system address
 * decoder and walk the rules, looking for the correct target. Various
 * manipulations of the address determine which rule index we use. The way
 * that we interpret the output of the rule varies somewhat based on the
 * generation. Sandy Bridge just has a node ID which points us to the socket
 * with its single IMC. On Ivy Bridge through Broadwell, the memory controller
 * to use is also encoded in part of the node ID. Finally, on Skylake, the SAD
 * tells us which socket to look at; the socket in question then has a routing
 * table which tells us which channel on which memory controller local to that
 * socket to use.
 *
 * Once we have the target memory controller, we walk the list of target
 * address decoder rules. These rules can help tell us which channel we care
 * about (which is required on Sandy Bridge through Broadwell) and then
 * describe some amount of the interleaving rules which are used to turn the
 * system address into a channel address.
 *
 * Once we know the channel and the channel address, we walk the rank
 * interleave rules, which help us determine which DIMM, and the corresponding
 * rank on it, that the channel address targets. They also contain the logic
 * that we need to transform a channel address into an address on that
 * specific rank. Once we have that, the initial decoding is done.
 *
 * The logic in imc_decode.c is abstracted away from the broader kernel CMI
 * logic. This is on purpose: it not only gives us an easier time unit testing
 * the logic, but also allows us to express higher-fidelity errors, which are
 * then translated into a much smaller subset. This logic is exercised in the
 * 'imc_test' program which is built in 'test/os-tests/tests/imc'.
 *
 * Limitations
 * -----------
 *
 * Currently, this driver has the following limitations:
 *
 *  o It doesn't decode the row and column addresses.
 *  o It doesn't encode from a DIMM address to a system address.
 *  o It doesn't properly support lockstep and mirroring modes on Sandy Bridge
 *    through Broadwell platforms.
 *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
 *    platforms.
 *  o It doesn't properly handle Intel Optane (3D XPoint) NVDIMMs.
 *  o It doesn't know how to decode three-way channel interleaving.
 *
 * None of these are intrinsic problems with the driver; addressing them is
 * mostly a matter of proper documentation and testing.
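 *
 * Putting the decoding phases together, a single decode roughly follows this
 * shape (a sketch only; the authoritative logic lives in
 * common/imc/imc_decode.c):
 *
 *     system address
 *         -> SAD rule walk: socket (and, on Skylake, a route table lookup
 *            for the memory controller and channel)
 *         -> TAD rule walk: channel and channel address
 *         -> rank interleave rule walk: DIMM, rank, and rank address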
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/cred.h>
#include <sys/pci.h>
#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <sys/stat.h>
#include <sys/policy.h>

#include <sys/cpu_module.h>
#include <sys/mc.h>
#include <sys/mc_intel.h>

#include "imc.h"

/*
 * These tables contain generational data that varies between processor
 * generations, such as the maximum number of sockets, memory controllers, and
 * the offsets of the various registers.
 */

static const imc_gen_data_t imc_gen_data_snb = {
        .igd_max_sockets = 4,
        .igd_max_imcs = 2,
        .igd_max_channels = 4,
        .igd_max_dimms = 3,
        .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
        .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
            IMC_REG_MC_MTR2 },
        .igd_mcmtr_offset = 0x7c,
        .igd_tolm_offset = 0x80,
        .igd_tohm_low_offset = 0x84,
        .igd_sad_dram_offset = 0x80,
        .igd_sad_ndram_rules = 10,
        .igd_sad_nodeid_offset = 0x40,
        .igd_tad_nrules = 12,
        .igd_tad_rule_offset = 0x40,
        .igd_tad_chan_offset = 0x90,
        .igd_tad_sysdef = 0x80,
        .igd_tad_sysdef2 = 0x84,
        .igd_mc_mirror = 0xac,
        .igd_rir_nways = 5,
        .igd_rir_way_offset = 0x108,
        .igd_rir_nileaves = 8,
        .igd_rir_ileave_offset = 0x120,
        .igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_ivb = {
        .igd_max_sockets = 4,
        .igd_max_imcs = 2,
        .igd_max_channels = 4,
        .igd_max_dimms = 3,
        .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
        .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
            IMC_REG_MC_MTR2 },
        .igd_mcmtr_offset = 0x7c,
        .igd_tolm_offset = 0x80,
        .igd_tohm_low_offset = 0x84,
        .igd_sad_dram_offset = 0x60,
        .igd_sad_ndram_rules = 20,
        .igd_sad_nodeid_offset = 0x40,
        .igd_tad_nrules = 12,
        .igd_tad_rule_offset = 0x40,
        .igd_tad_chan_offset = 0x90,
        .igd_tad_sysdef = 0x80,
        .igd_tad_sysdef2 = 0x84,
        .igd_mc_mirror = 0xac,
        .igd_rir_nways = 5,
        .igd_rir_way_offset = 0x108,
        .igd_rir_nileaves = 8,
        .igd_rir_ileave_offset = 0x120,
        .igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_has_brd = {
        .igd_max_sockets = 4,
        .igd_max_imcs = 2,
        .igd_max_channels = 4,
        .igd_max_dimms = 3,
        .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
        .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
            IMC_REG_MC_MTR2 },
        .igd_mcmtr_offset = 0x7c,
        .igd_tolm_offset = 0xd0,
        .igd_tohm_low_offset = 0xd4,
        .igd_tohm_hi_offset = 0xd8,
        .igd_sad_dram_offset = 0x60,
        .igd_sad_ndram_rules = 20,
        .igd_sad_nodeid_offset = 0x40,
        .igd_tad_nrules = 12,
        .igd_tad_rule_offset = 0x40,
        .igd_tad_chan_offset = 0x90,
        .igd_tad_sysdef = 0x80,
        .igd_tad_sysdef2 = 0x84,
        .igd_mc_mirror = 0xac,
        .igd_rir_nways = 5,
        .igd_rir_way_offset = 0x108,
        .igd_rir_nileaves = 8,
        .igd_rir_ileave_offset = 0x120,
        .igd_ubox_cpubusno_offset = 0xd0,
};
static const imc_gen_data_t imc_gen_data_skx = {
        .igd_max_sockets = 8,
        .igd_max_imcs = 2,
        .igd_max_channels = 3,
        .igd_max_dimms = 2,
        .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
        .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
        .igd_mcmtr_offset = 0x87c,
        .igd_topo_offset = 0x88,
        .igd_tolm_offset = 0xd0,
        .igd_tohm_low_offset = 0xd4,
        .igd_tohm_hi_offset = 0xd8,
        .igd_sad_dram_offset = 0x60,
        .igd_sad_ndram_rules = 24,
        .igd_sad_nodeid_offset = 0xc0,
        .igd_tad_nrules = 8,
        .igd_tad_rule_offset = 0x850,
        .igd_tad_chan_offset = 0x90,
        .igd_rir_nways = 4,
        .igd_rir_way_offset = 0x108,
        .igd_rir_nileaves = 4,
        .igd_rir_ileave_offset = 0x120,
        .igd_ubox_cpubusno_offset = 0xcc,
};

/*
 * This table contains all of the devices that we're looking for from a stub
 * perspective. These are organized by generation. Different generations
 * behave in slightly different ways. For example, Sandy Bridge through
 * Broadwell use unique PCI IDs for each PCI device/function combination that
 * appears. Skylake-based systems instead reuse the same PCI ID, with
 * different device/function values indicating that the IDs are used for
 * different purposes.
 */
/* BEGIN CSTYLED */
static const imc_stub_table_t imc_stub_table[] = {
        /* Sandy Bridge */
        { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
        { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 1" },
        { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
        { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
        { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
        { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
        { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
        { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
        { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
        { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
        { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
        { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
        /* Ivy Bridge */
        { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
        { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
        { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
        { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
        { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" },
        { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" },
        { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
        { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
        { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
        { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
        { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
        { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
        { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
        /* Haswell */
        { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
        { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
        { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
        { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Virtualization" },
        { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
        { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
        { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
        { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
        /* Broadwell Devices */
        { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
        { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
        { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
        { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Virtualization" },
        { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
        { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
        { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
        { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
        /* Skylake and Cascade Lake Devices */
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 1 M2M" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 1 Main / Channel 0" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },

        /*
         * There is one SAD MC Route type device per core! Because of this, a
         * wide array of devices and functions are allocated. For now, we
         * simply list them all out.
         */
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },

        { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
        { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
};
/* END CSTYLED */

#define IMC_PCI_VENDOR_INTC     0x8086

/*
 * Our IMC data is global and statically set up during a combination of
 * _init(9E) and attach(9E). While we have a module dependency between the PCI
 * stub driver, imcstub, and this pseudo-driver, imc, the dependency doesn't
 * guarantee that the imc driver has finished attaching. As such, we make sure
 * that this driver can operate without having attached in any way.
 */
static imc_t *imc_data = NULL;

/*
 * By default we should not allow the stubs to detach, as we don't have a good
 * way of forcing them to attach again. This is provided in case someone does
 * want to allow the driver to unload.
 */
int imc_allow_detach = 0;

static void
imc_set_gen_data(imc_t *imc)
{
        switch (imc->imc_gen) {
        case IMC_GEN_SANDY:
                imc->imc_gen_data = &imc_gen_data_snb;
                break;
        case IMC_GEN_IVY:
                imc->imc_gen_data = &imc_gen_data_ivb;
                break;
        case IMC_GEN_HASWELL:
        case IMC_GEN_BROADWELL:
                imc->imc_gen_data = &imc_gen_data_has_brd;
                break;
        case IMC_GEN_SKYLAKE:
                imc->imc_gen_data = &imc_gen_data_skx;
                break;
        default:
                dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
                    "set to unknown generation: %u", imc->imc_gen);
        }
}

/*
 * If our device (dev_info_t) does not have a non-zero unit address, then
 * devfsadmd will not pay attention to us at all. Therefore we need to set the
 * unit address below, before we create minor nodes.
 *
 * The rest of the system expects us to have one minor node per socket. The
 * minor node ID should be the ID of the socket.
 */
static boolean_t
imc_create_minors(imc_t *imc)
{
        uint_t i;

        ddi_set_name_addr(imc->imc_dip, "1");
        for (i = 0; i < imc->imc_nsockets; i++) {
                char buf[MAXNAMELEN];

                if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
                    sizeof (buf)) {
                        goto fail;
                }

                if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
                    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
                        dev_err(imc->imc_dip, CE_WARN, "failed to create "
                            "minor node %u: %s", i, buf);
                        goto fail;
                }
        }
        return (B_TRUE);

fail:
        ddi_remove_minor_node(imc->imc_dip, NULL);
        return (B_FALSE);
}

/*
 * Check the current MC route value for this SAD. On Skylake systems there is
 * one per core; every core should agree. If they do not, we will not trust
 * the SAD MCROUTE values and this will cause system address decoding to fail
 * on Skylake.
 */
static void
imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
{
        uint32_t val;

        val = pci_config_get32(stub->istub_cfgspace,
            IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
        if (val == PCI_EINVAL32) {
                sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
                return;
        }

        if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
                sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
                sad->isad_mcroute.ismc_raw_mcroute = val;
                return;
        }

        /*
         * Occasionally we see MC route table entries with a value of zero.
         * We should ignore those for now.
         */
        if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
                dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
                    "with socket. SAD has val 0x%x, system has 0x%x\n",
                    val, sad->isad_mcroute.ismc_raw_mcroute);
                sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
        }
}

/*
 * On Skylake, many of the devices that we care about are on separate PCI
 * buses. These can be mapped together by the DECS register. However, we need
 * to know how to map different buses together so that we can more usefully
 * associate information. The set of buses is all present in the DECS
 * register. We'll effectively assign sockets to buses. This also still comes
 * up on pre-Skylake systems.
 */
static boolean_t
imc_map_buses(imc_t *imc)
{
        imc_stub_t *stub;
        uint_t nsock;

        /*
         * Find the UBOX_DECS registers so we can establish socket mappings.
         * On Skylake, there are three different sets of buses that we need to
         * cover all of our devices, while there are only two before that.
         */
        for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
            stub = AVL_NEXT(&imc->imc_stubs, stub)) {
                uint32_t busno;

                if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
                        continue;
                }

                busno = pci_config_get32(stub->istub_cfgspace,
                    imc->imc_gen_data->igd_ubox_cpubusno_offset);
                if (busno == PCI_EINVAL32) {
                        dev_err(imc->imc_dip, CE_WARN, "failed to read "
                            "UBOX_DECS CPUBUSNO0: invalid PCI read");
                        return (B_FALSE);
                }

                if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
                        imc->imc_sockets[nsock].isock_nbus = 3;
                        imc->imc_sockets[nsock].isock_bus[0] =
                            IMC_UBOX_CPUBUSNO_0(busno);
                        imc->imc_sockets[nsock].isock_bus[1] =
                            IMC_UBOX_CPUBUSNO_1(busno);
                        imc->imc_sockets[nsock].isock_bus[2] =
                            IMC_UBOX_CPUBUSNO_2(busno);
                } else {
                        imc->imc_sockets[nsock].isock_bus[0] =
                            IMC_UBOX_CPUBUSNO_0(busno);
                        imc->imc_sockets[nsock].isock_bus[1] =
                            IMC_UBOX_CPUBUSNO_1(busno);
                        imc->imc_sockets[nsock].isock_nbus = 2;
                }
                nsock++;
        }
        imc->imc_nsockets = nsock;

        return (B_TRUE);
}

/*
 * For a given stub that we've found, map it to its corresponding socket based
 * on the PCI bus that it has.
 */
static imc_socket_t *
imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
{
        uint_t i;

        for (i = 0; i < imc->imc_nsockets; i++) {
                uint_t bus;

                for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
                        if (imc->imc_sockets[i].isock_bus[bus] ==
                            stub->istub_bus) {
                                return (&imc->imc_sockets[i]);
                        }
                }
        }

        return (NULL);
}

static boolean_t
imc_map_stubs(imc_t *imc)
{
        imc_stub_t *stub;

        if (!imc_map_buses(imc)) {
                return (B_FALSE);
        }

        for (stub = avl_first(&imc->imc_stubs); stub != NULL;
            stub = AVL_NEXT(&imc->imc_stubs, stub)) {
                imc_socket_t *sock = imc_map_find_socket(imc, stub);

                if (sock == NULL) {
                        dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
                            "PCI%x,%x with bdf %u/%u/%u that does not match a "
                            "known PCI bus for any of %u sockets",
                            stub->istub_table->imcs_type, stub->istub_vid,
                            stub->istub_did, stub->istub_bus, stub->istub_dev,
                            stub->istub_func, imc->imc_nsockets);
                        continue;
                }

                /*
                 * We don't have to worry about duplicates here, as we check
                 * that we have unique bdfs.
                 */
                switch (stub->istub_table->imcs_type) {
                case IMC_TYPE_MC0_M2M:
                        sock->isock_imcs[0].icn_m2m = stub;
                        break;
                case IMC_TYPE_MC1_M2M:
                        sock->isock_imcs[1].icn_m2m = stub;
                        break;
                case IMC_TYPE_MC0_MAIN0:
                        sock->isock_nimc++;
                        sock->isock_imcs[0].icn_main0 = stub;

                        /*
                         * On Skylake, the MAIN0 does double duty as channel
                         * zero and as the TAD.
                         */
                        if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
                                sock->isock_imcs[0].icn_nchannels++;
                                sock->isock_imcs[0].icn_channels[0].ich_desc =
                                    stub;
                                sock->isock_tad[0].itad_stub = stub;
                                sock->isock_ntad++;
                        }
                        break;
                case IMC_TYPE_MC0_MAIN1:
                        sock->isock_imcs[0].icn_main1 = stub;
                        break;
                case IMC_TYPE_MC1_MAIN0:
                        sock->isock_nimc++;
                        sock->isock_imcs[1].icn_main0 = stub;

                        /*
                         * On Skylake, the MAIN0 does double duty as channel
                         * zero and as the TAD.
                         */
                        if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
                                sock->isock_imcs[1].icn_nchannels++;
                                sock->isock_imcs[1].icn_channels[0].ich_desc =
                                    stub;
                                sock->isock_tad[1].itad_stub = stub;
                                sock->isock_ntad++;
                        }
                        break;
                case IMC_TYPE_MC1_MAIN1:
                        sock->isock_imcs[1].icn_main1 = stub;
                        break;
                case IMC_TYPE_MC0_CHANNEL0:
                        sock->isock_imcs[0].icn_nchannels++;
                        sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
                        break;
                case IMC_TYPE_MC0_CHANNEL1:
                        sock->isock_imcs[0].icn_nchannels++;
                        sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
                        break;
                case IMC_TYPE_MC0_CHANNEL2:
                        sock->isock_imcs[0].icn_nchannels++;
                        sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
                        break;
                case IMC_TYPE_MC0_CHANNEL3:
                        sock->isock_imcs[0].icn_nchannels++;
                        sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
                        break;
                case IMC_TYPE_MC1_CHANNEL0:
                        sock->isock_imcs[1].icn_nchannels++;
                        sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
                        break;
                case IMC_TYPE_MC1_CHANNEL1:
                        sock->isock_imcs[1].icn_nchannels++;
                        sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
                        break;
                case IMC_TYPE_MC1_CHANNEL2:
                        sock->isock_imcs[1].icn_nchannels++;
                        sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
                        break;
                case IMC_TYPE_MC1_CHANNEL3:
                        sock->isock_imcs[1].icn_nchannels++;
                        sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
                        break;
                case IMC_TYPE_SAD_DRAM:
                        sock->isock_sad.isad_dram = stub;
                        break;
                case IMC_TYPE_SAD_MMIO:
                        sock->isock_sad.isad_mmio = stub;
                        break;
                case IMC_TYPE_SAD_MISC:
                        sock->isock_sad.isad_tolh = stub;
                        break;
                case IMC_TYPE_VTD_MISC:
                        /*
                         * Some systems have multiple VT-d Misc. entry points
                         * in the system. In this case, only use the first one
                         * we find.
                         */
                        if (imc->imc_gvtd_misc == NULL) {
                                imc->imc_gvtd_misc = stub;
                        }
                        break;
                case IMC_TYPE_SAD_MCROUTE:
                        ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
                        imc_mcroute_check(imc, &sock->isock_sad, stub);
                        break;
                case IMC_TYPE_UBOX:
                        sock->isock_ubox = stub;
                        break;
                case IMC_TYPE_HA0:
                        sock->isock_ntad++;
                        sock->isock_tad[0].itad_stub = stub;
                        break;
                case IMC_TYPE_HA1:
                        sock->isock_ntad++;
                        sock->isock_tad[1].itad_stub = stub;
                        break;
                case IMC_TYPE_UBOX_CPUBUSNO:
                        sock->isock_cpubusno = stub;
                        break;
                default:
                        /*
                         * Attempt to still attach if we can.
                         */
                        dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
                            "IMC type (%u) on PCI %x,%x",
                            stub->istub_table->imcs_type,
                            stub->istub_vid, stub->istub_did);
                        break;
                }
        }

        return (B_TRUE);
}

/*
 * Go through and fix up various aspects of the stub mappings on systems. The
 * following is a list of what we need to fix up:
 *
 * 1. On Haswell and newer systems, there is only one global VT-d device. We
 *    need to go back and map that to all of the per-socket imc_sad_t entries.
 */
static void
imc_fixup_stubs(imc_t *imc)
{
        if (imc->imc_gen >= IMC_GEN_HASWELL) {
                uint_t i;

                for (i = 0; i < imc->imc_nsockets; i++) {
                        ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
                            ==, NULL);
                        imc->imc_sockets[i].isock_sad.isad_tolh =
                            imc->imc_gvtd_misc;
                }
        }
}

/*
 * In the wild we've hit a few odd cases where firmware does not expose all of
 * the devices that we might expect.
 * In particular, we've seen and validated the following cases:
 *
 *  o We don't find all of the channel devices that we expect, e.g. we have
 *    the stubs for channels 1-3, but not 0. That has been seen on an Intel
 *    S2600CW with an E5-2630v3.
 */
static boolean_t
imc_validate_stubs(imc_t *imc)
{
        for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
                imc_socket_t *socket = &imc->imc_sockets[sock];

                for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
                        imc_mc_t *mcp = &socket->isock_imcs[mc];

                        for (uint_t chan = 0; chan < mcp->icn_nchannels;
                            chan++) {
                                if (mcp->icn_channels[chan].ich_desc == NULL) {
                                        dev_err(imc->imc_dip, CE_WARN,
                                            "!missing device for socket %u/"
                                            "imc %u/channel %u", sock, mc,
                                            chan);
                                        return (B_FALSE);
                                }
                        }
                }
        }

        return (B_TRUE);
}

/*
 * Attempt to map all of the discovered sockets to the corresponding APIC
 * based socket. We do these mappings by getting the node ID of the socket and
 * adjusting it to make sure that no home agent is present in it. We use the
 * UBOX to avoid any home agent related bits that are present in other
 * registers.
 */
static void
imc_map_sockets(imc_t *imc)
{
        uint_t i;

        for (i = 0; i < imc->imc_nsockets; i++) {
                uint32_t nodeid;
                ddi_acc_handle_t h;

                h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
                nodeid = pci_config_get32(h,
                    imc->imc_gen_data->igd_sad_nodeid_offset);
                if (nodeid == PCI_EINVAL32) {
                        imc->imc_sockets[i].isock_valid |=
                            IMC_SOCKET_V_BAD_NODEID;
                        continue;
                }

                imc->imc_sockets[i].isock_nodeid =
                    IMC_NODEID_UBOX_MASK(nodeid);
                imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
        }
}

/*
 * Decode the MTR, accounting for variances between processor generations.
 */
static void
imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
{
        uint8_t disable;

        /*
         * Check for presence first, before worrying about anything else.
         */
        if (imc->imc_gen < IMC_GEN_SKYLAKE &&
            IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
                dimm->idimm_present = B_FALSE;
                return;
        } else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
            IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
                dimm->idimm_present = B_FALSE;
                return;
        }

        dimm->idimm_present = B_TRUE;
        dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
        if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
            dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
                dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
        }

        dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
        if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
            dimm->idimm_nrows > IMC_MTR_RA_MAX) {
                dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
        }

        /*
         * Determine the density; this information is not present on Sandy
         * Bridge.
         */
        switch (imc->imc_gen) {
        case IMC_GEN_IVY:
                dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
                break;
        case IMC_GEN_HASWELL:
        case IMC_GEN_BROADWELL:
                switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
                case 0:
                default:
                        dimm->idimm_density = 0;
                        dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
                        break;
                case 1:
                        dimm->idimm_density = 2;
                        break;
                case 2:
                        dimm->idimm_density = 4;
                        break;
                case 3:
                        dimm->idimm_density = 8;
                        break;
                }
                break;
        case IMC_GEN_SKYLAKE:
                switch (IMC_MTR_DENSITY_SKX(mtr)) {
                case 0:
                default:
                        dimm->idimm_density = 0;
                        dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
                        break;
                case 1:
                        dimm->idimm_density = 2;
                        break;
                case 2:
                        dimm->idimm_density = 4;
                        break;
                case 3:
                        dimm->idimm_density = 8;
                        break;
                case 4:
                        dimm->idimm_density = 16;
                        break;
                case 5:
                        dimm->idimm_density = 12;
                        break;
                }
                break;
        case IMC_GEN_UNKNOWN:
        case IMC_GEN_SANDY:
                dimm->idimm_density = 0;
                break;
        }

        /*
         * The values of width are the same on Ivy Bridge through Skylake, but
         * the bits are different. This doesn't exist on Sandy Bridge.
         */
        if (imc->imc_gen > IMC_GEN_SANDY) {
                uint8_t width;

                if (imc->imc_gen >= IMC_GEN_BROADWELL) {
                        width = IMC_MTR_WIDTH_BRD_SKX(mtr);
                } else {
                        width = IMC_MTR_WIDTH_IVB_HAS(mtr);
                }
                switch (width) {
                case 0:
                        dimm->idimm_width = 4;
                        break;
                case 1:
                        dimm->idimm_width = 8;
                        break;
                case 2:
                        dimm->idimm_width = 16;
                        break;
                default:
                        dimm->idimm_width = 0;
                        dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
                        break;
                }
        } else {
                dimm->idimm_width = 0;
        }

        dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
        switch (imc->imc_gen) {
        case IMC_GEN_HASWELL:
        case IMC_GEN_BROADWELL:
        case IMC_GEN_SKYLAKE:
                if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
                        dimm->idimm_nranks = 0;
                        dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
                }
                break;
        default:
                if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
                        dimm->idimm_nranks = 0;
                        dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
                }
        }

        disable = IMC_MTR_RANK_DISABLE(mtr);
        dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
        dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
        dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
        dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;

        /*
         * Only Haswell and later have this information.
         */
        if (imc->imc_gen >= IMC_GEN_HASWELL) {
                dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
                dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
                dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
                if (dimm->idimm_3dsranks != 0) {
                        dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
                }
        }

        if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
                dimm->idimm_nbanks = 16;
        } else {
                dimm->idimm_nbanks = 8;
        }

        /*
         * To calculate the DIMM size, we first take the number of row and
         * column bits, which gives us the number of slots per bank. A given
         * rank has nbanks of these, and the DIMM has nranks of those. Each
         * such slot feeds an 8-byte (64-bit) wide data path.
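         *
         * As a worked example (with illustrative values, not ones read from
         * hardware): a DDR4 DIMM with 16 banks, 2 ranks, 10 column bits, and
         * 15 row bits comes out to 16 * 2 * 8 * 2^(10 + 15) bytes, which is
         * 8 GiB.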
         */
        dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
            (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
}

static void
imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
{
        uint_t i;

        /*
         * There's one register for each DIMM that might be present; we always
         * read it to determine information about the DIMMs.
         */
        chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
        for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
                uint32_t mtr;
                imc_dimm_t *dimm = &chan->ich_dimms[i];

                bzero(dimm, sizeof (imc_dimm_t));
                mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
                    imc->imc_gen_data->igd_mtr_offsets[i]);
                dimm->idimm_mtr = mtr;
                /*
                 * We don't really expect to get a bad PCIe read. However, if
                 * we do, treat that for the moment as though the DIMM is bad.
                 */
                if (mtr == PCI_EINVAL32) {
                        dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
                        continue;
                }

                imc_decode_mtr(imc, icn, dimm, mtr);
        }
}

static boolean_t
imc_fill_controller(imc_t *imc, imc_mc_t *icn)
{
        uint32_t mcmtr;

        mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
            imc->imc_gen_data->igd_mcmtr_offset);
        if (mcmtr == PCI_EINVAL32) {
                icn->icn_invalid = B_TRUE;
                return (B_FALSE);
        }

        icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
        if (imc->imc_gen < IMC_GEN_SKYLAKE) {
                icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
        } else {
                icn->icn_lockstep = B_FALSE;
        }

        icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;

        /*
         * Sandy Bridge and Ivy Bridge only support DDR3. Haswell and
         * Broadwell may support DDR4, depending on the SKU. Skylake only
         * supports DDR4.
         */
        switch (imc->imc_gen) {
        case IMC_GEN_SANDY:
        case IMC_GEN_IVY:
                icn->icn_dimm_type = IMC_DIMM_DDR3;
                break;
        case IMC_GEN_HASWELL:
        case IMC_GEN_BROADWELL:
                if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
                        icn->icn_dimm_type = IMC_DIMM_DDR4;
                } else {
                        icn->icn_dimm_type = IMC_DIMM_DDR3;
                }
                break;
        default:
                /*
                 * Skylake and on are all DDR4.
                 */
                icn->icn_dimm_type = IMC_DIMM_DDR4;
                break;
        }

        if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
                icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
                    imc->imc_gen_data->igd_topo_offset);
        }

        return (B_TRUE);
}

/*
 * Walk the IMC data and fill in the information on DIMMs and the memory
 * controller configurations.
 */
static void
imc_fill_data(imc_t *imc)
{
        uint_t csock, cmc, cchan;

        for (csock = 0; csock < imc->imc_nsockets; csock++) {
                imc_socket_t *sock = &imc->imc_sockets[csock];

                for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
                        imc_mc_t *icn = &sock->isock_imcs[cmc];

                        if (!imc_fill_controller(imc, icn))
                                continue;

                        for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
                                imc_fill_dimms(imc, icn,
                                    &icn->icn_channels[cchan]);
                        }
                }
        }
}
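/*
 * The functions below construct the version 1 nvlist snapshot that we hand to
 * userland. As a rough sketch (names abbreviated here; the authoritative key
 * names are the MCINTEL_NVLIST_V1_* definitions used below), the resulting
 * tree looks like:
 *
 *     socket nvlist
 *         version, number of memory controllers
 *         mcs[]                one nvlist per memory controller
 *             nchan, ECC, channel mode, page policy
 *             channels[]       one nvlist per channel
 *                 DIMMs per channel
 *                 dimms[]      one nvlist per DIMM slot
 *                     present, size, rows, columns, ranks, banks, ...
 */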
static nvlist_t *
imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
{
        nvlist_t *nvl;

        nvl = fnvlist_alloc();
        fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
            dimm->idimm_present);
        if (!dimm->idimm_present) {
                return (nvl);
        }

        fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
            dimm->idimm_ncolumns);
        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
            dimm->idimm_nrows);

        if (imc->imc_gen > IMC_GEN_SANDY) {
                fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
                    dimm->idimm_density * (1ULL << 30));
                fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
                    dimm->idimm_width);
        }
        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
            dimm->idimm_nranks);
        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
            dimm->idimm_nbanks);
        fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
            dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);

        if (imc->imc_gen >= IMC_GEN_HASWELL) {
                fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
                    dimm->idimm_hdrl);
                fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
                    dimm->idimm_hdrl_parity);
                if (dimm->idimm_3dsranks > 0) {
                        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
                            dimm->idimm_3dsranks);
                }
        }

        return (nvl);
}

static nvlist_t *
imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
{
        nvlist_t *nvl;
        nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
        uint_t i;

        nvl = fnvlist_alloc();
        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
            imc->imc_gen_data->igd_max_dimms);
        for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
                dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
        }

        fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
            dimms, i);

        for (; i > 0; i--) {
                nvlist_free(dimms[i-1]);
        }

        return (nvl);
}

static nvlist_t *
imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
{
        nvlist_t *nvl;
        nvlist_t *channels[IMC_MAX_CHANPERMC];
        uint_t i;

        nvl = fnvlist_alloc();
        fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
        fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
            icn->icn_ecc);
        if (icn->icn_lockstep) {
                fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
                    MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
        } else {
                fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
                    MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
        }

        if (icn->icn_closed) {
                fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
                    MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
        } else {
                fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
                    MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
        }

        for (i = 0; i < icn->icn_nchannels; i++) {
                channels[i] = imc_nvl_create_channel(imc,
                    &icn->icn_channels[i]);
        }
        fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
            channels, icn->icn_nchannels);
        for (i = 0; i < icn->icn_nchannels; i++) {
                nvlist_free(channels[i]);
        }

        return (nvl);
}

static void
imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
{
        char *buf = NULL;
        size_t len = 0;
        int kmflag;

        if (sock->isock_nvl == NULL)
                return;

        if (sock->isock_buf != NULL)
                return;

        if (sleep) {
                kmflag = KM_SLEEP;
        } else {
                kmflag = KM_NOSLEEP_LAZY;
        }

        if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
            kmflag) != 0) {
                return;
        }

        sock->isock_buf = buf;
        sock->isock_buflen = len;
        sock->isock_gen++;
}

static void
imc_decoder_pack(imc_t *imc)
{
        char *buf = NULL;
        size_t len = 0;

        if (imc->imc_decoder_buf != NULL)
                return;

        if (imc->imc_decoder_dump == NULL) {
                imc->imc_decoder_dump = imc_dump_decoder(imc);
        }

        if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
            KM_NOSLEEP_LAZY) != 0) {
                return;
        }

        imc->imc_decoder_buf = buf;
        imc->imc_decoder_len = len;
}

static void
imc_nvl_create(imc_t *imc)
{
        uint_t csock;

        for (csock = 0; csock < imc->imc_nsockets; csock++) {
                uint_t i;
                nvlist_t *nvl;
                nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
                imc_socket_t *sock = &imc->imc_sockets[csock];

                nvl = fnvlist_alloc();
                fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
                    MCINTEL_NVLIST_VERS1);
                fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
                    sock->isock_nimc);

                for (i = 0; i < sock->isock_nimc; i++) {
                        mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
                }

                fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
                    mcs, sock->isock_nimc);

                for (i = 0; i < sock->isock_nimc; i++) {
                        nvlist_free(mcs[i]);
                }

                sock->isock_nvl = nvl;
                imc_nvl_pack(sock, B_TRUE);
        }
}

/*
 * Determine the top of low and high memory. These values determine whether
 * transaction addresses target main memory or not. Unfortunately, the way
 * that these are stored and fetched changes with different generations.
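 *
 * As a hypothetical example, a system with 40 GiB of DRAM and a 2 GiB low
 * memory region would have a TOLM of 2 GiB and a TOHM of 42 GiB: addresses
 * below TOLM and addresses between 4 GiB and TOHM target DRAM, while the
 * window between TOLM and 4 GiB is used for PCI and other non-DRAM resources.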
 */
static void
imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
{
        uint32_t tolm, tohm_low, tohm_hi;

        tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
            imc->imc_gen_data->igd_tolm_offset);
        tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
            imc->imc_gen_data->igd_tohm_low_offset);
        if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
                tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
                    imc->imc_gen_data->igd_tohm_hi_offset);
        } else {
                tohm_hi = 0;
        }

        if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
            tohm_hi == PCI_EINVAL32) {
                sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
                return;
        }

        switch (imc->imc_gen) {
        case IMC_GEN_SANDY:
        case IMC_GEN_IVY:
                sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
                    IMC_TOLM_SNB_IVY_SHIFT;
                sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
                    IMC_TOLM_SNB_IVY_SHIFT;
                break;
        case IMC_GEN_HASWELL:
        case IMC_GEN_BROADWELL:
        case IMC_GEN_SKYLAKE:
                sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
                sad->isad_tohm = ((uint64_t)tohm_low &
                    IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);

                /*
                 * Adjust the values to turn them into an exclusive range.
                 */
                sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
                sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
                break;
        default:
                dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
                    "set to unknown generation: %u", imc->imc_gen);
                return;
        }
}

static void
imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
    uint32_t raw)
{
        uint_t attr;
        uint64_t limit;

        bzero(rule, sizeof (imc_sad_rule_t));

        rule->isr_raw_dram = raw;
        rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
        if (imc->imc_gen < IMC_GEN_SKYLAKE) {
                switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
                case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
                        rule->isr_imode = IMC_SAD_IMODE_8t6;
                        break;
                case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
                        rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
                        break;
                }
        } else {
                switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
                case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
                        rule->isr_imode = IMC_SAD_IMODE_8t6;
                        break;
                case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
                        rule->isr_imode = IMC_SAD_IMODE_10t8;
                        break;
                case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
                        rule->isr_imode = IMC_SAD_IMODE_14t12;
                        break;
                case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
                        rule->isr_imode = IMC_SAD_IMODE_32t30;
                        break;
                }
        }

        if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
                attr = IMC_SAD_DRAM_ATTR_SKX(raw);
        } else {
                attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
        }

        switch (attr) {
        case IMC_SAD_DRAM_ATTR_DRAM:
                rule->isr_type = IMC_SAD_TYPE_DRAM;
                break;
        case IMC_SAD_DRAM_ATTR_MMCFG:
                rule->isr_type = IMC_SAD_TYPE_MMCFG;
                break;
        case IMC_SAD_DRAM_ATTR_NXM:
                if (imc->imc_gen < IMC_GEN_SKYLAKE) {
                        sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
                }
                rule->isr_type = IMC_SAD_TYPE_NXM;
                break;
        default:
                sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
                break;
        }

        /*
         * Fetch the limit, which represents bits 45:26, and then adjust this
         * so that it is exclusive.
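         *
         * For example (illustrative only, and assuming the usual 64 MiB
         * granule implied by bit 26 being the low bit): a raw limit field of
         * 0xf becomes (0xf << 26) + (1 << 26), i.e. an exclusive limit of
         * 1 GiB.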
1594 */ 1595 if (imc->imc_gen >= IMC_GEN_SKYLAKE) { 1596 limit = IMC_SAD_DRAM_LIMIT_SKX(raw); 1597 } else { 1598 limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw); 1599 } 1600 rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) + 1601 IMC_SAD_DRAM_LIMIT_EXCLUSIVE; 1602 1603 /* 1604 * The rest of this does not apply to Sandy Bridge. 1605 */ 1606 if (imc->imc_gen == IMC_GEN_SANDY) 1607 return; 1608 1609 if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) { 1610 rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0; 1611 return; 1612 } 1613 1614 switch (IMC_SAD_DRAM_MOD23_SKX(raw)) { 1615 case IMC_SAD_DRAM_MOD23_MOD3: 1616 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3; 1617 break; 1618 case IMC_SAD_DRAM_MOD23_MOD2_C01: 1619 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01; 1620 break; 1621 case IMC_SAD_DRAM_MOD23_MOD2_C12: 1622 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12; 1623 break; 1624 case IMC_SAD_DRAM_MOD23_MOD2_C02: 1625 rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02; 1626 break; 1627 } 1628 1629 rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0; 1630 switch (IMC_SAD_DRAM_MOD3_MODE_SKX(raw)) { 1631 case IMC_SAD_DRAM_MOD3_MODE_45t6: 1632 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6; 1633 break; 1634 case IMC_SAD_DRAM_MOD3_MODE_45t8: 1635 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8; 1636 break; 1637 case IMC_SAD_DRAM_MOD3_MODE_45t12: 1638 rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12; 1639 break; 1640 default: 1641 sad->isad_valid |= IMC_SAD_V_BAD_MOD3; 1642 break; 1643 } 1644 } 1645 1646 static void 1647 imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw) 1648 { 1649 uint_t i; 1650 uint32_t mlen, mbase, skipbits, skipafter; 1651 1652 rule->isr_raw_interleave = raw; 1653 1654 /* 1655 * Right now all architectures always have the maximum number of SAD 1656 * interleave targets. 1657 */ 1658 rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE; 1659 1660 /* 1661 * Sandy Bridge has a gap in the interleave list because it uses a 1662 * smaller field length.
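 * Concretely, the loop below computes each target's bit position as
 *
 *	shift = i * mlen + (i >= skipafter ? skipbits : 0);
 *
 * so on Sandy Bridge (skipafter = 4, skipbits = 2) targets 0 through 3 are
 * packed back to back, while targets 4 and up all sit two bits higher than a
 * fully packed layout would place them, leaving a two-bit hole in the middle
 * of the register. Later generations use the wider field length and have no
 * hole.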
1663 */ 1664 if (imc->imc_gen > IMC_GEN_SANDY) { 1665 mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN; 1666 mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK; 1667 skipbits = skipafter = 0; 1668 } else { 1669 mlen = IMC_SAD_ILEAVE_SNB_LEN; 1670 mbase = IMC_SAD_ILEAVE_SNB_MASK; 1671 skipbits = 2; 1672 skipafter = 4; 1673 } 1674 1675 for (i = 0; i < rule->isr_ntargets; i++) { 1676 uint32_t mask, shift; 1677 1678 shift = i * mlen; 1679 if (i >= skipafter) 1680 shift += skipbits; 1681 mask = mbase << shift; 1682 rule->isr_targets[i] = (raw & mask) >> shift; 1683 } 1684 } 1685 1686 static void 1687 imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad) 1688 { 1689 uint_t i; 1690 off_t off; 1691 1692 sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules; 1693 for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset; 1694 i < sad->isad_nrules; i++, off += sizeof (uint64_t)) { 1695 uint32_t dram, interleave; 1696 imc_sad_rule_t *rule = &sad->isad_rules[i]; 1697 1698 dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off); 1699 interleave = pci_config_get32(sad->isad_dram->istub_cfgspace, 1700 off + 4); 1701 1702 if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) { 1703 sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; 1704 return; 1705 } 1706 1707 imc_sad_fill_rule(imc, sad, rule, dram); 1708 imc_sad_fill_rule_interleave(imc, rule, interleave); 1709 } 1710 } 1711 1712 static void 1713 imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad) 1714 { 1715 uint_t i; 1716 imc_sad_mcroute_table_t *mc = &sad->isad_mcroute; 1717 1718 if (imc->imc_gen < IMC_GEN_SKYLAKE) 1719 return; 1720 if (sad->isad_valid != 0) 1721 return; 1722 1723 mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES; 1724 for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) { 1725 uint_t chanoff, ringoff; 1726 1727 ringoff = i * IMC_MC_ROUTE_RING_BITS; 1728 chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET; 1729 1730 mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >> 1731 ringoff) & IMC_MC_ROUTE_RING_MASK; 1732 mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >> 1733 chanoff) & IMC_MC_ROUTE_CHAN_MASK; 1734 } 1735 } 1736 1737 /* 1738 * Initialize the SAD. To do this we have to do a few different things: 1739 * 1740 * 1. Determine where the top of low and high memory is. 1741 * 2. Read and decode all of the rules for the SAD. 1742 * 3. On systems with a route table, decode the raw routes. 1743 * 1744 * At this point in time, we treat TOLM and TOHM as a per-socket construct, even 1745 * though it really should be global; this just makes life a bit simpler.
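 * For orientation, a minimal sketch of how the state gathered here is later
 * consumed by imc_decode_pa():
 *
 *	system address -> SAD rule  (selects the socket / home agent)
 *	               -> TAD rule  (selects the channel)
 *	               -> RIR       (selects the rank and rank address)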
1746 */ 1747 static void 1748 imc_decoder_init_sad(imc_t *imc) 1749 { 1750 uint_t i; 1751 1752 for (i = 0; i < imc->imc_nsockets; i++) { 1753 imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad); 1754 imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad); 1755 imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad); 1756 } 1757 } 1758 1759 static void 1760 imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev, 1761 imc_tad_rule_t *rule, uint32_t val) 1762 { 1763 uint64_t limit; 1764 1765 limit = IMC_TAD_LIMIT(val); 1766 rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) + 1767 IMC_TAD_LIMIT_EXCLUSIVE; 1768 rule->itr_raw = val; 1769 1770 switch (IMC_TAD_SOCK_WAY(val)) { 1771 case IMC_TAD_SOCK_WAY_1: 1772 rule->itr_sock_way = 1; 1773 break; 1774 case IMC_TAD_SOCK_WAY_2: 1775 rule->itr_sock_way = 2; 1776 break; 1777 case IMC_TAD_SOCK_WAY_4: 1778 rule->itr_sock_way = 4; 1779 break; 1780 case IMC_TAD_SOCK_WAY_8: 1781 rule->itr_sock_way = 8; 1782 break; 1783 } 1784 1785 rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1; 1786 rule->itr_sock_gran = IMC_TAD_GRAN_64B; 1787 rule->itr_chan_gran = IMC_TAD_GRAN_64B; 1788 1789 /* 1790 * Starting with Skylake the targets that are used are no longer part of 1791 * the TAD. Those come from the IMC route table. 1792 */ 1793 if (imc->imc_gen >= IMC_GEN_SKYLAKE) { 1794 rule->itr_ntargets = 0; 1795 return; 1796 } 1797 1798 rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS; 1799 rule->itr_targets[0] = IMC_TAD_TARG0(val); 1800 rule->itr_targets[1] = IMC_TAD_TARG1(val); 1801 rule->itr_targets[2] = IMC_TAD_TARG2(val); 1802 rule->itr_targets[3] = IMC_TAD_TARG3(val); 1803 1804 if (prev == NULL) { 1805 rule->itr_base = 0; 1806 } else { 1807 rule->itr_base = prev->itr_limit + 1; 1808 } 1809 } 1810 1811 static void 1812 imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule, 1813 uint32_t val) 1814 { 1815 uint64_t base; 1816 1817 rule->itr_raw_gran = val; 1818 base = IMC_TAD_BASE_BASE(val); 1819 rule->itr_base = base << IMC_TAD_BASE_SHIFT; 1820 1821 switch (IMC_TAD_BASE_CHAN_GRAN(val)) { 1822 case IMC_TAD_BASE_CHAN_GRAN_64B: 1823 rule->itr_chan_gran = IMC_TAD_GRAN_64B; 1824 break; 1825 case IMC_TAD_BASE_CHAN_GRAN_256B: 1826 rule->itr_chan_gran = IMC_TAD_GRAN_256B; 1827 break; 1828 case IMC_TAD_BASE_CHAN_GRAN_4KB: 1829 rule->itr_chan_gran = IMC_TAD_GRAN_4KB; 1830 break; 1831 default: 1832 tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN; 1833 return; 1834 } 1835 1836 switch (IMC_TAD_BASE_SOCK_GRAN(val)) { 1837 case IMC_TAD_BASE_SOCK_GRAN_64B: 1838 rule->itr_sock_gran = IMC_TAD_GRAN_64B; 1839 break; 1840 case IMC_TAD_BASE_SOCK_GRAN_256B: 1841 rule->itr_sock_gran = IMC_TAD_GRAN_256B; 1842 break; 1843 case IMC_TAD_BASE_SOCK_GRAN_4KB: 1844 rule->itr_sock_gran = IMC_TAD_GRAN_4KB; 1845 break; 1846 case IMC_TAD_BASE_SOCK_GRAN_1GB: 1847 rule->itr_sock_gran = IMC_TAD_GRAN_1GB; 1848 break; 1849 } 1850 } 1851 1852 /* 1853 * When mirroring is enabled, at least from Sandy Bridge through Broadwell, it's 1854 * suggested that the channel wayness will take this into account and therefore 1855 * should be accurately reflected.
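 * For example, a rule that mirrors data across a pair of channels would
 * presumably still be programmed as 2-way, with the IMC_TAD_FLAG_MIRROR flag
 * gathered in imc_tad_read_features() below left for the address decoder to
 * interpret.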
1856 */ 1857 static void 1858 imc_tad_read_rules(imc_t *imc, imc_tad_t *tad) 1859 { 1860 uint_t i; 1861 off_t baseoff; 1862 imc_tad_rule_t *prev; 1863 1864 tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules; 1865 for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset, 1866 prev = NULL; i < tad->itad_nrules; 1867 i++, baseoff += sizeof (uint32_t)) { 1868 uint32_t val; 1869 off_t off; 1870 imc_tad_rule_t *rule = &tad->itad_rules[i]; 1871 1872 /* 1873 * On Skylake, the TAD rules are split among two registers. The 1874 * latter set mimics what exists on pre-Skylake. 1875 */ 1876 if (imc->imc_gen >= IMC_GEN_SKYLAKE) { 1877 off = baseoff + IMC_SKX_WAYNESS_OFFSET; 1878 } else { 1879 off = baseoff; 1880 } 1881 1882 val = pci_config_get32(tad->itad_stub->istub_cfgspace, off); 1883 if (val == PCI_EINVAL32) { 1884 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; 1885 return; 1886 } 1887 1888 imc_tad_fill_rule(imc, tad, prev, rule, val); 1889 prev = rule; 1890 if (imc->imc_gen < IMC_GEN_SKYLAKE) 1891 continue; 1892 1893 val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff); 1894 if (val == PCI_EINVAL32) { 1895 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; 1896 return; 1897 } 1898 1899 imc_tad_fill_skx(imc, tad, rule, val); 1900 } 1901 } 1902 1903 /* 1904 * Check for features which change how decoding works. 1905 */ 1906 static void 1907 imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc) 1908 { 1909 uint32_t val; 1910 1911 /* 1912 * Determine whether or not lockstep mode or mirroring are enabled. 1913 * These change the behavior of how we're supposed to interpret channel 1914 * wayness. Lockstep is available in the TAD's features. Mirroring is 1915 * available on the IMC's features. This isn't present in Skylake+. On 1916 * Skylake, mirroring is a property of the SAD rule and there is no 1917 * lockstep. 1918 */ 1919 switch (imc->imc_gen) { 1920 case IMC_GEN_SANDY: 1921 case IMC_GEN_IVY: 1922 case IMC_GEN_HASWELL: 1923 case IMC_GEN_BROADWELL: 1924 val = pci_config_get32(tad->itad_stub->istub_cfgspace, 1925 imc->imc_gen_data->igd_tad_sysdef); 1926 if (val == PCI_EINVAL32) { 1927 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; 1928 return; 1929 } 1930 if (IMC_TAD_SYSDEF_LOCKSTEP(val)) { 1931 tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP; 1932 } 1933 1934 val = pci_config_get32(mc->icn_main1->istub_cfgspace, 1935 imc->imc_gen_data->igd_mc_mirror); 1936 if (val == PCI_EINVAL32) { 1937 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; 1938 return; 1939 } 1940 if (IMC_MC_MIRROR_SNB_BRD(val)) { 1941 tad->itad_flags |= IMC_TAD_FLAG_MIRROR; 1942 } 1943 break; 1944 default: 1945 break; 1946 } 1947 1948 /* 1949 * Now, go through and look at values that'll change how we do the 1950 * channel index and address calculation. These are only present 1951 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge 1952 * and they don't exist on Skylake+.
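 * The flag names hint at their effect: IMC_TAD_FLAG_CHANSHIFT presumably
 * shifts the channel address computation up by a bit, while
 * IMC_TAD_FLAG_CHANHASH enables hashing of the bits that select a channel;
 * both are left to the address decoder to interpret.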
1953 */ 1954 switch (imc->imc_gen) { 1955 case IMC_GEN_IVY: 1956 case IMC_GEN_HASWELL: 1957 case IMC_GEN_BROADWELL: 1958 val = pci_config_get32(tad->itad_stub->istub_cfgspace, 1959 imc->imc_gen_data->igd_tad_sysdef2); 1960 if (val == PCI_EINVAL32) { 1961 tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; 1962 return; 1963 } 1964 if (IMC_TAD_SYSDEF2_SHIFTUP(val)) { 1965 tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT; 1966 } 1967 if (IMC_TAD_SYSDEF2_CHANHASH(val)) { 1968 tad->itad_flags |= IMC_TAD_FLAG_CHANHASH; 1969 } 1970 break; 1971 default: 1972 break; 1973 } 1974 } 1975 1976 /* 1977 * Read the IMC channel interleave records. 1978 */ 1979 static void 1980 imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan) 1981 { 1982 uint_t i; 1983 off_t off; 1984 1985 chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules; 1986 for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset; 1987 i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) { 1988 uint32_t val; 1989 uint64_t offset; 1990 1991 val = pci_config_get32(chan->ich_desc->istub_cfgspace, 1992 off); 1993 if (val == PCI_EINVAL32) { 1994 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; 1995 return; 1996 } 1997 1998 if (imc->imc_gen >= IMC_GEN_SKYLAKE) { 1999 offset = IMC_TADCHAN_OFFSET_SKX(val); 2000 } else { 2001 offset = IMC_TADCHAN_OFFSET_SNB_BRD(val); 2002 } 2003 2004 chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT; 2005 chan->ich_tad_offsets_raw[i] = val; 2006 } 2007 } 2008 2009 static void 2010 imc_decoder_init_tad(imc_t *imc) 2011 { 2012 uint_t i; 2013 2014 for (i = 0; i < imc->imc_nsockets; i++) { 2015 uint_t j; 2016 2017 for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) { 2018 imc_tad_read_features(imc, 2019 &imc->imc_sockets[i].isock_tad[j], 2020 &imc->imc_sockets[i].isock_imcs[j]); 2021 imc_tad_read_rules(imc, 2022 &imc->imc_sockets[i].isock_tad[j]); 2023 } 2024 } 2025 2026 for (i = 0; i < imc->imc_nsockets; i++) { 2027 uint_t j; 2028 imc_socket_t *sock = &imc->imc_sockets[i]; 2029 2030 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { 2031 uint_t k; 2032 imc_mc_t *mc = &sock->isock_imcs[j]; 2033 2034 for (k = 0; k < mc->icn_nchannels; k++) { 2035 imc_channel_t *chan = &mc->icn_channels[k]; 2036 imc_tad_read_interleave(imc, chan); 2037 } 2038 } 2039 } 2040 } 2041 2042 static void 2043 imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan, 2044 imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig) 2045 { 2046 uint_t i; 2047 off_t off, incr; 2048 2049 /* 2050 * Rank interleave offset registers come in two forms. Either they are 2051 * contiguous for a given wayness, meaning that all of the entries for 2052 * wayness zero are contiguous, or they are sparse, meaning that there 2053 * is a bank for entry zero for all wayness, then entry one for all 2054 * wayness, etc.
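 * As a concrete sketch (using a hypothetical layout with three interleave
 * entries per rule), the two register orderings look like:
 *
 *	contiguous:	rule0.ent0 rule0.ent1 rule0.ent2 rule1.ent0 ...
 *	sparse:		rule0.ent0 rule1.ent0 rule2.ent0 ... rule0.ent1 ...
 *
 * which is why the code below selects both a different starting offset and a
 * different increment depending on the form in use.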
2055 */ 2056 if (contig) { 2057 off = imc->imc_gen_data->igd_rir_ileave_offset + 2058 (rirno * imc->imc_gen_data->igd_rir_nileaves * 2059 sizeof (uint32_t)); 2060 incr = sizeof (uint32_t); 2061 } else { 2062 off = imc->imc_gen_data->igd_rir_ileave_offset + 2063 (rirno * sizeof (uint32_t)); 2064 incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t); 2065 } 2066 for (i = 0; i < rank->irle_nentries; i++, off += incr) { 2067 uint32_t val; 2068 uint64_t offset; 2069 imc_rank_ileave_entry_t *ent = &rank->irle_entries[i]; 2070 2071 val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); 2072 if (val == PCI_EINVAL32) { 2073 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; 2074 return; 2075 } 2076 2077 switch (imc->imc_gen) { 2078 case IMC_GEN_BROADWELL: 2079 ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val); 2080 break; 2081 default: 2082 ent->irle_target = IMC_RIR_OFFSET_TARGET(val); 2083 break; 2084 } 2085 if (imc->imc_gen >= IMC_GEN_HASWELL) { 2086 offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val); 2087 } else { 2088 offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val); 2089 } 2090 ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT; 2091 } 2092 } 2093 2094 static void 2095 imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan) 2096 { 2097 uint_t i; 2098 off_t off; 2099 2100 chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways; 2101 for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset; 2102 i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) { 2103 uint32_t val; 2104 uint64_t lim; 2105 imc_rank_ileave_t *ent = &chan->ich_rankileaves[i]; 2106 2107 val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); 2108 if (val == PCI_EINVAL32) { 2109 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; 2110 return; 2111 } 2112 2113 ent->irle_raw = val; 2114 ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0; 2115 ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val); 2116 ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val); 2117 if (imc->imc_gen >= IMC_GEN_HASWELL) { 2118 lim = IMC_RIR_LIMIT_HAS_SKX(val); 2119 } else { 2120 lim = IMC_RIR_LIMIT_SNB_IVB(val); 2121 } 2122 2123 ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) + 2124 IMC_RIR_LIMIT_EXCLUSIVE; 2125 2126 ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves; 2127 if (imc->imc_gen >= IMC_GEN_SKYLAKE) { 2128 imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE); 2129 } else { 2130 imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE); 2131 } 2132 } 2133 } 2134 2135 static void 2136 imc_decoder_init_rir(imc_t *imc) 2137 { 2138 uint_t i; 2139 2140 for (i = 0; i < imc->imc_nsockets; i++) { 2141 uint_t j; 2142 imc_socket_t *sock = &imc->imc_sockets[i]; 2143 2144 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { 2145 uint_t k; 2146 imc_mc_t *mc = &sock->isock_imcs[j]; 2147 2148 for (k = 0; k < mc->icn_nchannels; k++) { 2149 imc_channel_t *chan = &mc->icn_channels[k]; 2150 imc_rir_read_wayness(imc, chan); 2151 } 2152 } 2153 } 2154 } 2155 2156 static cmi_errno_t 2157 imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo, 2158 uint32_t synd, int syndtype, mc_unum_t *unump) 2159 { 2160 imc_t *imc = arg; 2161 uint_t i; 2162 imc_decode_state_t dec; 2163 2164 bzero(&dec, sizeof (dec)); 2165 if (!imc_decode_pa(imc, pa, &dec)) { 2166 switch (dec.ids_fail) { 2167 case IMC_DECODE_F_LEGACY_RANGE: 2168 case IMC_DECODE_F_OUTSIDE_DRAM: 2169 return (CMIERR_MC_NOTDIMMADDR); 2170 default: 2171 return (CMIERR_MC_BADSTATE); 2172 } 2173 } 2174 2175 unump->unum_board = 0; 2176 /* 2177 * The chip id needs to be in the order that the OS expects it, 
which 2178 * may not be our order. 2179 */ 2180 for (i = 0; i < imc->imc_nsockets; i++) { 2181 if (imc->imc_spointers[i] == dec.ids_socket) 2182 break; 2183 } 2184 if (i == imc->imc_nsockets) { 2185 return (CMIERR_MC_BADSTATE); 2186 } 2187 unump->unum_chip = i; 2188 unump->unum_mc = dec.ids_tadid; 2189 unump->unum_chan = dec.ids_channelid; 2190 unump->unum_cs = dec.ids_dimmid; 2191 unump->unum_rank = dec.ids_rankid; 2192 unump->unum_offset = dec.ids_rankaddr; 2193 for (i = 0; i < MC_UNUM_NDIMM; i++) { 2194 unump->unum_dimms[i] = MC_INVALNUM; 2195 } 2196 2197 return (CMI_SUCCESS); 2198 } 2199 2200 static cmi_errno_t 2201 imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa) 2202 { 2203 return (CMIERR_UNKNOWN); 2204 } 2205 2206 static const cmi_mc_ops_t imc_mc_ops = { 2207 .cmi_mc_patounum = imc_mc_patounum, 2208 .cmi_mc_unumtopa = imc_mc_unumtopa 2209 }; 2210 2211 /* 2212 * This is where we really finish attaching and become open for business. This 2213 * occurs once we have all of the expected stubs attached. Here's where all of 2214 * the real fun begins. 2215 */ 2216 static void 2217 imc_attach_complete(void *arg) 2218 { 2219 imc_t *imc = arg; 2220 cmi_errno_t err; 2221 2222 imc_set_gen_data(imc); 2223 2224 /* 2225 * On SKX and newer, we can fail to map PCI buses at this point due to 2226 * bad PCIe reads. 2227 */ 2228 if (!imc_map_stubs(imc)) { 2229 goto done; 2230 } 2231 2232 if (!imc_validate_stubs(imc)) { 2233 imc->imc_flags |= IMC_F_VALIDATE_FAILED; 2234 goto done; 2235 } 2236 2237 imc_fixup_stubs(imc); 2238 imc_map_sockets(imc); 2239 2240 if (!imc_create_minors(imc)) { 2241 goto done; 2242 } 2243 2244 imc_fill_data(imc); 2245 imc_nvl_create(imc); 2246 2247 /* 2248 * Gather additional information that we need so that we can properly 2249 * initialize the memory decoder and encoder. 2250 */ 2251 imc_decoder_init_sad(imc); 2252 imc_decoder_init_tad(imc); 2253 imc_decoder_init_rir(imc); 2254 2255 /* 2256 * Register decoder functions. This may fail. If so, try and complain 2257 * loudly, but stay active to allow other data to be useful. Register a 2258 * global handle. 2259 */ 2260 if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) { 2261 imc->imc_flags |= IMC_F_MCREG_FAILED; 2262 dev_err(imc->imc_dip, CE_WARN, "failed to register memory " 2263 "decoding operations: 0x%x", err); 2264 } 2265 2266 done: 2267 mutex_enter(&imc->imc_lock); 2268 imc->imc_flags &= ~IMC_F_ATTACH_DISPATCHED; 2269 imc->imc_flags |= IMC_F_ATTACH_COMPLETE; 2270 mutex_exit(&imc->imc_lock); 2271 } 2272 2273 static int 2274 imc_stub_comparator(const void *l, const void *r) 2275 { 2276 const imc_stub_t *sl = l, *sr = r; 2277 if (sl->istub_bus > sr->istub_bus) 2278 return (1); 2279 if (sl->istub_bus < sr->istub_bus) 2280 return (-1); 2281 if (sl->istub_dev > sr->istub_dev) 2282 return (1); 2283 if (sl->istub_dev < sr->istub_dev) 2284 return (-1); 2285 if (sl->istub_func > sr->istub_func) 2286 return (1); 2287 if (sl->istub_func < sr->istub_func) 2288 return (-1); 2289 return (0); 2290 } 2291 2292 static int 2293 imc_stub_scan_cb(dev_info_t *dip, void *arg) 2294 { 2295 int vid, did; 2296 const imc_stub_table_t *table; 2297 imc_t *imc = arg; 2298 int *regs; 2299 uint_t i, nregs; 2300 2301 if (dip == ddi_root_node()) { 2302 return (DDI_WALK_CONTINUE); 2303 } 2304 2305 /* 2306 * Get the dev info name. PCI devices will always be children of PCI 2307 * devices today on x86. If we reach something that has a device name 2308 * that's not PCI, then we can prune its children.
2309 */ 2310 if (strncmp("pci", ddi_get_name(dip), 3) != 0) { 2311 return (DDI_WALK_PRUNECHILD); 2312 } 2313 2314 /* 2315 * Get the device and vendor ID and see if this is something the imc 2316 * knows about or cares about. 2317 */ 2318 vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2319 "vendor-id", PCI_EINVAL16); 2320 did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2321 "device-id", PCI_EINVAL16); 2322 if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { 2323 return (DDI_WALK_CONTINUE); 2324 } 2325 2326 if (vid != IMC_PCI_VENDOR_INTC) { 2327 return (DDI_WALK_PRUNECHILD); 2328 } 2329 2330 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2331 "reg", &regs, &nregs) != DDI_PROP_SUCCESS) { 2332 return (DDI_WALK_CONTINUE); 2333 } 2334 2335 if (nregs == 0) { 2336 ddi_prop_free(regs); 2337 return (DDI_WALK_CONTINUE); 2338 } 2339 2340 2341 table = NULL; 2342 for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { 2343 if (imc_stub_table[i].imcs_devid == did && 2344 imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && 2345 imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { 2346 table = &imc_stub_table[i]; 2347 break; 2348 } 2349 } 2350 ddi_prop_free(regs); 2351 2352 /* 2353 * Not a match, not interesting. 2354 */ 2355 if (table == NULL) { 2356 return (DDI_WALK_CONTINUE); 2357 } 2358 2359 mutex_enter(&imc->imc_lock); 2360 imc->imc_nscanned++; 2361 mutex_exit(&imc->imc_lock); 2362 2363 return (DDI_WALK_CONTINUE); 2364 } 2365 2366 /* 2367 * From here, walk the device tree and count how many of the devices we know about are present. 2368 */ 2369 static void 2370 imc_stub_scan(void *arg) 2371 { 2372 imc_t *imc = arg; 2373 boolean_t dispatch = B_FALSE; 2374 2375 /* 2376 * Zero out the scan results in case we've been detached and reattached. 2377 */ 2378 mutex_enter(&imc->imc_lock); 2379 imc->imc_nscanned = 0; 2380 mutex_exit(&imc->imc_lock); 2381 2382 ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc); 2383 2384 mutex_enter(&imc->imc_lock); 2385 imc->imc_flags |= IMC_F_SCAN_COMPLETE; 2386 imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED; 2387 2388 /* 2389 * If the scan found no nodes, then that means that we're on a hardware 2390 * platform that we don't support. Therefore, there's no reason to do 2391 * anything here. 2392 */ 2393 if (imc->imc_nscanned == 0) { 2394 imc->imc_flags |= IMC_F_UNSUP_PLATFORM; 2395 mutex_exit(&imc->imc_lock); 2396 return; 2397 } 2398 2399 if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { 2400 imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; 2401 dispatch = B_TRUE; 2402 } 2403 2404 mutex_exit(&imc->imc_lock); 2405 2406 if (dispatch) { 2407 (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, 2408 imc, DDI_SLEEP); 2409 } 2410 } 2411 2412 /* 2413 * By default, refuse to allow stubs to detach. 2414 */ 2415 int 2416 imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd) 2417 { 2418 imc_stub_t *stub; 2419 imc_t *imc = imc_data; 2420 2421 mutex_enter(&imc->imc_lock); 2422 2423 /* 2424 * By default, we do not allow stubs to detach. However, if the driver 2425 * has attached to devices on a platform it doesn't recognize or 2426 * support or if the override flag has been set, then allow detach to 2427 * proceed.
2428 */ 2429 if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 && 2430 imc_allow_detach == 0) { 2431 mutex_exit(&imc->imc_lock); 2432 return (DDI_FAILURE); 2433 } 2434 2435 for (stub = avl_first(&imc->imc_stubs); stub != NULL; 2436 stub = AVL_NEXT(&imc->imc_stubs, stub)) { 2437 if (stub->istub_dip == dip) { 2438 break; 2439 } 2440 } 2441 2442 /* 2443 * A device was attached to us that we somehow don't know about. Allow 2444 * this to proceed. 2445 */ 2446 if (stub == NULL) { 2447 mutex_exit(&imc->imc_lock); 2448 return (DDI_SUCCESS); 2449 } 2450 2451 pci_config_teardown(&stub->istub_cfgspace); 2452 avl_remove(&imc->imc_stubs, stub); 2453 kmem_free(stub, sizeof (imc_stub_t)); 2454 mutex_exit(&imc->imc_lock); 2455 2456 return (DDI_SUCCESS); 2457 } 2458 2459 int 2460 imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd) 2461 { 2462 imc_stub_t *stub, *lookup; 2463 int did, vid, *regs; 2464 uint_t i, nregs; 2465 const imc_stub_table_t *table; 2466 avl_index_t idx; 2467 boolean_t dispatch = B_FALSE; 2468 imc_t *imc = imc_data; 2469 2470 if (cmd != DDI_ATTACH) { 2471 return (DDI_FAILURE); 2472 } 2473 2474 /* 2475 * We've been asked to attach a stub. First, determine if this is even a 2476 * PCI device that we should care about. Then, append it to our global 2477 * list and kick off the configuration task. Note that we do this 2478 * configuration task in a taskq so that we don't interfere with the 2479 * normal attach / detach path processing. 2480 */ 2481 if (strncmp("pci", ddi_get_name(dip), 3) != 0) { 2482 return (DDI_FAILURE); 2483 } 2484 2485 /* 2486 * Get the device and vendor ID and see if this is something the imc 2487 * knows about or cares about. 2488 */ 2489 vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2490 "vendor-id", PCI_EINVAL16); 2491 did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2492 "device-id", PCI_EINVAL16); 2493 if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { 2494 return (DDI_FAILURE); 2495 } 2496 2497 /* 2498 * Only accept INTC parts on the imc driver. 2499 */ 2500 if (vid != IMC_PCI_VENDOR_INTC) { 2501 return (DDI_FAILURE); 2502 } 2503 2504 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2505 "reg", &regs, &nregs) != DDI_PROP_SUCCESS) { 2506 return (DDI_FAILURE); 2507 } 2508 2509 if (nregs == 0) { 2510 ddi_prop_free(regs); 2511 return (DDI_FAILURE); 2512 } 2513 2514 /* 2515 * Determine if this matches a known device. 2516 */ 2517 table = NULL; 2518 for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { 2519 if (imc_stub_table[i].imcs_devid == did && 2520 imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && 2521 imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { 2522 table = &imc_stub_table[i]; 2523 break; 2524 } 2525 } 2526 2527 if (i == ARRAY_SIZE(imc_stub_table)) { 2528 ddi_prop_free(regs); 2529 return (DDI_FAILURE); 2530 } 2531 2532 /* 2533 * We've found something. Make sure the generation matches our current 2534 * one. If it does, construct the entry and append it to the list.
2535 */ 2536 mutex_enter(&imc->imc_lock); 2537 if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen != 2538 table->imcs_gen) { 2539 mutex_exit(&imc->imc_lock); 2540 ddi_prop_free(regs); 2541 dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) " 2542 "that has different hardware generation (%u) from current " 2543 "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen); 2544 return (DDI_FAILURE); 2545 } else { 2546 imc->imc_gen = table->imcs_gen; 2547 } 2548 mutex_exit(&imc->imc_lock); 2549 2550 stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP); 2551 stub->istub_dip = dip; 2552 stub->istub_vid = vid; 2553 stub->istub_did = did; 2554 stub->istub_bus = PCI_REG_BUS_G(regs[0]); 2555 stub->istub_dev = PCI_REG_DEV_G(regs[0]); 2556 stub->istub_func = PCI_REG_FUNC_G(regs[0]); 2557 ddi_prop_free(regs); 2558 stub->istub_table = table; 2559 2560 if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) { 2561 kmem_free(stub, sizeof (imc_stub_t)); 2562 dev_err(dip, CE_WARN, "Failed to set up PCI config space " 2563 "for IMC stub device %s (%u/%u)", ddi_node_name(dip), 2564 vid, did); 2565 return (DDI_FAILURE); 2566 } 2567 2568 mutex_enter(&imc->imc_lock); 2569 if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) { 2570 dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate " 2571 "bdf %u/%u/%u with %s (%u/%u), not attaching", 2572 ddi_node_name(imc->imc_dip), vid, did, 2573 stub->istub_bus, stub->istub_dev, stub->istub_func, 2574 ddi_node_name(lookup->istub_dip), lookup->istub_vid, 2575 lookup->istub_did); 2576 mutex_exit(&imc->imc_lock); 2577 pci_config_teardown(&stub->istub_cfgspace); 2578 kmem_free(stub, sizeof (imc_stub_t)); 2579 2580 return (DDI_FAILURE); 2581 } 2582 avl_insert(&imc->imc_stubs, stub, idx); 2583 2584 if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE && 2585 avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { 2586 imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; 2587 dispatch = B_TRUE; 2588 } 2589 mutex_exit(&imc->imc_lock); 2590 2591 if (dispatch) { 2592 (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, 2593 imc, DDI_SLEEP); 2594 } 2595 2596 return (DDI_SUCCESS); 2597 } 2598 2599 static int 2600 imc_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2601 { 2602 imc_t *imc = imc_data; 2603 2604 if ((flag & (FEXCL | FNDELAY)) != 0) 2605 return (EINVAL); 2606 2607 if (otyp != OTYP_CHR) 2608 return (EINVAL); 2609 2610 mutex_enter(&imc->imc_lock); 2611 2612 if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) { 2613 mutex_exit(&imc->imc_lock); 2614 return (ENOTSUP); 2615 } 2616 2617 /* 2618 * It's possible that someone has come in during the window between when 2619 * we've created the minor node and when we've finished doing work. 2620 */ 2621 if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) { 2622 mutex_exit(&imc->imc_lock); 2623 return (EAGAIN); 2624 } 2625 2626 /* 2627 * It's not clear how someone would get a minor that we didn't create. 2628 * But be paranoid and make sure. 2629 */ 2630 if (getminor(*devp) >= imc->imc_nsockets) { 2631 mutex_exit(&imc->imc_lock); 2632 return (EINVAL); 2633 } 2634 2635 /* 2636 * Make sure this socket entry has been filled in.
2637 */ 2638 if (imc->imc_spointers[getminor(*devp)] == NULL) { 2639 mutex_exit(&imc->imc_lock); 2640 return (EINVAL); 2641 } 2642 2643 mutex_exit(&imc->imc_lock); 2644 2645 return (0); 2646 } 2647 2648 static void 2649 imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode) 2650 { 2651 imc_decode_state_t dec; 2652 uint_t i; 2653 2654 bzero(&dec, sizeof (dec)); 2655 if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) { 2656 encode->mcei_err = (uint32_t)dec.ids_fail; 2657 encode->mcei_errdata = dec.ids_fail_data; 2658 return; 2659 } 2660 2661 encode->mcei_errdata = 0; 2662 encode->mcei_err = 0; 2663 encode->mcei_board = 0; 2664 for (i = 0; i < imc->imc_nsockets; i++) { 2665 if (imc->imc_spointers[i] == dec.ids_socket) 2666 break; 2667 } 2668 encode->mcei_chip = i; 2669 /* 2670 * These Intel platforms are all monolithic dies, so set the die to 2671 * zero. 2672 */ 2673 encode->mcei_die = 0; 2674 encode->mcei_mc = dec.ids_tadid; 2675 encode->mcei_chan_addr = dec.ids_chanaddr; 2676 encode->mcei_chan = dec.ids_channelid; 2677 encode->mcei_dimm = dec.ids_dimmid; 2678 encode->mcei_rank_addr = dec.ids_rankaddr; 2679 encode->mcei_rank = dec.ids_rankid; 2680 encode->mcei_row = UINT32_MAX; 2681 encode->mcei_column = UINT32_MAX; 2682 encode->mcei_cs = encode->mcei_rm = UINT8_MAX; 2683 encode->mcei_bank = encode->mcei_bank_group = UINT8_MAX; 2684 encode->mcei_subchan = UINT8_MAX; 2685 } 2686 2687 static int 2688 imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 2689 int *rvalp) 2690 { 2691 int ret; 2692 minor_t m; 2693 mc_snapshot_info_t info; 2694 mc_encode_ioc_t encode; 2695 imc_t *imc = imc_data; 2696 imc_socket_t *sock; 2697 2698 mutex_enter(&imc->imc_lock); 2699 m = getminor(dev); 2700 if (m >= imc->imc_nsockets) { 2701 ret = EINVAL; 2702 goto done; 2703 } 2704 sock = imc->imc_spointers[m]; 2705 if (sock == NULL) { 2706 ret = EINVAL; 2707 goto done; 2708 } 2709 2710 /* 2711 * Note, other memory controller drivers don't check mode for reading 2712 * data nor do they care who can read it from a credential perspective. 2713 * As such we don't either at this time. 
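 * For reference, the expected consumer flow for the snapshot ioctls handled
 * below is to size the buffer first and then fetch it; a hedged sketch of a
 * hypothetical userland caller (error handling elided):
 *
 *	mc_snapshot_info_t info;
 *	void *buf;
 *
 *	(void) ioctl(fd, MC_IOC_SNAPSHOT_INFO, &info);
 *	buf = malloc(info.mcs_size);
 *	(void) ioctl(fd, MC_IOC_SNAPSHOT, buf);
 *
 * The mcs_gen value presumably exists so that a caller can detect that the
 * snapshot was regenerated between the two calls and retry.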
2714 */ 2715 switch (cmd) { 2716 case MC_IOC_SNAPSHOT_INFO: 2717 imc_nvl_pack(sock, B_FALSE); 2718 if (sock->isock_buf == NULL) { 2719 ret = EIO; 2720 break; 2721 } 2722 2723 info.mcs_size = sock->isock_buflen; 2724 info.mcs_gen = sock->isock_gen; 2725 2726 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { 2727 ret = EFAULT; 2728 break; 2729 } 2730 2731 ret = 0; 2732 break; 2733 case MC_IOC_SNAPSHOT: 2734 imc_nvl_pack(sock, B_FALSE); 2735 if (sock->isock_buf == NULL) { 2736 ret = EIO; 2737 break; 2738 } 2739 2740 if (ddi_copyout(sock->isock_buf, (void *)arg, 2741 sock->isock_buflen, mode) != 0) { 2742 ret = EFAULT; 2743 break; 2744 } 2745 2746 ret = 0; 2747 break; 2748 case MC_IOC_DECODE_SNAPSHOT_INFO: 2749 imc_decoder_pack(imc); 2750 if (imc->imc_decoder_buf == NULL) { 2751 ret = EIO; 2752 break; 2753 } 2754 2755 info.mcs_size = imc->imc_decoder_len; 2756 info.mcs_gen = imc->imc_spointers[0]->isock_gen; 2757 2758 if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { 2759 ret = EFAULT; 2760 break; 2761 } 2762 2763 ret = 0; 2764 break; 2765 case MC_IOC_DECODE_SNAPSHOT: 2766 imc_decoder_pack(imc); 2767 if (imc->imc_decoder_buf == NULL) { 2768 ret = EIO; 2769 break; 2770 } 2771 2772 if (ddi_copyout(imc->imc_decoder_buf, (void *)arg, 2773 imc->imc_decoder_len, mode) != 0) { 2774 ret = EFAULT; 2775 break; 2776 } 2777 2778 ret = 0; 2779 break; 2780 case MC_IOC_DECODE_PA: 2781 if (crgetzoneid(credp) != GLOBAL_ZONEID || 2782 drv_priv(credp) != 0) { 2783 ret = EPERM; 2784 break; 2785 } 2786 2787 if (ddi_copyin((void *)arg, &encode, sizeof (encode), 2788 mode & FKIOCTL) != 0) { 2789 ret = EPERM; 2790 break; 2791 } 2792 2793 imc_ioctl_decode(imc, &encode); 2794 ret = 0; 2795 2796 if (ddi_copyout(&encode, (void *)arg, sizeof (encode), 2797 mode & FKIOCTL) != 0) { 2798 ret = EPERM; 2799 break; 2800 } 2801 break; 2802 default: 2803 ret = EINVAL; 2804 goto done; 2805 } 2806 2807 done: 2808 mutex_exit(&imc->imc_lock); 2809 return (ret); 2810 } 2811 2812 static int 2813 imc_close(dev_t dev, int flag, int otyp, cred_t *credp) 2814 { 2815 return (0); 2816 } 2817 2818 static int 2819 imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2820 { 2821 if (cmd != DDI_ATTACH) { 2822 return (DDI_FAILURE); 2823 } 2824 2825 if (imc_data == NULL || imc_data->imc_dip != NULL) { 2826 return (DDI_FAILURE); 2827 } 2828 2829 mutex_enter(&imc_data->imc_lock); 2830 if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1, 2831 TASKQ_DEFAULTPRI, 0)) == NULL) { 2832 mutex_exit(&imc_data->imc_lock); 2833 return (DDI_FAILURE); 2834 } 2835 2836 imc_data->imc_dip = dip; 2837 imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED; 2838 mutex_exit(&imc_data->imc_lock); 2839 2840 (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data, 2841 DDI_SLEEP); 2842 2843 return (DDI_SUCCESS); 2844 } 2845 2846 /* 2847 * We only export a single instance. 2848 */ 2849 static int 2850 imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) 2851 { 2852 /* 2853 * getinfo(9E) shouldn't be called if we're not attached. But be 2854 * paranoid. 
2855 */ 2856 if (imc_data == NULL || imc_data->imc_dip == NULL) { 2857 return (DDI_FAILURE); 2858 } 2859 2860 switch (infocmd) { 2861 case DDI_INFO_DEVT2DEVINFO: 2862 *resultp = imc_data->imc_dip; 2863 break; 2864 case DDI_INFO_DEVT2INSTANCE: 2865 *resultp = (void *)0; 2866 break; 2867 default: 2868 return (DDI_FAILURE); 2869 } 2870 2871 return (DDI_SUCCESS); 2872 } 2873 2874 static int 2875 imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2876 { 2877 if (cmd != DDI_DETACH) { 2878 return (DDI_FAILURE); 2879 } 2880 2881 if (imc_data == NULL || imc_data->imc_dip == NULL) { 2882 return (DDI_FAILURE); 2883 } 2884 2885 mutex_enter(&imc_data->imc_lock); 2886 2887 /* 2888 * While a scan or attach is outstanding, don't allow us to detach. 2889 */ 2890 if ((imc_data->imc_flags & 2891 (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) { 2892 mutex_exit(&imc_data->imc_lock); 2893 return (DDI_FAILURE); 2894 } 2895 2896 /* 2897 * Because the stub driver depends on the imc driver, we shouldn't be 2898 * able to have any entries in this list when we detach. However, we 2899 * check just to make sure. 2900 */ 2901 if (!avl_is_empty(&imc_data->imc_stubs)) { 2902 mutex_exit(&imc_data->imc_lock); 2903 return (DDI_FAILURE); 2904 } 2905 2906 nvlist_free(imc_data->imc_decoder_dump); 2907 imc_data->imc_decoder_dump = NULL; 2908 if (imc_data->imc_decoder_buf != NULL) { 2909 kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len); 2910 imc_data->imc_decoder_buf = NULL; 2911 imc_data->imc_decoder_len = 0; 2912 } 2913 2914 ddi_remove_minor_node(imc_data->imc_dip, NULL); 2915 imc_data->imc_dip = NULL; 2916 mutex_exit(&imc_data->imc_lock); 2917 2918 ddi_taskq_wait(imc_data->imc_taskq); 2919 ddi_taskq_destroy(imc_data->imc_taskq); 2920 imc_data->imc_taskq = NULL; 2921 2922 return (DDI_SUCCESS); 2923 } 2924 2925 static void 2926 imc_free(void) 2927 { 2928 if (imc_data == NULL) { 2929 return; 2930 } 2931 2932 VERIFY(avl_is_empty(&imc_data->imc_stubs)); 2933 avl_destroy(&imc_data->imc_stubs); 2934 mutex_destroy(&imc_data->imc_lock); 2935 kmem_free(imc_data, sizeof (imc_t)); 2936 imc_data = NULL; 2937 } 2938 2939 static void 2940 imc_alloc(void) 2941 { 2942 imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP); 2943 2944 mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL); 2945 avl_create(&imc_data->imc_stubs, imc_stub_comparator, 2946 sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link)); 2947 } 2948 2949 static struct cb_ops imc_cb_ops = { 2950 .cb_open = imc_open, 2951 .cb_close = imc_close, 2952 .cb_strategy = nodev, 2953 .cb_print = nodev, 2954 .cb_dump = nodev, 2955 .cb_read = nodev, 2956 .cb_write = nodev, 2957 .cb_ioctl = imc_ioctl, 2958 .cb_devmap = nodev, 2959 .cb_mmap = nodev, 2960 .cb_segmap = nodev, 2961 .cb_chpoll = nochpoll, 2962 .cb_prop_op = ddi_prop_op, 2963 .cb_flag = D_MP, 2964 .cb_rev = CB_REV, 2965 .cb_aread = nodev, 2966 .cb_awrite = nodev 2967 }; 2968 2969 static struct dev_ops imc_dev_ops = { 2970 .devo_rev = DEVO_REV, 2971 .devo_refcnt = 0, 2972 .devo_getinfo = imc_getinfo, 2973 .devo_identify = nulldev, 2974 .devo_probe = nulldev, 2975 .devo_attach = imc_attach, 2976 .devo_detach = imc_detach, 2977 .devo_reset = nodev, 2978 .devo_cb_ops = &imc_cb_ops, 2979 .devo_quiesce = ddi_quiesce_not_needed 2980 }; 2981 2982 static struct modldrv imc_modldrv = { 2983 .drv_modops = &mod_driverops, 2984 .drv_linkinfo = "Intel Integrated Memory Controller Driver", 2985 .drv_dev_ops = &imc_dev_ops 2986 }; 2987 2988 static struct modlinkage imc_modlinkage = { 2989 .ml_rev = MODREV_1, 2990 .ml_linkage = {
&imc_modldrv, NULL } 2991 }; 2992 2993 int 2994 _init(void) 2995 { 2996 int ret; 2997 2998 if ((ret = mod_install(&imc_modlinkage)) == 0) { 2999 imc_alloc(); 3000 } 3001 3002 return (ret); 3003 } 3004 3005 int 3006 _info(struct modinfo *modinfop) 3007 { 3008 return (mod_info(&imc_modlinkage, modinfop)); 3009 } 3010 3011 int 3012 _fini(void) 3013 { 3014 int ret; 3015 3016 if ((ret = mod_remove(&imc_modlinkage)) == 0) { 3017 imc_free(); 3018 } 3019 return (ret); 3020 } 3021