/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Generic Intel Integrated Memory Controller (IMC) Driver
 *
 * This driver talks to the CPU's IMC to understand the detailed topology of
 * the processor and to determine how to map between physical addresses and
 * the corresponding DIMMs. This driver supports the following generations of
 * Intel chips:
 *
 *  - Sandy Bridge
 *  - Ivy Bridge
 *  - Haswell
 *  - Broadwell
 *  - Skylake / Cascade Lake
 *
 * Memory Decoding
 * ---------------
 *
 * For more detailed summaries of the memory decoding process, please refer to
 * the Intel External Design Specifications for the corresponding processor.
 * What follows is a rough overview of how the memory decoding system works.
 *
 * First, we'd like to define the following concepts:
 *
 * SYSTEM ADDRESS
 *
 *	This is a physical address that the operating system normally uses.
 *	This address may refer to DRAM, it may refer to memory mapped PCI
 *	configuration space or device registers, or it may refer to other
 *	parts of the system's memory map, such as the extended advanced
 *	programmable interrupt controller (xAPIC), etc.
 *
 * DIMM
 *
 *	Dual-inline memory module. This refers to a physical stick of volatile
 *	memory that is inserted into a slot on the motherboard.
 *
 * RANK
 *
 *	A potential sub-division of a DIMM. A DIMM's memory capacity is
 *	divided into a number of equal sized ranks. For example, an 8 GiB DIMM
 *	may have one 8 GiB rank, two 4 GiB ranks, or four 2 GiB ranks.
 *
 * RANK ADDRESS
 *
 *	An address that exists in the context of a given rank on a DIMM. All
 *	ranks have overlapping addresses, so the address 0x400 exists on all
 *	ranks on a given DIMM.
 *
 * CHANNEL
 *
 *	Multiple DIMMs may be combined into a single channel. The channel
 *	represents the combined memory of all the DIMMs. A given channel only
 *	ever exists on a socket and is bound to a single memory controller.
 *
 * CHANNEL ADDRESS
 *
 *	This is an address that exists logically on a channel. Each address on
 *	a channel maps to a corresponding DIMM that exists on that channel.
 *	The address space on one channel is independent from that on another.
 *	This means that address 0x1000 can exist on each memory channel in the
 *	system.
 *
 * INTERLEAVE
 *
 *	There are several different cases where interleaving occurs on the
 *	system. For example, addresses may be interleaved across sockets,
 *	memory channels, or DIMM ranks. When addresses are interleaved, then
 *	some number of bits in an address are used to select which target to
 *	go to (usually through a lookup table). The effect of interleaving is
 *	that addresses that are next to one another may not all go to the same
 *	device. The following image shows a non-interleaving case.
 *
 *	0x0fff +-----+             +-----+ 0x7ff
 *	       |     |\___________/|     |
 *	       |     |  __________ | (b) |
 *	       |     | /          \|     |
 *	0x0800 |=====|=            +-----+ 0x000    +-----+ 0x7ff
 *	       |     | \______________________________/|     |
 *	       |     |  _______________________________ | (a) |
 *	       |     |/                                \|     |
 *	0x0000 +-----+                                  +-----+ 0x000
 *
 *	In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
 *	device (a), while addresses 0x0800 to 0x0fff go to device (b).
 *	However, each range is divided into the same number of components.
 *
 *	If instead we were to look at that with interleaving, then rather than
 *	splitting the range in half, we might say that if the address has bit
 *	8 set (0x100), then it goes to (b), otherwise it goes to (a). This
 *	means that addresses 0x000 to 0x0ff would go to (a), 0x100 to 0x1ff
 *	would go to (b), 0x200 to 0x2ff would go back to (a) again, and then
 *	0x300 to 0x3ff would go back to (b). This would continue for a while.
 *	This would instead look something more like:
 *
 *	0x0fff +-----+   A: 0x7ff +---------+  B: 0x7ff +---------+
 *	       | (b) |            | e00-eff |           | f00-fff |
 *	0x0f00 |-----|      0x700 +---------+     0x700 +---------+
 *	       | (a) |            | c00-cff |           | d00-dff |
 *	0x0e00 ~~~~~~~      0x600 +---------+     0x600 +---------+
 *	         ***              | a00-aff |           | b00-bff |
 *	0x0400 ~~~~~~~      0x500 +---------+     0x500 +---------+
 *	       | (b) |            | 800-8ff |           | 900-9ff |
 *	0x0300 |-----|      0x400 +---------+     0x400 +---------+
 *	       | (a) |            | 600-6ff |           | 700-7ff |
 *	0x0200 |-----|      0x300 +---------+     0x300 +---------+
 *	       | (b) |            | 400-4ff |           | 500-5ff |
 *	0x0100 |-----|      0x200 +---------+     0x200 +---------+
 *	       | (a) |            | 200-2ff |           | 300-3ff |
 *	0x0000 +-----+      0x100 +---------+     0x100 +---------+
 *	                          | 000-0ff |           | 100-1ff |
 *	                    0x000 +---------+     0x000 +---------+
 *
 *	In this example we've performed two-way interleaving. The number of
 *	ways that something can interleave varies based on what we're
 *	interleaving between.
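 *
 *	To make the two-way case concrete, target selection reduces to
 *	testing a single address bit. The following sketch is purely
 *	illustrative and is not code that this driver uses:
 *
 *		target = (addr >> 8) & 0x1;	0 selects (a), 1 selects (b)
 *		offset = ((addr >> 9) << 8) | (addr & 0xff);
 *
 *	For instance, system address 0x523 has bit 8 set, so it would go to
 *	device (b) at device offset 0x223, which matches the diagram above.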
 *
 * MEMORY CONTROLLER
 *
 *	A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
 *	memory controllers, usually one or two. Each memory controller
 *	supports a given number of DIMMs, which are divided across multiple
 *	channels.
 *
 * TARGET ADDRESS DECODER
 *
 *	The target address decoder (TAD) is responsible for taking a system
 *	address and transforming it into a channel address based on the rules
 *	that are present. Each memory controller has a corresponding TAD. The
 *	TAD is often contained in a device called a 'Home Agent'.
 *
 * SYSTEM ADDRESS DECODER
 *
 *	The system address decoder (SAD) is responsible for taking a system
 *	address and directing it to the right place, whether this be memory or
 *	otherwise. There is a single SAD per socket (see uts/i86pc/os/cpuid.c)
 *	that is currently shared between all the cores.
 *
 * NODE IDENTIFIER
 *
 *	The node identifier is used to uniquely identify an element in the
 *	various routing topologies on the die (see uts/i86pc/os/cpuid.c for
 *	the definition of 'die'). One can roughly think about this as a unique
 *	identifier for the socket itself. In general, the primary node ID for
 *	a socket should map to the socket APIC ID.
 *
 * Finding Devices
 * ---------------
 *
 * There is a bit of a chicken and egg problem on Intel systems and in the
 * device driver interface. The information that we need in the system is
 * spread out amongst a large number of different PCI devices that the
 * processor exposes. The number of such devices can vary based on the
 * processor generation and the specific SKU of the processor. To deal with
 * this, we break the driver into two different components: a stub driver and
 * the full driver.
 *
 * The stub driver has aliases for all known PCI devices that we might attach
 * to in a given generation on the system. This driver is called 'imcstub'.
 * When a stub attaches, it just registers itself with the main driver, upon
 * which it has a module dependency.
 *
 * The main driver, 'imc', is a pseudo-device driver. When it first attaches,
 * it kicks off a scan of the device tree which takes place in a task queue.
 * Once there, it determines the number of devices that it expects to exist
 * by walking the tree and comparing it against the generation-specific table.
 *
 * If all devices are found, we'll go ahead and read through all the devices
 * and build a map of all the information we need to understand the topology
 * of the system and to be able to decode addresses. We do this here, because
 * we can be asked to perform decoding in dangerous contexts (after taking an
 * MCE, panic, etc.) where we don't want to have to rely on the broader
 * kernel functioning at this point in time.
 *
 * Once our topology is built, we'll create minor nodes which are used by the
 * fault management architecture to query for information and register our
 * decoding functionality with the kernel.
 *
 * PCI Numbering
 * -------------
 *
 * For each device that we care about, Intel defines the device and function
 * at which we can expect to find the information and PCI configuration space
 * registers that we care about. However, the PCI bus numbering is not well
 * defined. Devices that are on the same socket use the same set of bus
 * numbers; however, some sockets have multiple bus numbers that they'll use
 * to represent different classes. These bus numbers are programmed by system
 * firmware as part of powering on the system. This means that we need the
 * ability to map together these disparate ranges ourselves.
 *
 * There is a device called a utility box (UBOX), which exists per-socket and
 * maps the different sockets together. We use this to determine which
 * devices correspond to which sockets.
 *
 * Mapping Sockets
 * ---------------
 *
 * Another wrinkle is that the way that the OS sees the numbering of the CPUs
 * is generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
 * information). However, to map to the corresponding socket, we need to look
 * at the socket's node ID. The order of PCI buses in the system is not
 * required to have any relation to the socket ID. Therefore, we have to have
 * yet another indirection table in the imc_t.
 *
 * Exposing Data
 * -------------
 *
 * We expose topology data to FMA using the OS-private memory controller
 * interfaces. By creating minor nodes of type 'ddi_mem_ctrl', there are a
 * number of specific interfaces that we can then implement. The ioctl API
 * asks us for a snapshot of data, which basically has us go through and send
 * an nvlist_t to userland. This nvlist_t is constructed as part of the scan
 * process. This nvlist uses the version 1 format, which more explicitly
 * encodes the topology in a series of nested nvlists.
 *
 * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
 * decoder and ask it to perform decoding.
 *
 * Decoding Addresses
 * ------------------
 *
 * The decoding logic can be found in common/imc/imc_decode.c. This file is
 * shared between the kernel and userland to allow for easier testing and
 * additional flexibility in operation. The decoding process happens in a few
 * different phases.
 *
 * The first phase is to determine which memory controller on which socket is
 * responsible for this data. To determine this, we use the system address
 * decoder and walk the rules, looking for the correct target. There are
 * various manipulations to the address that exist which are used to
 * determine which index we use. The way that we interpret the output of the
 * rule varies somewhat based on the generation. Sandy Bridge just has a node
 * ID which points us to the socket with its single IMC. On Ivy Bridge
 * through Broadwell, the memory controller to use is also encoded in part of
 * the node ID. Finally, on Skylake, the SAD tells us which socket to look
 * at. The socket in question then has a routing table which tells us which
 * channel on which memory controller, local to that socket, to use.
 *
 * Once we have the target memory controller, we walk the list of target
 * address decoder rules. These rules can help tell us which channel we care
 * about (which is required on Sandy Bridge through Broadwell) and then
 * describe some amount of the interleaving rules which are used to turn the
 * system address into a channel address.
 *
 * Once we know the channel and the channel address, we walk the rank
 * interleave rules which help us determine which DIMM, and the corresponding
 * rank on it, that the channel address maps to. They also include logic that
 * we need to use to determine how to transform a channel address into an
 * address on that specific rank. Once we have that, then the initial
 * decoding is done.
 *
 * The logic in imc_decode.c is abstracted away from the broader kernel CMI
 * logic. This is on purpose and allows us not only an easier time unit
 * testing the logic, but also allows us to express higher fidelity errors
 * that are then translated into a much smaller subset. This logic is
 * exercised in the 'imc_test' program which is built in
 * 'test/os-tests/tests/imc'.
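 *
 * As a rough sketch of how those phases compose (illustrative only; the
 * sad_decode(), tad_decode(), and rir_decode() names below are hypothetical
 * and are not the functions in imc_decode.c):
 *
 *	socket, mc = sad_decode(system_addr)
 *	channel, channel_addr = tad_decode(mc, system_addr)
 *	dimm, rank, rank_addr = rir_decode(channel, channel_addr)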
 *
 * Limitations
 * -----------
 *
 * Currently, this driver has the following limitations:
 *
 *  o It doesn't decode the row and column addresses.
 *  o It doesn't encode from a DIMM address to a system address.
 *  o It doesn't properly support lockstep and mirroring modes on Sandy
 *    Bridge through Broadwell platforms.
 *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
 *    platforms.
 *  o It doesn't properly handle Intel Optane (3D XPoint) NVDIMMs.
 *  o It doesn't know how to decode three way channel interleaving.
 *
 * None of these are intrinsic problems to the driver, it's mostly a matter
 * of having proper documentation and testing.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/cred.h>
#include <sys/pci.h>
#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <sys/stat.h>
#include <sys/policy.h>

#include <sys/cpu_module.h>
#include <sys/mc.h>
#include <sys/mc_intel.h>

#include "imc.h"

/*
 * These tables contain generational data that varies between processor
 * generations, such as the maximum number of sockets, memory controllers,
 * and the offsets of the various registers.
 */

static const imc_gen_data_t imc_gen_data_snb = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0x80,
	.igd_tohm_low_offset = 0x84,
	.igd_sad_dram_offset = 0x80,
	.igd_sad_ndram_rules = 10,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_ivb = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0x80,
	.igd_tohm_low_offset = 0x84,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 20,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_has_brd = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0xd0,
	.igd_tohm_low_offset = 0xd4,
	.igd_tohm_hi_offset = 0xd8,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 20,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_skx = {
	.igd_max_sockets = 8,
	.igd_max_imcs = 2,
	.igd_max_channels = 3,
	.igd_max_dimms = 2,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
	.igd_mcmtr_offset = 0x87c,
	.igd_topo_offset = 0x88,
	.igd_tolm_offset = 0xd0,
	.igd_tohm_low_offset = 0xd4,
	.igd_tohm_hi_offset = 0xd8,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 24,
	.igd_sad_nodeid_offset = 0xc0,
	.igd_tad_nrules = 8,
	.igd_tad_rule_offset = 0x850,
	.igd_tad_chan_offset = 0x90,
	.igd_rir_nways = 4,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 4,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xcc,
};

/*
 * This table contains all of the devices that we're looking for from a stub
 * perspective. These are organized by generation. Different generations
 * behave in slightly different ways. For example, Sandy Bridge through
 * Broadwell use unique PCI IDs for each PCI device/function combination that
 * appears, whereas Skylake based systems use the same PCI ID and instead
 * rely on different device/function values to indicate what an ID is being
 * used for.
 */
/* BEGIN CSTYLED */
static const imc_stub_table_t imc_stub_table[] = {
	/* Sandy Bridge */
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 1" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
	{ IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
	{ IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
	{ IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
	/* Ivy Bridge */
	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 29, 2, "IMC 1 Channel 0 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 29, 3, "IMC 1 Channel 1 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 29, 4, "IMC 1 Channel 2 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 29, 5, "IMC 1 Channel 3 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
	{ IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
	{ IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
	{ IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
	{ IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
	{ IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
	{ IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
	/* Haswell */
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
	{ IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Virtualization" },
	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
	{ IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
	{ IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
	/* Broadwell Devices */
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Virtualization" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
	/* Skylake and Cascade Lake Devices */
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 1 M2M" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 1 Main / Channel 0" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },

	/*
	 * There is one SAD MC Route type device per core! Because of this, a
	 * wide array of devices and functions is allocated. For now, we list
	 * all 32 of them out.
	 */
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },

	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
};
/* END CSTYLED */

#define	IMC_PCI_VENDOR_INTC	0x8086

/*
 * Our IMC data is global and statically set up during a combination of
 * _init(9E) and attach(9E). While we have a module dependency between the PCI
 * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
 * guarantee that the imc driver has finished attaching. As such, we make sure
 * that it can operate without it being attached in any way.
 */
static imc_t *imc_data = NULL;

/*
 * By default we should not allow the stubs to detach as we don't have a good
 * way of forcing them to attach again. This is provided in case someone does
 * want to allow the driver to unload.
 */
int imc_allow_detach = 0;

static void
imc_set_gen_data(imc_t *imc)
{
	switch (imc->imc_gen) {
	case IMC_GEN_SANDY:
		imc->imc_gen_data = &imc_gen_data_snb;
		break;
	case IMC_GEN_IVY:
		imc->imc_gen_data = &imc_gen_data_ivb;
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
		imc->imc_gen_data = &imc_gen_data_has_brd;
		break;
	case IMC_GEN_SKYLAKE:
		imc->imc_gen_data = &imc_gen_data_skx;
		break;
	default:
		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
		    "set to unknown generation: %u", imc->imc_gen);
	}
}

/*
 * If our device (dev_info_t) does not have a non-zero unit address, then
 * devfsadmd will not pay attention to us at all. Therefore we need to set the
 * unit address below, before we create minor nodes.
 *
 * The rest of the system expects us to have one minor node per socket. The
 * minor node ID should be the ID of the socket.
 */
static boolean_t
imc_create_minors(imc_t *imc)
{
	uint_t i;

	ddi_set_name_addr(imc->imc_dip, "1");
	for (i = 0; i < imc->imc_nsockets; i++) {
		char buf[MAXNAMELEN];

		if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
		    sizeof (buf)) {
			goto fail;
		}

		if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
		    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
			dev_err(imc->imc_dip, CE_WARN, "failed to create "
			    "minor node %u: %s", i, buf);
			goto fail;
		}
	}
	return (B_TRUE);

fail:
	ddi_remove_minor_node(imc->imc_dip, NULL);
	return (B_FALSE);
}
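
/*
 * Hypothetical userland usage of the minor nodes created above (the path is
 * illustrative and not a guaranteed interface):
 *
 *	int fd = open("/devices/pseudo/imc@1:mc-imc-0", O_RDONLY);
 *
 * The minor name encodes the socket and the minor number is the socket ID,
 * which is how consumers such as mcdecode select the socket to query.
 */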

/*
 * Check the current MC route value for this SAD. On Skylake systems there is
 * one per core. Every core should agree. If they don't, we will not trust
 * the SAD MCROUTE values and this will cause system address decoding to fail
 * on Skylake.
 */
static void
imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
{
	uint32_t val;

	val = pci_config_get32(stub->istub_cfgspace,
	    IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
	if (val == PCI_EINVAL32) {
		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
		return;
	}

	if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
		sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
		sad->isad_mcroute.ismc_raw_mcroute = val;
		return;
	}

	/*
	 * Occasionally we see MC ROUTE table entries with a value of zero.
	 * We should ignore those for now.
	 */
	if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
		dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
		    "with socket. SAD has val 0x%x, system has %x\n",
		    val, sad->isad_mcroute.ismc_raw_mcroute);
		sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
	}
}

/*
 * On Skylake, many of the devices that we care about are on separate PCI
 * buses. These can be mapped together by the DECS register. However, we need
 * to know how to map different buses together so that we can more usefully
 * associate information. The set of buses is all present in the DECS
 * register. We'll effectively assign sockets to buses. This also comes up on
 * pre-Skylake systems.
 */
static boolean_t
imc_map_buses(imc_t *imc)
{
	imc_stub_t *stub;
	uint_t nsock;

	/*
	 * Find the UBOX_DECS registers so we can establish socket mappings.
	 * On Skylake, there are three different sets of buses that we need to
	 * cover all of our devices, while there are only two before that.
	 */
	for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
		uint32_t busno;

		if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
			continue;
		}

		busno = pci_config_get32(stub->istub_cfgspace,
		    imc->imc_gen_data->igd_ubox_cpubusno_offset);
		if (busno == PCI_EINVAL32) {
			dev_err(imc->imc_dip, CE_WARN, "failed to read "
			    "UBOX_DECS CPUBUSNO0: invalid PCI read");
			return (B_FALSE);
		}

		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
			imc->imc_sockets[nsock].isock_nbus = 3;
			imc->imc_sockets[nsock].isock_bus[0] =
			    IMC_UBOX_CPUBUSNO_0(busno);
			imc->imc_sockets[nsock].isock_bus[1] =
			    IMC_UBOX_CPUBUSNO_1(busno);
			imc->imc_sockets[nsock].isock_bus[2] =
			    IMC_UBOX_CPUBUSNO_2(busno);
		} else {
			imc->imc_sockets[nsock].isock_bus[0] =
			    IMC_UBOX_CPUBUSNO_0(busno);
			imc->imc_sockets[nsock].isock_bus[1] =
			    IMC_UBOX_CPUBUSNO_1(busno);
			imc->imc_sockets[nsock].isock_nbus = 2;
		}
		nsock++;
	}
	imc->imc_nsockets = nsock;

	return (B_TRUE);
}
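
/*
 * To illustrate the bus mapping with a made-up register value (the actual
 * field layout is abstracted behind the IMC_UBOX_CPUBUSNO_* macros in
 * imc.h), a hypothetical Skylake read of busno = 0x00856a17 might decompose
 * as:
 *
 *	isock_bus[0] = IMC_UBOX_CPUBUSNO_0(busno)	e.g. 0x17
 *	isock_bus[1] = IMC_UBOX_CPUBUSNO_1(busno)	e.g. 0x6a
 *	isock_bus[2] = IMC_UBOX_CPUBUSNO_2(busno)	e.g. 0x85
 *
 * Any stub subsequently found on PCI bus 0x17, 0x6a, or 0x85 would then be
 * associated with this socket by imc_map_find_socket() below.
 */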

/*
 * For a given stub that we've found, map it to its corresponding socket based
 * on the PCI bus that it has.
 */
static imc_socket_t *
imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
{
	uint_t i;

	for (i = 0; i < imc->imc_nsockets; i++) {
		uint_t bus;

		for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
			if (imc->imc_sockets[i].isock_bus[bus] ==
			    stub->istub_bus) {
				return (&imc->imc_sockets[i]);
			}
		}
	}

	return (NULL);
}

static boolean_t
imc_map_stubs(imc_t *imc)
{
	imc_stub_t *stub;

	if (!imc_map_buses(imc)) {
		return (B_FALSE);
	}

	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
		imc_socket_t *sock = imc_map_find_socket(imc, stub);

		if (sock == NULL) {
			dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
			    "PCI%x,%x with bdf %u/%u/%u that does not match a "
			    "known PCI bus for any of %u sockets",
			    stub->istub_table->imcs_type, stub->istub_vid,
			    stub->istub_did, stub->istub_bus, stub->istub_dev,
			    stub->istub_func, imc->imc_nsockets);
			continue;
		}

		/*
		 * We don't have to worry about duplicates here, as we have
		 * already made sure that the bdfs are unique.
		 */
		switch (stub->istub_table->imcs_type) {
		case IMC_TYPE_MC0_M2M:
			sock->isock_imcs[0].icn_m2m = stub;
			break;
		case IMC_TYPE_MC1_M2M:
			sock->isock_imcs[1].icn_m2m = stub;
			break;
		case IMC_TYPE_MC0_MAIN0:
			sock->isock_nimc++;
			sock->isock_imcs[0].icn_main0 = stub;

			/*
			 * On Skylake, the MAIN0 does double duty as channel
			 * zero and as the TAD.
			 */
			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
				sock->isock_imcs[0].icn_nchannels++;
				sock->isock_imcs[0].icn_channels[0].ich_desc =
				    stub;
				sock->isock_tad[0].itad_stub = stub;
				sock->isock_ntad++;
			}
			break;
		case IMC_TYPE_MC0_MAIN1:
			sock->isock_imcs[0].icn_main1 = stub;
			break;
		case IMC_TYPE_MC1_MAIN0:
			sock->isock_nimc++;
			sock->isock_imcs[1].icn_main0 = stub;

			/*
			 * On Skylake, the MAIN0 does double duty as channel
			 * zero and as the TAD.
			 */
			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
				sock->isock_imcs[1].icn_nchannels++;
				sock->isock_imcs[1].icn_channels[0].ich_desc =
				    stub;
				sock->isock_tad[1].itad_stub = stub;
				sock->isock_ntad++;
			}
			break;
		case IMC_TYPE_MC1_MAIN1:
			sock->isock_imcs[1].icn_main1 = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL0:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL1:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL2:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL3:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL0:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL1:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL2:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL3:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
			break;
		case IMC_TYPE_SAD_DRAM:
			sock->isock_sad.isad_dram = stub;
			break;
		case IMC_TYPE_SAD_MMIO:
			sock->isock_sad.isad_mmio = stub;
			break;
		case IMC_TYPE_SAD_MISC:
			sock->isock_sad.isad_tolh = stub;
			break;
		case IMC_TYPE_VTD_MISC:
			/*
			 * Some systems have multiple VT-d Misc. entry points
			 * in the system. In this case, only use the first one
			 * we find.
			 */
			if (imc->imc_gvtd_misc == NULL) {
				imc->imc_gvtd_misc = stub;
			}
			break;
		case IMC_TYPE_SAD_MCROUTE:
			ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
			imc_mcroute_check(imc, &sock->isock_sad, stub);
			break;
		case IMC_TYPE_UBOX:
			sock->isock_ubox = stub;
			break;
		case IMC_TYPE_HA0:
			sock->isock_ntad++;
			sock->isock_tad[0].itad_stub = stub;
			break;
		case IMC_TYPE_HA1:
			sock->isock_ntad++;
			sock->isock_tad[1].itad_stub = stub;
			break;
		case IMC_TYPE_UBOX_CPUBUSNO:
			sock->isock_cpubusno = stub;
			break;
		default:
			/*
			 * Attempt to still attach if we can.
			 */
			dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
			    "IMC type (%u) on PCI %x,%x",
			    stub->istub_table->imcs_type,
			    stub->istub_vid, stub->istub_did);
			break;
		}
	}

	return (B_TRUE);
}

/*
 * Go through and fix up various aspects of the stubs mappings on systems. The
 * following is a list of what we need to fix up:
 *
 *  1. On Haswell and newer systems, there is only one global VT-d device. We
 *     need to go back and map that to all of the per-socket imc_sad_t
 *     entries.
 */
static void
imc_fixup_stubs(imc_t *imc)
{
	if (imc->imc_gen >= IMC_GEN_HASWELL) {
		uint_t i;

		for (i = 0; i < imc->imc_nsockets; i++) {
			ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
			    ==, NULL);
			imc->imc_sockets[i].isock_sad.isad_tolh =
			    imc->imc_gvtd_misc;
		}
	}
}
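
/*
 * For example, after imc_fixup_stubs() runs on a hypothetical two-socket
 * Haswell system, both sockets point at the single global VT-d stub:
 *
 *	imc->imc_sockets[0].isock_sad.isad_tolh == imc->imc_gvtd_misc
 *	imc->imc_sockets[1].isock_sad.isad_tolh == imc->imc_gvtd_misc
 *
 * This is what later lets imc_sad_read_tohm() read TOLM/TOHM uniformly
 * through sad->isad_tolh, regardless of generation.
 */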

/*
 * In the wild we've hit a few odd cases where firmware does not expose all
 * of the devices that we might expect. In particular, we've seen and
 * validated the following cases:
 *
 *  o We don't find all of the channel devices that we expect, e.g. we have
 *    the stubs for channels 1-3, but not 0. That has been seen on an Intel
 *    S2600CW with an E5-2630v3.
 */
static boolean_t
imc_validate_stubs(imc_t *imc)
{
	for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
		imc_socket_t *socket = &imc->imc_sockets[sock];

		for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
			imc_mc_t *mcp = &socket->isock_imcs[mc];

			for (uint_t chan = 0; chan < mcp->icn_nchannels;
			    chan++) {
				if (mcp->icn_channels[chan].ich_desc == NULL) {
					dev_err(imc->imc_dip, CE_WARN,
					    "!missing device for socket %u/"
					    "imc %u/channel %u", sock, mc,
					    chan);
					return (B_FALSE);
				}
			}
		}
	}

	return (B_TRUE);
}

/*
 * Attempt to map all of the discovered sockets to the corresponding APIC
 * based socket. We do these mappings by getting the node id of the socket
 * and adjusting it to make sure that no home agent is present in it. We use
 * the UBOX to avoid any home agent related bits that are present in other
 * registers.
 */
static void
imc_map_sockets(imc_t *imc)
{
	uint_t i;

	for (i = 0; i < imc->imc_nsockets; i++) {
		uint32_t nodeid;
		ddi_acc_handle_t h;

		h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
		nodeid = pci_config_get32(h,
		    imc->imc_gen_data->igd_sad_nodeid_offset);
		if (nodeid == PCI_EINVAL32) {
			imc->imc_sockets[i].isock_valid |=
			    IMC_SOCKET_V_BAD_NODEID;
			continue;
		}

		imc->imc_sockets[i].isock_nodeid =
		    IMC_NODEID_UBOX_MASK(nodeid);
		imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
	}
}

/*
 * Decode the MTR, accounting for variances between processor generations.
 */
static void
imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
{
	uint8_t disable;

	/*
	 * Check present first, before worrying about anything else.
	 */
	if (imc->imc_gen < IMC_GEN_SKYLAKE &&
	    IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
		dimm->idimm_present = B_FALSE;
		return;
	} else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
	    IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
		dimm->idimm_present = B_FALSE;
		return;
	}

	dimm->idimm_present = B_TRUE;
	dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
	if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
	    dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
		dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
	}

	dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
	if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
	    dimm->idimm_nrows > IMC_MTR_RA_MAX) {
		dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
	}

	/*
	 * Determine the density. This information is not present on Sandy
	 * Bridge.
	 */
	switch (imc->imc_gen) {
	case IMC_GEN_IVY:
		dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
		switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
		case 0:
		default:
			dimm->idimm_density = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
			break;
		case 1:
			dimm->idimm_density = 2;
			break;
		case 2:
			dimm->idimm_density = 4;
			break;
		case 3:
			dimm->idimm_density = 8;
			break;
		}
		break;
	case IMC_GEN_SKYLAKE:
		switch (IMC_MTR_DENSITY_SKX(mtr)) {
		case 0:
		default:
			dimm->idimm_density = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
			break;
		case 1:
			dimm->idimm_density = 2;
			break;
		case 2:
			dimm->idimm_density = 4;
			break;
		case 3:
			dimm->idimm_density = 8;
			break;
		case 4:
			dimm->idimm_density = 16;
			break;
		case 5:
			dimm->idimm_density = 12;
			break;
		}
		break;
	case IMC_GEN_UNKNOWN:
	case IMC_GEN_SANDY:
		dimm->idimm_density = 0;
		break;
	}

	/*
	 * The values of width are the same on Ivy Bridge through Skylake, but
	 * the bits are different. This doesn't exist on Sandy Bridge.
	 */
	if (imc->imc_gen > IMC_GEN_SANDY) {
		uint8_t width;

		if (imc->imc_gen >= IMC_GEN_BROADWELL) {
			width = IMC_MTR_WIDTH_BRD_SKX(mtr);
		} else {
			width = IMC_MTR_WIDTH_IVB_HAS(mtr);
		}
		switch (width) {
		case 0:
			dimm->idimm_width = 4;
			break;
		case 1:
			dimm->idimm_width = 8;
			break;
		case 2:
			dimm->idimm_width = 16;
			break;
		default:
			dimm->idimm_width = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
			break;
		}
	} else {
		dimm->idimm_width = 0;
	}

	dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
	switch (imc->imc_gen) {
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
	case IMC_GEN_SKYLAKE:
		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
			dimm->idimm_nranks = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
		}
		break;
	default:
		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
			dimm->idimm_nranks = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
		}
	}

	disable = IMC_MTR_RANK_DISABLE(mtr);
	dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
	dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
	dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
	dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;

	/*
	 * Only Haswell and later have this information.
	 */
	if (imc->imc_gen >= IMC_GEN_HASWELL) {
		dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
		dimm->idimm_hdrl_parity =
		    IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
		dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
		if (dimm->idimm_3dsranks != 0) {
			dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
		}
	}

	if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
		dimm->idimm_nbanks = 16;
	} else {
		dimm->idimm_nbanks = 8;
	}

	/*
	 * To calculate the DIMM size we first need to take the number of rows
	 * and columns. This gives us the number of slots per chip. In a given
	 * rank there are nbanks of these, and there are nranks of those. Each
	 * of these slots can fit a byte.
	 */
	dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
	    (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
}
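
/*
 * As a worked example of the size calculation above, with values chosen
 * purely for illustration: a DDR4 DIMM with 16 banks, 2 ranks, 10 column
 * bits, and 16 row bits would decode as
 *
 *	idimm_size = 16 * 2 * 8 * (1ULL << (10 + 16))
 *	           = 256 * 64 MiB
 *	           = 16 GiB
 */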

static void
imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
{
	uint_t i;

	/*
	 * There's one register for each DIMM that might be present; we always
	 * read it to determine information about the DIMMs.
	 */
	chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
		uint32_t mtr;
		imc_dimm_t *dimm = &chan->ich_dimms[i];

		bzero(dimm, sizeof (imc_dimm_t));
		mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
		    imc->imc_gen_data->igd_mtr_offsets[i]);
		dimm->idimm_mtr = mtr;
		/*
		 * We don't really expect to get a bad PCIe read. However, if
		 * we do, treat that for the moment as though the DIMM is bad.
		 */
		if (mtr == PCI_EINVAL32) {
			dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
			continue;
		}

		imc_decode_mtr(imc, icn, dimm, mtr);
	}
}

static boolean_t
imc_fill_controller(imc_t *imc, imc_mc_t *icn)
{
	uint32_t mcmtr;

	mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
	    imc->imc_gen_data->igd_mcmtr_offset);
	if (mcmtr == PCI_EINVAL32) {
		icn->icn_invalid = B_TRUE;
		return (B_FALSE);
	}

	icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
		icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
	} else {
		icn->icn_lockstep = B_FALSE;
	}

	icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;

	/*
	 * Sandy Bridge and Ivy Bridge only support DDR3. Haswell and
	 * Broadwell may support DDR4, depending on the SKU. Skylake only
	 * supports DDR4.
	 */
	switch (imc->imc_gen) {
	case IMC_GEN_SANDY:
	case IMC_GEN_IVY:
		icn->icn_dimm_type = IMC_DIMM_DDR3;
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
		if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
			icn->icn_dimm_type = IMC_DIMM_DDR4;
		} else {
			icn->icn_dimm_type = IMC_DIMM_DDR3;
		}
		break;
	default:
		/*
		 * Skylake and on are all DDR4.
		 */
		icn->icn_dimm_type = IMC_DIMM_DDR4;
		break;
	}

	if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
		icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
		    imc->imc_gen_data->igd_topo_offset);
	}

	return (B_TRUE);
}

/*
 * Walk the IMC data and fill in the information on DIMMs and the memory
 * controller configurations.
 */
static void
imc_fill_data(imc_t *imc)
{
	uint_t csock, cmc, cchan;

	for (csock = 0; csock < imc->imc_nsockets; csock++) {
		imc_socket_t *sock = &imc->imc_sockets[csock];

		for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
			imc_mc_t *icn = &sock->isock_imcs[cmc];

			if (!imc_fill_controller(imc, icn))
				continue;

			for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
				imc_fill_dimms(imc, icn,
				    &icn->icn_channels[cchan]);
			}
		}
	}
}

static nvlist_t *
imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
{
	nvlist_t *nvl;

	nvl = fnvlist_alloc();
	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
	    dimm->idimm_present);
	if (!dimm->idimm_present) {
		return (nvl);
	}

	fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
	    dimm->idimm_ncolumns);
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
	    dimm->idimm_nrows);

	if (imc->imc_gen > IMC_GEN_SANDY) {
		fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
		    dimm->idimm_density * (1ULL << 30));
		fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
		    dimm->idimm_width);
	}
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
	    dimm->idimm_nranks);
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
	    dimm->idimm_nbanks);
	fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
	    dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);

	if (imc->imc_gen >= IMC_GEN_HASWELL) {
		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
		    dimm->idimm_hdrl);
		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
		    dimm->idimm_hdrl_parity);
		if (dimm->idimm_3dsranks > 0) {
			fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
			    dimm->idimm_3dsranks);
		}
	}

	return (nvl);
}

static nvlist_t *
imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
{
	nvlist_t *nvl;
	nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
	uint_t i;

	nvl = fnvlist_alloc();
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
	    imc->imc_gen_data->igd_max_dimms);
	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
		dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
	}

	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
	    dimms, i);

	for (; i > 0; i--) {
		nvlist_free(dimms[i-1]);
	}

	return (nvl);
}

static nvlist_t *
imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
{
	nvlist_t *nvl;
	nvlist_t *channels[IMC_MAX_CHANPERMC];
	uint_t i;

	nvl = fnvlist_alloc();
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN,
	    icn->icn_nchannels);
	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
	    icn->icn_ecc);
	if (icn->icn_lockstep) {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
	} else {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
	}

	if (icn->icn_closed) {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
		    MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
	} else {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
		    MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
	}

	for (i = 0; i < icn->icn_nchannels; i++) {
		channels[i] = imc_nvl_create_channel(imc,
		    &icn->icn_channels[i]);
	}
	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
	    channels, icn->icn_nchannels);
	for (i = 0; i < icn->icn_nchannels; i++) {
		nvlist_free(channels[i]);
	}

	return (nvl);
}

static void
imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
{
	char *buf = NULL;
	size_t len = 0;
	int kmflag;

	if (sock->isock_nvl == NULL)
		return;

	if (sock->isock_buf != NULL)
		return;

	if (sleep) {
		kmflag = KM_SLEEP;
	} else {
		kmflag = KM_NOSLEEP_LAZY;
	}

	if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
	    kmflag) != 0) {
		return;
	}

	sock->isock_buf = buf;
	sock->isock_buflen = len;
	sock->isock_gen++;
}

static void
imc_decoder_pack(imc_t *imc)
{
	char *buf = NULL;
	size_t len = 0;

	if (imc->imc_decoder_buf != NULL)
		return;

	if (imc->imc_decoder_dump == NULL) {
		imc->imc_decoder_dump = imc_dump_decoder(imc);
	}

	if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
	    KM_NOSLEEP_LAZY) != 0) {
		return;
	}

	imc->imc_decoder_buf = buf;
	imc->imc_decoder_len = len;
}

static void
imc_nvl_create(imc_t *imc)
{
	uint_t csock;

	for (csock = 0; csock < imc->imc_nsockets; csock++) {
		uint_t i;
		nvlist_t *nvl;
		nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
		imc_socket_t *sock = &imc->imc_sockets[csock];

		nvl = fnvlist_alloc();
		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
		    MCINTEL_NVLIST_VERS1);
		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
		    sock->isock_nimc);

		for (i = 0; i < sock->isock_nimc; i++) {
			mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
		}

		fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
		    mcs, sock->isock_nimc);

		for (i = 0; i < sock->isock_nimc; i++) {
			nvlist_free(mcs[i]);
		}

		sock->isock_nvl = nvl;
		imc_nvl_pack(sock, B_TRUE);
	}
}
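
/*
 * Taken together, the per-socket snapshot nvlist built above has roughly the
 * following shape (keys abbreviated to their MCINTEL_NVLIST_V1_* meanings;
 * this is a sketch, not a formal schema):
 *
 *	version (uint8), nmc (uint8)
 *	mcs[nmc]:
 *		nchan, ecc, chan_mode, policy
 *		channels[nchan]:
 *			ndpc
 *			dimms[ndpc]:
 *				present, size, ncols, nrows, density,
 *				width, ranks, banks, rdis[], hdrl,
 *				hdrlp, 3drank
 */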

/*
 * Determine the top of low and high memory. These determine whether
 * transaction addresses target main memory or not. Unfortunately, the way
 * that these are stored and fetched changes with different generations.
 */
static void
imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
{
	uint32_t tolm, tohm_low, tohm_hi;

	tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
	    imc->imc_gen_data->igd_tolm_offset);
	tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
	    imc->imc_gen_data->igd_tohm_low_offset);
	if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
		tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
		    imc->imc_gen_data->igd_tohm_hi_offset);
	} else {
		tohm_hi = 0;
	}

	if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
	    tohm_hi == PCI_EINVAL32) {
		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
		return;
	}

	switch (imc->imc_gen) {
	case IMC_GEN_SANDY:
	case IMC_GEN_IVY:
		sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
		    IMC_TOLM_SNB_IVY_SHIFT;
		sad->isad_tohm = ((uint64_t)tohm_low &
		    IMC_TOHM_SNB_IVY_MASK) << IMC_TOLM_SNB_IVY_SHIFT;
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
	case IMC_GEN_SKYLAKE:
		sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
		sad->isad_tohm = ((uint64_t)tohm_low &
		    IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);

		/*
		 * Adjust the values to turn them into an exclusive range.
		 */
		sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
		sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
		break;
	default:
		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
		    "set to unknown generation: %u", imc->imc_gen);
		return;
	}
}
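
/*
 * With the exclusive TOLM/TOHM values in hand, a system address can be
 * classified roughly as follows. This is a simplified sketch that ignores
 * legacy holes and MMIO carve-outs below 4 GiB:
 *
 *	if (addr < sad->isad_tolm)
 *		-> low DRAM range
 *	else if (addr >= 0x100000000ULL && addr < sad->isad_tohm)
 *		-> high DRAM range
 *	else
 *		-> not main memory (MMIO, etc.)
 */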
1593 	 */
1594 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1595 		limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
1596 	} else {
1597 		limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
1598 	}
1599 	rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
1600 	    IMC_SAD_DRAM_LIMIT_EXCLUSIVE;
1601 
1602 	/*
1603 	 * The rest of this does not apply to Sandy Bridge.
1604 	 */
1605 	if (imc->imc_gen == IMC_GEN_SANDY)
1606 		return;
1607 
1608 	if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
1609 		rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
1610 		return;
1611 	}
1612 
1613 	switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
1614 	case IMC_SAD_DRAM_MOD23_MOD3:
1615 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
1616 		break;
1617 	case IMC_SAD_DRAM_MOD23_MOD2_C01:
1618 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
1619 		break;
1620 	case IMC_SAD_DRAM_MOD23_MOD2_C12:
1621 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
1622 		break;
1623 	case IMC_SAD_DRAM_MOD23_MOD2_C02:
1624 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
1625 		break;
1626 	}
1627 
1628 	rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
1629 	switch (IMC_SAD_DRAM_MOD3_MODE_SKX(raw)) {
1630 	case IMC_SAD_DRAM_MOD3_MODE_45t6:
1631 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
1632 		break;
1633 	case IMC_SAD_DRAM_MOD3_MODE_45t8:
1634 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
1635 		break;
1636 	case IMC_SAD_DRAM_MOD3_MODE_45t12:
1637 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
1638 		break;
1639 	default:
1640 		sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
1641 		break;
1642 	}
1643 }
1644 
1645 static void
1646 imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1647 {
1648 	uint_t i;
1649 	uint32_t mlen, mbase, skipbits, skipafter;
1650 
1651 	rule->isr_raw_interleave = raw;
1652 
1653 	/*
1654 	 * Right now all architectures always have the maximum number of SAD
1655 	 * interleave targets.
1656 	 */
1657 	rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1658 
1659 	/*
1660 	 * Sandy Bridge has a gap in the interleave list because it uses a
1661 	 * smaller field width.
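	 *
	 * To illustrate the gap, suppose the Sandy Bridge field width
	 * (IMC_SAD_ILEAVE_SNB_LEN) were 3 bits; the actual width comes from
	 * the header. The shift for target i is i * mlen, plus skipbits once
	 * i reaches skipafter. With mlen = 3, skipbits = 2, and skipafter =
	 * 4, that works out to:
	 *
	 *	target:	0	1	2	3	4	5	6	7
	 *	shift:	0	3	6	9	14	17	20	23
	 *
	 * Targets four and up sit two bits higher than i * mlen alone would
	 * place them, leaving an unused two-bit hole in the middle of the
	 * register.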
1662 	 */
1663 	if (imc->imc_gen > IMC_GEN_SANDY) {
1664 		mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1665 		mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1666 		skipbits = skipafter = 0;
1667 	} else {
1668 		mlen = IMC_SAD_ILEAVE_SNB_LEN;
1669 		mbase = IMC_SAD_ILEAVE_SNB_MASK;
1670 		skipbits = 2;
1671 		skipafter = 4;
1672 	}
1673 
1674 	for (i = 0; i < rule->isr_ntargets; i++) {
1675 		uint32_t mask, shift;
1676 
1677 		shift = i * mlen;
1678 		if (i >= skipafter)
1679 			shift += skipbits;
1680 		mask = mbase << shift;
1681 		rule->isr_targets[i] = (raw & mask) >> shift;
1682 	}
1683 }
1684 
1685 static void
1686 imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1687 {
1688 	uint_t i;
1689 	off_t off;
1690 
1691 	sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1692 	for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1693 	    i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1694 		uint32_t dram, interleave;
1695 		imc_sad_rule_t *rule = &sad->isad_rules[i];
1696 
1697 		dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1698 		interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1699 		    off + 4);
1700 
1701 		if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1702 			sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1703 			return;
1704 		}
1705 
1706 		imc_sad_fill_rule(imc, sad, rule, dram);
1707 		imc_sad_fill_rule_interleave(imc, rule, interleave);
1708 	}
1709 }
1710 
1711 static void
1712 imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1713 {
1714 	uint_t i;
1715 	imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1716 
1717 	if (imc->imc_gen < IMC_GEN_SKYLAKE)
1718 		return;
1719 	if (sad->isad_valid != 0)
1720 		return;
1721 
1722 	mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1723 	for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1724 		uint_t chanoff, ringoff;
1725 
1726 		ringoff = i * IMC_MC_ROUTE_RING_BITS;
1727 		chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1728 
1729 		mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1730 		    ringoff) & IMC_MC_ROUTE_RING_MASK;
1731 		mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1732 		    chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1733 	}
1734 }
1735 
1736 /*
1737  * Initialize the SAD. To do this we have to do a few different things:
1738  *
1739  * 1. Determine where the top of low and high memory is.
1740  * 2. Read and decode all of the rules for the SAD.
1741  * 3. On systems with a route table, decode the raw routes.
1742  *
1743  * At this point in time, we treat TOLM and TOHM as per-socket constructs,
1744  * even though they are really global; this just makes life a bit simpler.
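 *
 * For a sense of how the rules read here are eventually consumed, this is
 * roughly what the lookup in imc_decode_pa() has to do. This is a sketch
 * only, under the assumption that rules are programmed in ascending order of
 * their exclusive limits; see imc_decode_pa() for the real logic:
 *
 *	const imc_sad_rule_t *
 *	example_sad_lookup(const imc_sad_t *sad, uint64_t pa)
 *	{
 *		uint_t i;
 *
 *		for (i = 0; i < sad->isad_nrules; i++) {
 *			const imc_sad_rule_t *r = &sad->isad_rules[i];
 *
 *			if (r->isr_enable && pa < r->isr_limit)
 *				return (r);
 *		}
 *		return (NULL);
 *	}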
1745  */
1746 static void
1747 imc_decoder_init_sad(imc_t *imc)
1748 {
1749 	uint_t i;
1750 
1751 	for (i = 0; i < imc->imc_nsockets; i++) {
1752 		imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1753 		imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1754 		imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1755 	}
1756 }
1757 
1758 static void
1759 imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
1760     imc_tad_rule_t *rule, uint32_t val)
1761 {
1762 	uint64_t limit;
1763 
1764 	limit = IMC_TAD_LIMIT(val);
1765 	rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
1766 	    IMC_TAD_LIMIT_EXCLUSIVE;
1767 	rule->itr_raw = val;
1768 
1769 	switch (IMC_TAD_SOCK_WAY(val)) {
1770 	case IMC_TAD_SOCK_WAY_1:
1771 		rule->itr_sock_way = 1;
1772 		break;
1773 	case IMC_TAD_SOCK_WAY_2:
1774 		rule->itr_sock_way = 2;
1775 		break;
1776 	case IMC_TAD_SOCK_WAY_4:
1777 		rule->itr_sock_way = 4;
1778 		break;
1779 	case IMC_TAD_SOCK_WAY_8:
1780 		rule->itr_sock_way = 8;
1781 		break;
1782 	}
1783 
1784 	rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
1785 	rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1786 	rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1787 
1788 	/*
1789 	 * Starting with Skylake, the targets that are used are no longer part
1790 	 * of the TAD; they come from the IMC route table instead.
1791 	 */
1792 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1793 		rule->itr_ntargets = 0;
1794 		return;
1795 	}
1796 
1797 	rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
1798 	rule->itr_targets[0] = IMC_TAD_TARG0(val);
1799 	rule->itr_targets[1] = IMC_TAD_TARG1(val);
1800 	rule->itr_targets[2] = IMC_TAD_TARG2(val);
1801 	rule->itr_targets[3] = IMC_TAD_TARG3(val);
1802 
1803 	if (prev == NULL) {
1804 		rule->itr_base = 0;
1805 	} else {
1806 		rule->itr_base = prev->itr_limit + 1;
1807 	}
1808 }
1809 
1810 static void
1811 imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1812     uint32_t val)
1813 {
1814 	uint64_t base;
1815 
1816 	rule->itr_raw_gran = val;
1817 	base = IMC_TAD_BASE_BASE(val);
1818 	rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1819 
1820 	switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
1821 	case IMC_TAD_BASE_CHAN_GRAN_64B:
1822 		rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1823 		break;
1824 	case IMC_TAD_BASE_CHAN_GRAN_256B:
1825 		rule->itr_chan_gran = IMC_TAD_GRAN_256B;
1826 		break;
1827 	case IMC_TAD_BASE_CHAN_GRAN_4KB:
1828 		rule->itr_chan_gran = IMC_TAD_GRAN_4KB;
1829 		break;
1830 	default:
1831 		tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
1832 		return;
1833 	}
1834 
1835 	switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1836 	case IMC_TAD_BASE_SOCK_GRAN_64B:
1837 		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1838 		break;
1839 	case IMC_TAD_BASE_SOCK_GRAN_256B:
1840 		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1841 		break;
1842 	case IMC_TAD_BASE_SOCK_GRAN_4KB:
1843 		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1844 		break;
1845 	case IMC_TAD_BASE_SOCK_GRAN_1GB:
1846 		rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1847 		break;
1848 	}
1849 }
1850 
1851 /*
1852  * When mirroring is enabled, at least on Sandy Bridge through Broadwell, the
1853  * documentation suggests that the channel wayness already takes this into
1854  * account and therefore should be accurate as read.
1855  */
1856 static void
1857 imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
1858 {
1859 	uint_t i;
1860 	off_t baseoff;
1861 	imc_tad_rule_t *prev;
1862 
1863 	tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
1864 	for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
1865 	    prev = NULL; i < tad->itad_nrules;
1866 	    i++, baseoff += sizeof (uint32_t)) {
1867 		uint32_t val;
1868 		off_t off;
1869 		imc_tad_rule_t *rule = &tad->itad_rules[i];
1870 
1871 		/*
1872 		 * On Skylake, the TAD rules are split between two registers;
1873 		 * the latter set mimics what exists on pre-Skylake parts.
1874 		 */
1875 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1876 			off = baseoff + IMC_SKX_WAYNESS_OFFSET;
1877 		} else {
1878 			off = baseoff;
1879 		}
1880 
1881 		val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
1882 		if (val == PCI_EINVAL32) {
1883 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1884 			return;
1885 		}
1886 
1887 		imc_tad_fill_rule(imc, tad, prev, rule, val);
1888 		prev = rule;
1889 		if (imc->imc_gen < IMC_GEN_SKYLAKE)
1890 			continue;
1891 
1892 		val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
1893 		if (val == PCI_EINVAL32) {
1894 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1895 			return;
1896 		}
1897 
1898 		imc_tad_fill_skx(imc, tad, rule, val);
1899 	}
1900 }
1901 
1902 /*
1903  * Check for features which change how decoding works.
1904  */
1905 static void
1906 imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1907 {
1908 	uint32_t val;
1909 
1910 	/*
1911 	 * Determine whether lockstep mode or mirroring is enabled. These
1912 	 * change how we're supposed to interpret channel wayness. Lockstep
1913 	 * is advertised in the TAD's features, while mirroring is advertised
1914 	 * in the IMC's features. Neither is present on Skylake+: there,
1915 	 * mirroring is a property of the SAD rule and there is no lockstep
1916 	 * at all.
1917 	 */
1918 	switch (imc->imc_gen) {
1919 	case IMC_GEN_SANDY:
1920 	case IMC_GEN_IVY:
1921 	case IMC_GEN_HASWELL:
1922 	case IMC_GEN_BROADWELL:
1923 		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1924 		    imc->imc_gen_data->igd_tad_sysdef);
1925 		if (val == PCI_EINVAL32) {
1926 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1927 			return;
1928 		}
1929 		if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1930 			tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1931 		}
1932 
1933 		val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1934 		    imc->imc_gen_data->igd_mc_mirror);
1935 		if (val == PCI_EINVAL32) {
1936 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1937 			return;
1938 		}
1939 		if (IMC_MC_MIRROR_SNB_BRD(val)) {
1940 			tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1941 		}
1942 		break;
1943 	default:
1944 		break;
1945 	}
1946 
1947 	/*
1948 	 * Now, go through and look at values that'll change how we do the
1949 	 * channel index and address calculation. These are only present
1950 	 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1951 	 * and they don't exist on Skylake+.
1952 	 */
1953 	switch (imc->imc_gen) {
1954 	case IMC_GEN_IVY:
1955 	case IMC_GEN_HASWELL:
1956 	case IMC_GEN_BROADWELL:
1957 		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1958 		    imc->imc_gen_data->igd_tad_sysdef2);
1959 		if (val == PCI_EINVAL32) {
1960 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1961 			return;
1962 		}
1963 		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1964 			tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
1965 		}
1966 		if (IMC_TAD_SYSDEF2_CHANHASH(val)) {
1967 			tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
1968 		}
1969 		break;
1970 	default:
1971 		break;
1972 	}
1973 }
1974 
1975 /*
1976  * Read the IMC channel interleave records.
1977  */
1978 static void
1979 imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1980 {
1981 	uint_t i;
1982 	off_t off;
1983 
1984 	chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1985 	for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1986 	    i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1987 		uint32_t val;
1988 		uint64_t offset;
1989 
1990 		val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1991 		    off);
1992 		if (val == PCI_EINVAL32) {
1993 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1994 			return;
1995 		}
1996 
1997 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1998 			offset = IMC_TADCHAN_OFFSET_SKX(val);
1999 		} else {
2000 			offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2001 		}
2002 
2003 		chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2004 		chan->ich_tad_offsets_raw[i] = val;
2005 	}
2006 }
2007 
2008 static void
2009 imc_decoder_init_tad(imc_t *imc)
2010 {
2011 	uint_t i;
2012 
2013 	for (i = 0; i < imc->imc_nsockets; i++) {
2014 		uint_t j;
2015 
2016 		for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2017 			imc_tad_read_features(imc,
2018 			    &imc->imc_sockets[i].isock_tad[j],
2019 			    &imc->imc_sockets[i].isock_imcs[j]);
2020 			imc_tad_read_rules(imc,
2021 			    &imc->imc_sockets[i].isock_tad[j]);
2022 		}
2023 	}
2024 
2025 	for (i = 0; i < imc->imc_nsockets; i++) {
2026 		uint_t j;
2027 		imc_socket_t *sock = &imc->imc_sockets[i];
2028 
2029 		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2030 			uint_t k;
2031 			imc_mc_t *mc = &sock->isock_imcs[j];
2032 
2033 			for (k = 0; k < mc->icn_nchannels; k++) {
2034 				imc_channel_t *chan = &mc->icn_channels[k];
2035 				imc_tad_read_interleave(imc, chan);
2036 			}
2037 		}
2038 	}
2039 }
2040 
2041 static void
2042 imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
2043     imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
2044 {
2045 	uint_t i;
2046 	off_t off, incr;
2047 
2048 	/*
2049 	 * Rank interleave offset registers come in two forms. Either they are
2050 	 * contiguous for a given wayness, meaning that all of the entries for
2051 	 * wayness zero are contiguous, or they are sparse, meaning that there
2052 	 * is a bank for entry zero for all wayness, then entry one for all
2053 	 * wayness, etc.
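	 *
	 * As a worked example, assume a hypothetical bank of four interleave
	 * entries per rule (the real count is igd_rir_nileaves) and 32-bit
	 * registers. For rule rirno = 1, the entry offsets relative to
	 * igd_rir_ileave_offset would be:
	 *
	 *	contiguous:	0x10, 0x14, 0x18, 0x1c
	 *	sparse:		0x04, 0x14, 0x24, 0x34
	 *
	 * In the contiguous form all of a rule's entries sit side by side;
	 * in the sparse form entry N of every rule is banked together, so
	 * consecutive entries of a single rule are one full bank apart.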
2054 */ 2055 if (contig) { 2056 off = imc->imc_gen_data->igd_rir_ileave_offset + 2057 (rirno * imc->imc_gen_data->igd_rir_nileaves * 2058 sizeof (uint32_t)); 2059 incr = sizeof (uint32_t); 2060 } else { 2061 off = imc->imc_gen_data->igd_rir_ileave_offset + 2062 (rirno * sizeof (uint32_t)); 2063 incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t); 2064 } 2065 for (i = 0; i < rank->irle_nentries; i++, off += incr) { 2066 uint32_t val; 2067 uint64_t offset; 2068 imc_rank_ileave_entry_t *ent = &rank->irle_entries[i]; 2069 2070 val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); 2071 if (val == PCI_EINVAL32) { 2072 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; 2073 return; 2074 } 2075 2076 switch (imc->imc_gen) { 2077 case IMC_GEN_BROADWELL: 2078 ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val); 2079 break; 2080 default: 2081 ent->irle_target = IMC_RIR_OFFSET_TARGET(val); 2082 break; 2083 } 2084 if (imc->imc_gen >= IMC_GEN_HASWELL) { 2085 offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val); 2086 } else { 2087 offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val); 2088 } 2089 ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT; 2090 } 2091 } 2092 2093 static void 2094 imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan) 2095 { 2096 uint_t i; 2097 off_t off; 2098 2099 chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways; 2100 for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset; 2101 i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) { 2102 uint32_t val; 2103 uint64_t lim; 2104 imc_rank_ileave_t *ent = &chan->ich_rankileaves[i]; 2105 2106 val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); 2107 if (val == PCI_EINVAL32) { 2108 chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; 2109 return; 2110 } 2111 2112 ent->irle_raw = val; 2113 ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0; 2114 ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val); 2115 ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val); 2116 if (imc->imc_gen >= IMC_GEN_HASWELL) { 2117 lim = IMC_RIR_LIMIT_HAS_SKX(val); 2118 } else { 2119 lim = IMC_RIR_LIMIT_SNB_IVB(val); 2120 } 2121 2122 ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) + 2123 IMC_RIR_LIMIT_EXCLUSIVE; 2124 2125 ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves; 2126 if (imc->imc_gen >= IMC_GEN_SKYLAKE) { 2127 imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE); 2128 } else { 2129 imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE); 2130 } 2131 } 2132 } 2133 2134 static void 2135 imc_decoder_init_rir(imc_t *imc) 2136 { 2137 uint_t i; 2138 2139 for (i = 0; i < imc->imc_nsockets; i++) { 2140 uint_t j; 2141 imc_socket_t *sock = &imc->imc_sockets[i]; 2142 2143 for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { 2144 uint_t k; 2145 imc_mc_t *mc = &sock->isock_imcs[j]; 2146 2147 for (k = 0; k < mc->icn_nchannels; k++) { 2148 imc_channel_t *chan = &mc->icn_channels[k]; 2149 imc_rir_read_wayness(imc, chan); 2150 } 2151 } 2152 } 2153 } 2154 2155 static cmi_errno_t 2156 imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo, 2157 uint32_t synd, int syndtype, mc_unum_t *unump) 2158 { 2159 imc_t *imc = arg; 2160 uint_t i; 2161 imc_decode_state_t dec; 2162 2163 bzero(&dec, sizeof (dec)); 2164 if (!imc_decode_pa(imc, pa, &dec)) { 2165 switch (dec.ids_fail) { 2166 case IMC_DECODE_F_LEGACY_RANGE: 2167 case IMC_DECODE_F_OUTSIDE_DRAM: 2168 return (CMIERR_MC_NOTDIMMADDR); 2169 default: 2170 return (CMIERR_MC_BADSTATE); 2171 } 2172 } 2173 2174 unump->unum_board = 0; 2175 /* 2176 * The chip id needs to be in the order that the OS expects it, 
which
2177  * may not be our order.
2178  */
2179 	for (i = 0; i < imc->imc_nsockets; i++) {
2180 		if (imc->imc_spointers[i] == dec.ids_socket)
2181 			break;
2182 	}
2183 	if (i == imc->imc_nsockets) {
2184 		return (CMIERR_MC_BADSTATE);
2185 	}
2186 	unump->unum_chip = i;
2187 	unump->unum_mc = dec.ids_tadid;
2188 	unump->unum_chan = dec.ids_channelid;
2189 	unump->unum_cs = dec.ids_dimmid;
2190 	unump->unum_rank = dec.ids_rankid;
2191 	unump->unum_offset = dec.ids_rankaddr;
2192 	for (i = 0; i < MC_UNUM_NDIMM; i++) {
2193 		unump->unum_dimms[i] = MC_INVALNUM;
2194 	}
2195 
2196 	return (CMI_SUCCESS);
2197 }
2198 
2199 static cmi_errno_t
2200 imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
2201 {
2202 	return (CMIERR_UNKNOWN);
2203 }
2204 
2205 static const cmi_mc_ops_t imc_mc_ops = {
2206 	.cmi_mc_patounum = imc_mc_patounum,
2207 	.cmi_mc_unumtopa = imc_mc_unumtopa
2208 };
2209 
2210 /*
2211  * This is where we really finish attaching and become open for business. This
2212  * occurs once we have all of the expected stubs attached. Here's where all of
2213  * the real fun begins.
2214  */
2215 static void
2216 imc_attach_complete(void *arg)
2217 {
2218 	imc_t *imc = arg;
2219 	cmi_errno_t err;
2220 
2221 	imc_set_gen_data(imc);
2222 
2223 	/*
2224 	 * On SKX and newer, we can fail to map PCI buses at this point due to
2225 	 * bad PCIe reads.
2226 	 */
2227 	if (!imc_map_stubs(imc)) {
2228 		goto done;
2229 	}
2230 
2231 	if (!imc_validate_stubs(imc)) {
2232 		imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2233 		goto done;
2234 	}
2235 
2236 	imc_fixup_stubs(imc);
2237 	imc_map_sockets(imc);
2238 
2239 	if (!imc_create_minors(imc)) {
2240 		goto done;
2241 	}
2242 
2243 	imc_fill_data(imc);
2244 	imc_nvl_create(imc);
2245 
2246 	/*
2247 	 * Gather additional information that we need so that we can properly
2248 	 * initialize the memory decoder and encoder.
2249 	 */
2250 	imc_decoder_init_sad(imc);
2251 	imc_decoder_init_tad(imc);
2252 	imc_decoder_init_rir(imc);
2253 
2254 	/*
2255 	 * Register our decoder functions with a global handle. This may
2256 	 * fail; if so, complain loudly, but stay attached so that the rest
2257 	 * of our data remains useful.
2258 	 */
2259 	if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2260 		imc->imc_flags |= IMC_F_MCREG_FAILED;
2261 		dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2262 		    "decoding operations: 0x%x", err);
2263 	}
2264 
2265 done:
2266 	mutex_enter(&imc->imc_lock);
2267 	imc->imc_flags &= ~IMC_F_ATTACH_DISPATCHED;
2268 	imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2269 	mutex_exit(&imc->imc_lock);
2270 }
2271 
2272 static int
2273 imc_stub_comparator(const void *l, const void *r)
2274 {
2275 	const imc_stub_t *sl = l, *sr = r;
2276 	if (sl->istub_bus > sr->istub_bus)
2277 		return (1);
2278 	if (sl->istub_bus < sr->istub_bus)
2279 		return (-1);
2280 	if (sl->istub_dev > sr->istub_dev)
2281 		return (1);
2282 	if (sl->istub_dev < sr->istub_dev)
2283 		return (-1);
2284 	if (sl->istub_func > sr->istub_func)
2285 		return (1);
2286 	if (sl->istub_func < sr->istub_func)
2287 		return (-1);
2288 	return (0);
2289 }
2290 
2291 static int
2292 imc_stub_scan_cb(dev_info_t *dip, void *arg)
2293 {
2294 	int vid, did;
2295 	const imc_stub_table_t *table;
2296 	imc_t *imc = arg;
2297 	int *regs;
2298 	uint_t i, nregs;
2299 
2300 	if (dip == ddi_root_node()) {
2301 		return (DDI_WALK_CONTINUE);
2302 	}
2303 
2304 	/*
2305 	 * Get the dev info name. PCI devices will always be children of PCI
2306 	 * devices today on x86. If we reach something that has a device name
2307 	 * that's not PCI, then we can prune its children.
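	 *
	 * For reference, regs[0] used below is the phys.hi cell of the IEEE
	 * 1275 PCI "reg" property, which encodes the bus in bits 23:16, the
	 * device in bits 15:11, and the function in bits 10:8. With a
	 * hypothetical phys.hi of 0x4100 we would have:
	 *
	 *	bus  = (0x4100 >> 16) & 0xff = 0
	 *	dev  = (0x4100 >> 11) & 0x1f = 8
	 *	func = (0x4100 >> 8) & 0x7 = 1
	 *
	 * which is what the PCI_REG_*_G() macros extract for the stub table
	 * match below.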
2308 	 */
2309 	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2310 		return (DDI_WALK_PRUNECHILD);
2311 	}
2312 
2313 	/*
2314 	 * Get the device and vendor ID and see if this is something the imc
2315 	 * knows about or cares about.
2316 	 */
2317 	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2318 	    "vendor-id", PCI_EINVAL16);
2319 	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2320 	    "device-id", PCI_EINVAL16);
2321 	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2322 		return (DDI_WALK_CONTINUE);
2323 	}
2324 
2325 	if (vid != IMC_PCI_VENDOR_INTC) {
2326 		return (DDI_WALK_PRUNECHILD);
2327 	}
2328 
2329 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2330 	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2331 		return (DDI_WALK_CONTINUE);
2332 	}
2333 
2334 	if (nregs == 0) {
2335 		ddi_prop_free(regs);
2336 		return (DDI_WALK_CONTINUE);
2337 	}
2338 
2339 	table = NULL;
2340 	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2341 		if (imc_stub_table[i].imcs_devid == did &&
2342 		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2343 		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2344 			table = &imc_stub_table[i];
2345 			break;
2346 		}
2347 	}
2348 	ddi_prop_free(regs);
2349 
2350 	/*
2351 	 * Not a match, not interesting.
2352 	 */
2353 	if (table == NULL) {
2354 		return (DDI_WALK_CONTINUE);
2355 	}
2356 
2357 	mutex_enter(&imc->imc_lock);
2358 	imc->imc_nscanned++;
2359 	mutex_exit(&imc->imc_lock);
2360 
2361 	return (DDI_WALK_CONTINUE);
2362 }
2363 
2364 /*
2365  * From here, walk the device tree and count how many of the devices that we
2366  * know about are actually present.
2367  */
2368 static void
2369 imc_stub_scan(void *arg)
2370 {
2371 	imc_t *imc = arg;
2372 	boolean_t dispatch = B_FALSE;
2373 
2374 	/*
2375 	 * Zero out the scan results in case we've been detached and
2376 	 * reattached.
2377 	 */
2378 	mutex_enter(&imc->imc_lock);
2379 	imc->imc_nscanned = 0;
2380 	mutex_exit(&imc->imc_lock);
2381 
2382 	ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2383 
2384 	mutex_enter(&imc->imc_lock);
2385 	imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2386 	imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2387 
2388 	/*
2389 	 * If the scan found no nodes, then that means that we're on a hardware
2390 	 * platform that we don't support. Therefore, there's no reason to do
2391 	 * anything here.
2392 	 */
2393 	if (imc->imc_nscanned == 0) {
2394 		imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2395 		mutex_exit(&imc->imc_lock);
2396 		return;
2397 	}
2398 
2399 	if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2400 		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2401 		dispatch = B_TRUE;
2402 	}
2403 
2404 	mutex_exit(&imc->imc_lock);
2405 
2406 	if (dispatch) {
2407 		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2408 		    imc, DDI_SLEEP);
2409 	}
2410 }
2411 
2412 /*
2413  * By default, refuse to allow stubs to detach.
2414  */
2415 int
2416 imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2417 {
2418 	imc_stub_t *stub;
2419 	imc_t *imc = imc_data;
2420 
2421 	mutex_enter(&imc->imc_lock);
2422 
2423 	/*
2424 	 * By default, we do not allow stubs to detach. However, if the driver
2425 	 * has attached to devices on a platform it doesn't recognize or
2426 	 * support, or if the override flag has been set, then allow detach to
2427 	 */
2428 	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2429 	    imc_allow_detach == 0) {
2430 		mutex_exit(&imc->imc_lock);
2431 		return (DDI_FAILURE);
2432 	}
2433 
2434 	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2435 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2436 		if (stub->istub_dip == dip) {
2437 			break;
2438 		}
2439 	}
2440 
2441 	/*
2442 	 * A device was attached to us that we somehow don't know about. Allow
2443 	 * this to proceed.
2444 	 */
2445 	if (stub == NULL) {
2446 		mutex_exit(&imc->imc_lock);
2447 		return (DDI_SUCCESS);
2448 	}
2449 
2450 	pci_config_teardown(&stub->istub_cfgspace);
2451 	avl_remove(&imc->imc_stubs, stub);
2452 	kmem_free(stub, sizeof (imc_stub_t));
2453 	mutex_exit(&imc->imc_lock);
2454 
2455 	return (DDI_SUCCESS);
2456 }
2457 
2458 int
2459 imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460 {
2461 	imc_stub_t *stub, *lookup;
2462 	int did, vid, *regs;
2463 	uint_t i, nregs;
2464 	const imc_stub_table_t *table;
2465 	avl_index_t idx;
2466 	boolean_t dispatch = B_FALSE;
2467 	imc_t *imc = imc_data;
2468 
2469 	if (cmd != DDI_ATTACH) {
2470 		return (DDI_FAILURE);
2471 	}
2472 
2473 	/*
2474 	 * We've been asked to attach a stub. First, determine if this is even a
2475 	 * PCI device that we should care about. Then, append it to our global
2476 	 * list and kick off the configuration task. Note that we do this
2477 	 * configuration task in a taskq so that we don't interfere with the
2478 	 * normal attach / detach path processing.
2479 	 */
2480 	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2481 		return (DDI_FAILURE);
2482 	}
2483 
2484 	/*
2485 	 * Get the device and vendor ID and see if this is something the imc
2486 	 * knows about or cares about.
2487 	 */
2488 	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2489 	    "vendor-id", PCI_EINVAL16);
2490 	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2491 	    "device-id", PCI_EINVAL16);
2492 	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2493 		return (DDI_FAILURE);
2494 	}
2495 
2496 	/*
2497 	 * Only accept INTC parts on the imc driver.
2498 	 */
2499 	if (vid != IMC_PCI_VENDOR_INTC) {
2500 		return (DDI_FAILURE);
2501 	}
2502 
2503 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2504 	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2505 		return (DDI_FAILURE);
2506 	}
2507 
2508 	if (nregs == 0) {
2509 		ddi_prop_free(regs);
2510 		return (DDI_FAILURE);
2511 	}
2512 
2513 	/*
2514 	 * Determine if this matches a known device.
2515 	 */
2516 	table = NULL;
2517 	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2518 		if (imc_stub_table[i].imcs_devid == did &&
2519 		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2520 		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2521 			table = &imc_stub_table[i];
2522 			break;
2523 		}
2524 	}
2525 
2526 	if (i == ARRAY_SIZE(imc_stub_table)) {
2527 		ddi_prop_free(regs);
2528 		return (DDI_FAILURE);
2529 	}
2530 
2531 	/*
2532 	 * We've found something. Make sure the generation matches our current
2533 	 * one. If it does, construct the entry and append it to the list.
2534 	 */
2535 	mutex_enter(&imc->imc_lock);
2536 	if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2537 	    table->imcs_gen) {
2538 		mutex_exit(&imc->imc_lock);
2539 		ddi_prop_free(regs);
2540 		dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
2541 		    "that has different hardware generation (%u) from current "
2542 		    "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen);
2543 		return (DDI_FAILURE);
2544 	} else {
2545 		imc->imc_gen = table->imcs_gen;
2546 	}
2547 	mutex_exit(&imc->imc_lock);
2548 
2549 	stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2550 	stub->istub_dip = dip;
2551 	stub->istub_vid = vid;
2552 	stub->istub_did = did;
2553 	stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2554 	stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2555 	stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2556 	ddi_prop_free(regs);
2557 	stub->istub_table = table;
2558 
2559 	if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
2560 		kmem_free(stub, sizeof (imc_stub_t));
2561 		dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2562 		    "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2563 		    vid, did);
2564 		return (DDI_FAILURE);
2565 	}
2566 
2567 	mutex_enter(&imc->imc_lock);
2568 	if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2569 		dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2570 		    "bdf %u/%u/%u with %s (%u/%u), not attaching",
2571 		    ddi_node_name(dip), vid, did,
2572 		    stub->istub_bus, stub->istub_dev, stub->istub_func,
2573 		    ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2574 		    lookup->istub_did);
2575 		mutex_exit(&imc->imc_lock);
2576 		pci_config_teardown(&stub->istub_cfgspace);
2577 		kmem_free(stub, sizeof (imc_stub_t));
2578 
2579 		return (DDI_FAILURE);
2580 	}
2581 	avl_insert(&imc->imc_stubs, stub, idx);
2582 
2583 	if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2584 	    avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2585 		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2586 		dispatch = B_TRUE;
2587 	}
2588 	mutex_exit(&imc->imc_lock);
2589 
2590 	if (dispatch) {
2591 		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2592 		    imc, DDI_SLEEP);
2593 	}
2594 
2595 	return (DDI_SUCCESS);
2596 }
2597 
2598 static int
2599 imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2600 {
2601 	imc_t *imc = imc_data;
2602 
2603 	if ((flag & (FEXCL | FNDELAY)) != 0)
2604 		return (EINVAL);
2605 
2606 	if (otyp != OTYP_CHR)
2607 		return (EINVAL);
2608 
2609 	mutex_enter(&imc->imc_lock);
2610 
2611 	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2612 		mutex_exit(&imc->imc_lock);
2613 		return (ENOTSUP);
2614 	}
2615 
2616 	/*
2617 	 * It's possible that someone has come in during the window between when
2618 	 * we've created the minor node and when we've finished doing work.
2619 	 */
2620 	if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2621 		mutex_exit(&imc->imc_lock);
2622 		return (EAGAIN);
2623 	}
2624 
2625 	/*
2626 	 * It's not clear how someone would get a minor that we didn't create.
2627 	 * But be paranoid and make sure.
2628 	 */
2629 	if (getminor(*devp) >= imc->imc_nsockets) {
2630 		mutex_exit(&imc->imc_lock);
2631 		return (EINVAL);
2632 	}
2633 
2634 	/*
2635 	 * Make sure this socket entry has been filled in.
2636 	 */
2637 	if (imc->imc_spointers[getminor(*devp)] == NULL) {
2638 		mutex_exit(&imc->imc_lock);
2639 		return (EINVAL);
2640 	}
2641 
2642 	mutex_exit(&imc->imc_lock);
2643 
2644 	return (0);
2645 }
2646 
2647 static void
2648 imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2649 {
2650 	imc_decode_state_t dec;
2651 	uint_t i;
2652 
2653 	bzero(&dec, sizeof (dec));
2654 	if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2655 		encode->mcei_err = (uint32_t)dec.ids_fail;
2656 		encode->mcei_errdata = dec.ids_fail_data;
2657 		return;
2658 	}
2659 
2660 	encode->mcei_errdata = 0;
2661 	encode->mcei_err = 0;
2662 	encode->mcei_board = 0;
2663 	for (i = 0; i < imc->imc_nsockets; i++) {
2664 		if (imc->imc_spointers[i] == dec.ids_socket)
2665 			break;
2666 	}
2667 	encode->mcei_chip = i;
2668 	encode->mcei_mc = dec.ids_tadid;
2669 	encode->mcei_chan = dec.ids_channelid;
2670 	encode->mcei_dimm = dec.ids_dimmid;
2671 	encode->mcei_rank_addr = dec.ids_rankaddr;
2672 	encode->mcei_rank = dec.ids_rankid;
2673 	encode->mcei_row = UINT32_MAX;
2674 	encode->mcei_column = UINT32_MAX;
2675 	encode->mcei_pad = 0;
2676 }
2677 
2678 static int
2679 imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2680     int *rvalp)
2681 {
2682 	int ret;
2683 	minor_t m;
2684 	mc_snapshot_info_t info;
2685 	mc_encode_ioc_t encode;
2686 	imc_t *imc = imc_data;
2687 	imc_socket_t *sock;
2688 
2689 	mutex_enter(&imc->imc_lock);
2690 	m = getminor(dev);
2691 	if (m >= imc->imc_nsockets) {
2692 		ret = EINVAL;
2693 		goto done;
2694 	}
2695 	sock = imc->imc_spointers[m];
2696 	if (sock == NULL) {
2697 		ret = EINVAL;
2698 		goto done;
2699 	}
2700 
2701 	/*
2702 	 * Note, other memory controller drivers don't check the mode when
2703 	 * reading data, nor do they care who can read it from a credential
2704 	 * perspective. As such, we don't either at this time.
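	 *
	 * For context, a consumer drives the snapshot ioctls handled below in
	 * two steps: fetch the size and generation, then fetch and unpack the
	 * XDR-encoded nvlist. A minimal userland sketch (error handling
	 * elided; the device path is hypothetical):
	 *
	 *	int fd = open("/dev/mc/mc0", O_RDONLY);
	 *	mc_snapshot_info_t info;
	 *	char *buf;
	 *	nvlist_t *nvl = NULL;
	 *
	 *	(void) ioctl(fd, MC_IOC_SNAPSHOT_INFO, &info);
	 *	buf = malloc(info.mcs_size);
	 *	(void) ioctl(fd, MC_IOC_SNAPSHOT, buf);
	 *	(void) nvlist_unpack(buf, info.mcs_size, &nvl, 0);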
2705 	 */
2706 	switch (cmd) {
2707 	case MC_IOC_SNAPSHOT_INFO:
2708 		imc_nvl_pack(sock, B_FALSE);
2709 		if (sock->isock_buf == NULL) {
2710 			ret = EIO;
2711 			break;
2712 		}
2713 
2714 		info.mcs_size = sock->isock_buflen;
2715 		info.mcs_gen = sock->isock_gen;
2716 
2717 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2718 			ret = EFAULT;
2719 			break;
2720 		}
2721 
2722 		ret = 0;
2723 		break;
2724 	case MC_IOC_SNAPSHOT:
2725 		imc_nvl_pack(sock, B_FALSE);
2726 		if (sock->isock_buf == NULL) {
2727 			ret = EIO;
2728 			break;
2729 		}
2730 
2731 		if (ddi_copyout(sock->isock_buf, (void *)arg,
2732 		    sock->isock_buflen, mode) != 0) {
2733 			ret = EFAULT;
2734 			break;
2735 		}
2736 
2737 		ret = 0;
2738 		break;
2739 	case MC_IOC_DECODE_SNAPSHOT_INFO:
2740 		imc_decoder_pack(imc);
2741 		if (imc->imc_decoder_buf == NULL) {
2742 			ret = EIO;
2743 			break;
2744 		}
2745 
2746 		info.mcs_size = imc->imc_decoder_len;
2747 		info.mcs_gen = imc->imc_spointers[0]->isock_gen;
2748 
2749 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2750 			ret = EFAULT;
2751 			break;
2752 		}
2753 
2754 		ret = 0;
2755 		break;
2756 	case MC_IOC_DECODE_SNAPSHOT:
2757 		imc_decoder_pack(imc);
2758 		if (imc->imc_decoder_buf == NULL) {
2759 			ret = EIO;
2760 			break;
2761 		}
2762 
2763 		if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
2764 		    imc->imc_decoder_len, mode) != 0) {
2765 			ret = EFAULT;
2766 			break;
2767 		}
2768 
2769 		ret = 0;
2770 		break;
2771 	case MC_IOC_DECODE_PA:
2772 		if (crgetzoneid(credp) != GLOBAL_ZONEID ||
2773 		    drv_priv(credp) != 0) {
2774 			ret = EPERM;
2775 			break;
2776 		}
2777 
2778 		if (ddi_copyin((void *)arg, &encode, sizeof (encode),
2779 		    mode & FKIOCTL) != 0) {
2780 			ret = EFAULT;
2781 			break;
2782 		}
2783 
2784 		imc_ioctl_decode(imc, &encode);
2785 		ret = 0;
2786 
2787 		if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
2788 		    mode & FKIOCTL) != 0) {
2789 			ret = EFAULT;
2790 			break;
2791 		}
2792 		break;
2793 	default:
2794 		ret = EINVAL;
2795 		goto done;
2796 	}
2797 
2798 done:
2799 	mutex_exit(&imc->imc_lock);
2800 	return (ret);
2801 }
2802 
2803 static int
2804 imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
2805 {
2806 	return (0);
2807 }
2808 
2809 static int
2810 imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2811 {
2812 	if (cmd != DDI_ATTACH) {
2813 		return (DDI_FAILURE);
2814 	}
2815 
2816 	if (imc_data == NULL || imc_data->imc_dip != NULL) {
2817 		return (DDI_FAILURE);
2818 	}
2819 
2820 	mutex_enter(&imc_data->imc_lock);
2821 	if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2822 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
2823 		mutex_exit(&imc_data->imc_lock);
2824 		return (DDI_FAILURE);
2825 	}
2826 
2827 	imc_data->imc_dip = dip;
2828 	imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2829 	mutex_exit(&imc_data->imc_lock);
2830 
2831 	(void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2832 	    DDI_SLEEP);
2833 
2834 	return (DDI_SUCCESS);
2835 }
2836 
2837 /*
2838  * We only export a single instance.
2839  */
2840 static int
2841 imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2842 {
2843 	/*
2844 	 * getinfo(9E) shouldn't be called if we're not attached. But be
2845 	 * paranoid.
2846 	 */
2847 	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2848 		return (DDI_FAILURE);
2849 	}
2850 
2851 	switch (infocmd) {
2852 	case DDI_INFO_DEVT2DEVINFO:
2853 		*resultp = imc_data->imc_dip;
2854 		break;
2855 	case DDI_INFO_DEVT2INSTANCE:
2856 		*resultp = (void *)0;
2857 		break;
2858 	default:
2859 		return (DDI_FAILURE);
2860 	}
2861 
2862 	return (DDI_SUCCESS);
2863 }
2864 
2865 static int
2866 imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2867 {
2868 	if (cmd != DDI_DETACH) {
2869 		return (DDI_FAILURE);
2870 	}
2871 
2872 	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2873 		return (DDI_FAILURE);
2874 	}
2875 
2876 	mutex_enter(&imc_data->imc_lock);
2877 
2878 	/*
2879 	 * While a scan or attach is outstanding, don't allow us to detach.
2880 	 */
2881 	if ((imc_data->imc_flags &
2882 	    (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2883 		mutex_exit(&imc_data->imc_lock);
2884 		return (DDI_FAILURE);
2885 	}
2886 
2887 	/*
2888 	 * Because the stub driver depends on the imc driver, we shouldn't be
2889 	 * able to have any entries in this list when we detach. However, we
2890 	 * check just to make sure.
2891 	 */
2892 	if (!avl_is_empty(&imc_data->imc_stubs)) {
2893 		mutex_exit(&imc_data->imc_lock);
2894 		return (DDI_FAILURE);
2895 	}
2896 
2897 	nvlist_free(imc_data->imc_decoder_dump);
2898 	imc_data->imc_decoder_dump = NULL;
2899 	if (imc_data->imc_decoder_buf != NULL) {
2900 		kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2901 		imc_data->imc_decoder_buf = NULL;
2902 		imc_data->imc_decoder_len = 0;
2903 	}
2904 
2905 	ddi_remove_minor_node(imc_data->imc_dip, NULL);
2906 	imc_data->imc_dip = NULL;
2907 	mutex_exit(&imc_data->imc_lock);
2908 
2909 	ddi_taskq_wait(imc_data->imc_taskq);
2910 	ddi_taskq_destroy(imc_data->imc_taskq);
2911 	imc_data->imc_taskq = NULL;
2912 
2913 	return (DDI_SUCCESS);
2914 }
2915 
2916 static void
2917 imc_free(void)
2918 {
2919 	if (imc_data == NULL) {
2920 		return;
2921 	}
2922 
2923 	VERIFY(avl_is_empty(&imc_data->imc_stubs));
2924 	avl_destroy(&imc_data->imc_stubs);
2925 	mutex_destroy(&imc_data->imc_lock);
2926 	kmem_free(imc_data, sizeof (imc_t));
2927 	imc_data = NULL;
2928 }
2929 
2930 static void
2931 imc_alloc(void)
2932 {
2933 	imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);
2934 
2935 	mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
2936 	avl_create(&imc_data->imc_stubs, imc_stub_comparator,
2937 	    sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
2938 }
2939 
2940 static struct cb_ops imc_cb_ops = {
2941 	.cb_open = imc_open,
2942 	.cb_close = imc_close,
2943 	.cb_strategy = nodev,
2944 	.cb_print = nodev,
2945 	.cb_dump = nodev,
2946 	.cb_read = nodev,
2947 	.cb_write = nodev,
2948 	.cb_ioctl = imc_ioctl,
2949 	.cb_devmap = nodev,
2950 	.cb_mmap = nodev,
2951 	.cb_segmap = nodev,
2952 	.cb_chpoll = nochpoll,
2953 	.cb_prop_op = ddi_prop_op,
2954 	.cb_flag = D_MP,
2955 	.cb_rev = CB_REV,
2956 	.cb_aread = nodev,
2957 	.cb_awrite = nodev
2958 };
2959 
2960 static struct dev_ops imc_dev_ops = {
2961 	.devo_rev = DEVO_REV,
2962 	.devo_refcnt = 0,
2963 	.devo_getinfo = imc_getinfo,
2964 	.devo_identify = nulldev,
2965 	.devo_probe = nulldev,
2966 	.devo_attach = imc_attach,
2967 	.devo_detach = imc_detach,
2968 	.devo_reset = nodev,
2969 	.devo_cb_ops = &imc_cb_ops,
2970 	.devo_quiesce = ddi_quiesce_not_needed
2971 };
2972 
2973 static struct modldrv imc_modldrv = {
2974 	.drv_modops = &mod_driverops,
2975 	.drv_linkinfo = "Intel Integrated Memory Controller Driver",
2976 	.drv_dev_ops = &imc_dev_ops
2977 };
2978 
2979 static struct modlinkage imc_modlinkage = {
2980 	.ml_rev = MODREV_1,
2981 	.ml_linkage = {
&imc_modldrv, NULL }
2982 };
2983 
2984 int
2985 _init(void)
2986 {
2987 	int ret;
2988 
2989 	imc_alloc();
2990 	if ((ret = mod_install(&imc_modlinkage)) != 0)
2991 		imc_free();
2992 
2993 	return (ret);
2994 }
2995 
2996 int
2997 _info(struct modinfo *modinfop)
2998 {
2999 	return (mod_info(&imc_modlinkage, modinfop));
3000 }
3001 
3002 int
3003 _fini(void)
3004 {
3005 	int ret;
3006 
3007 	if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3008 		imc_free();
3009 	}
3010 	return (ret);
3011 }
3012 