xref: /illumos-gate/usr/src/uts/intel/io/imc/imc.c (revision 0be687ea0c09cd50b4ae51df829900fea257d535)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * Generic Intel Integrated Memory Controller (IMC) Driver
18  *
19  * This driver talks to the CPU's IMC to understand the detailed topology of the
 * processor and to determine how to map physical addresses to the
21  * corresponding DIMM. This driver supports the following generations of Intel
22  * chips:
23  *
24  *  - Sandy Bridge
25  *  - Ivy Bridge
26  *  - Haswell
27  *  - Broadwell
28  *  - Skylake / Cascade Lake
29  *
30  * Memory Decoding
31  * ---------------
32  *
33  * For more detailed summaries of the memory decoding process, please refer to
34  * the Intel External Design Specifications for the corresponding processor.
35  * What follows is a rough overview of how the memory decoding system works.
36  *
37  * First, we'd like to define the following concepts:
38  *
39  * SYSTEM ADDRESS
40  *
41  *	This is a physical address that the operating system normally uses. This
42  *	address may refer to DRAM, it may refer to memory mapped PCI
43  *	configuration space or device registers, or it may refer to other parts
44  *	of the system's memory map, such as the extended advanced programmable
45  *	interrupt controller (xAPIC), etc.
46  *
47  * DIMM
48  *
49  *	Dual-inline memory module. This refers to a physical stick of volatile
50  *	memory that is inserted into a slot on the motherboard.
51  *
52  * RANK
53  *
54  *	A potential sub-division of a DIMM. A DIMM's memory capacity is divided
55  *	into a number of equal sized ranks. For example, an 8 GiB DIMM, may have
56  *	1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks.
57  *
58  * RANK ADDRESS
59  *
60  *	An address that exists in the context of a given rank on a DIMM. All
61  *	ranks have overlapping addresses, so the address 0x400 exists on all
62  *	ranks on a given DIMM.
63  *
64  * CHANNEL
65  *
66  *	Multiple DIMMs may be combined into a single channel. The channel
67  *	represents the combined memory of all the DIMMs. A given channel only
68  *	ever exists on a socket and is bound to a single memory controller.
69  *
70  * CHANNEL ADDRESS
71  *
72  *	This is an address that exists logically on a channel. Each address on a
73  *	channel maps to a corresponding DIMM that exists on that channel. The
74  *	address space on one channel is independent from that on another. This
75  *	means that address 0x1000 can exist on each memory channel in the
76  *	system.
77  *
78  * INTERLEAVE
79  *
80  *	There are several different cases where interleaving occurs on the
81  *	system. For example, addresses may be interleaved across sockets,
82  *	memory channels, or DIMM ranks. When addresses are interleaved, then
83  *	some number of bits in an address are used to select which target to go
84  *	to (usually through a look up table). The effect of interleaving is that
85  *	addresses that are next to one another may not all go to the same
86  *	device. The following image shows a non-interleaving case.
87  *
88  *	0x0fff +-----+             +-----+ 0x7ff
89  *	       |     |\___________/|     |
90  *	       |     |  __________ | (b) |
91  *	       |     | /          \|     |
92  *	0x0800 |=====|=            +-----+ 0x000       +-----+ 0x7ff
93  *	       |     | \______________________________/|     |
94  *	       |     | _______________________________ | (a) |
95  *	       |     |/                               \|     |
96  *	0x0000 +-----+                                 +-----+ 0x000
97  *
 *	In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
 *	device (a), while addresses 0x0800 to 0x0fff go to device (b).
100  *	However, each range is divided into the same number of components.
101  *
102  *	If instead, we were to look at that with interleaving, what we might say
103  *	is that rather than splitting the range in half, we might say that if
104  *	the address has bit 8 set (0x100), then it goes to (b), otherwise it
105  *	goes to (a). This means that addresses 0x000 to 0x0ff, would go to (a).
106  *	0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a)
 *	again, and then 0x300 to 0x3ff would go back to (b). This would continue
108  *	for a while. This would instead look something more like:
109  *
110  *
111  *      0x0fff +-----+       A: 0x7ff +---------+   B: 0x7ff +---------+
112  *             | (b) |                | e00-eff |            | f00-fff |
113  *      0x0f00 |-----|          0x700 +---------+      0x700 +---------+
114  *             | (a) |                | c00-cff |            | d00-dff |
115  *      0x0e00 ~~~~~~~          0x600 +---------+      0x600 +---------+
116  *               ***                  | a00-aff |            | b00-bff |
117  *      0x0400 ~~~~~~~          0x500 +---------+      0x500 +---------+
118  *             | (b) |                | 800-8ff |            | 900-9ff |
119  *      0x0300 |-----|          0x400 +---------+      0x400 +---------+
120  *             | (a) |                | 600-6ff |            | 700-7ff |
121  *      0x0200 |-----|          0x300 +---------+      0x300 +---------+
122  *             | (b) |                | 400-4ff |            | 500-5ff |
123  *      0x0100 |-----|          0x200 +---------+      0x200 +---------+
124  *             | (a) |                | 200-2ff |            | 300-3ff |
125  *      0x0000 +-----+          0x100 +---------+      0x100 +---------+
126  *                                    | 000-0ff |            | 100-1ff |
127  *                              0x000 +---------+      0x000 +---------+
128  *
129  *	In this example we've performed two-way interleaving. The number of ways
130  *	that something can interleave varies based on what we're interleaving
131  *	between.
132  *
133  * MEMORY CONTROLLER
134  *
135  *	A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
 *	memory controllers, usually one or two. Each memory controller supports a
137  *	given number of DIMMs, which are divided across multiple channels.
138  *
139  * TARGET ADDRESS DECODER
140  *
141  *	The target address decoder (TAD) is responsible for taking a system
142  *	address and transforming it into a channel address based on the rules
143  *	that are present. Each memory controller has a corresponding TAD. The
144  *	TAD is often contained in a device called a 'Home Agent'.
145  *
146  * SYSTEM ADDRESS DECODER
147  *
148  *	The system address decoder (SAD) is responsible for taking a system
149  *	address and directing it to the right place, whether this be memory or
 *	otherwise. There is a single system address decoder per socket (see
151  *	uts/i86pc/os/cpuid.c) that is shared between all the cores currently.
152  *
153  * NODE IDENTIFIER
154  *
155  *	The node identifier is used to uniquely identify an element in the
156  *	various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
157  *	definition of 'die'). One can roughly think about this as a unique
158  *	identifier for the socket itself. In general, the primary node ID for a
159  *	socket should map to the socket APIC ID.
160  *
161  * Finding Devices
162  * ---------------
163  *
164  * There is a bit of a chicken and egg problem on Intel systems and in the
165  * device driver interface. The information that we need in the system is spread
166  * out amongst a large number of different PCI devices that the processor
167  * exposes. The number of such devices can vary based on the processor
168  * generation and the specific SKU in the processor. To deal with this, we break
169  * the driver into two different components: a stub driver and the full driver.
170  *
171  * The stub driver has aliases for all known PCI devices that we might attach to
172  * in a given generation on the system. This driver is called 'imcstub'. When a
173  * stub attaches, it just registers itself with the main driver, upon which it
174  * has a module dependency.
175  *
176  * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it
177  * kicks off a scan of the device tree which takes place in a task queue. Once
178  * there, it determines the number of devices that it expects to exist by
179  * walking the tree and comparing it against the generation-specific table.
180  *
181  * If all devices are found, we'll go ahead and read through all the devices and
182  * build a map of all the information we need to understand the topology of the
183  * system and to be able to decode addresses. We do this here, because we can be
184  * asked to perform decoding in dangerous contexts (after taking an MCE, panic,
185  * etc) where we don't want to have to rely on the broader kernel functioning at
186  * this point in time.
187  *
188  * Once our topology is built, we'll create minor nodes which are used by the
189  * fault management architecture to query for information and register our
190  * decoding functionality with the kernel.
191  *
192  * PCI Numbering
193  * -------------
194  *
195  * For each device that we care about, Intel defines the device and function
196  * that we can expect to find the information and PCI configuration space
197  * registers that we care about at. However, the PCI bus is not well defined.
198  * Devices that are on the same socket use the same set of bus numbers; however,
 * some sockets have multiple bus numbers that they'll use to represent
200  * different classes. These bus numbers are programmed by systems firmware as
201  * part of powering on the system. This means, that we need the ability to
202  * map together these disparate ranges ourselves.
203  *
204  * There is a device called a utility box (UBOX), which exists per-socket and
205  * maps the different sockets together. We use this to determine which devices
206  * correspond to which sockets.
207  *
208  * Mapping Sockets
209  * ---------------
210  *
211  * Another wrinkle is that the way that the OS sees the numbering of the CPUs is
212  * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
213  * information). However, to map to the corresponding socket, we need to look at
214  * the socket's node ID. The order of PCI buses in the system is not required to
215  * have any relation to the socket ID. Therefore, we have to have yet another
216  * indirection table in the imc_t.
217  *
218  * Exposing Data
219  * -------------
220  *
221  * We expose topology data to FMA using the OS-private memory controller
222  * interfaces. By creating minor nodes of the type, 'ddi_mem_ctrl', there are a
223  * number of specific interfaces that we can then implement. The ioctl API asks
224  * us for a snapshot of data, which basically has us go through and send an
225  * nvlist_t to userland. This nvlist_t is constructed as part of the scan
226  * process. This nvlist uses the version 1 format, which more explicitly encodes
227  * the topology in a series of nested nvlists.
228  *
229  * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
230  * decoder and ask it to perform decoding.
231  *
232  * Decoding Addresses
233  * ------------------
234  *
235  * The decoding logic can be found in common/imc/imc_decode.c. This file is
236  * shared between the kernel and userland to allow for easier testing and
237  * additional flexibility in operation. The decoding process happens in a few
238  * different phases.
239  *
 * The first phase is to determine which memory controller on which socket is
241  * responsible for this data. To determine this, we use the system address
242  * decoder and walk the rules, looking for the correct target. There are various
243  * manipulations to the address that exist which are used to determine which
244  * index we use. The way that we interpret the output of the rule varies
245  * somewhat based on the generation. Sandy Bridge just has a node ID which
246  * points us to the socket with its single IMC. On Ivy Bridge through Broadwell,
247  * the memory controller to use is also encoded in part of the node ID. Finally,
248  * on Skylake, the SAD tells us which socket to look at. The socket in question
249  * then has a routing table which tells us which channel on which memory
250  * controller that is local to that socket.
251  *
252  * Once we have the target memory controller, we walk the list of target address
253  * decoder rules. These rules can help tell us which channel we care about
254  * (which is required on Sandy Bridge through Broadwell) and then describe some
255  * amount of the interleaving rules which are used to turn the system address
256  * into a channel address.
257  *
258  * Once we know the channel and the channel address, we walk the rank interleave
259  * rules which help us determine which DIMM and the corresponding rank on it
260  * that the corresponding channel address is on. It also has logic that we need
261  * to use to determine how to transform a channel address into an address on
262  * that specific rank. Once we have that, then the initial decoding is done.
263  *
264  * The logic in imc_decode.c is abstracted away from the broader kernel CMI
265  * logic.  This is on purpose and allows us not only an easier time unit testing
266  * the logic, but also allows us to express more high fidelity errors that are
267  * translated into a much smaller subset. This logic is exercised in the
268  * 'imc_test' program which is built in 'test/os-tests/tests/imc'.
269  *
270  * Limitations
271  * -----------
272  *
273  * Currently, this driver has the following limitations:
274  *
275  *  o It doesn't decode the row and column addresses.
276  *  o It doesn't encode from a DIMM address to a system address.
277  *  o It doesn't properly support lockstep and mirroring modes on Sandy Bridge -
278  *    Broadwell platforms.
279  *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
280  *    platforms.
281  *  o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs.
282  *  o It doesn't know how to decode three way channel interleaving.
283  *
284  * None of these are intrinsic problems to the driver, it's mostly a matter of
285  * having proper documentation and testing.
286  */
287 
288 #include <sys/modctl.h>
289 #include <sys/conf.h>
290 #include <sys/devops.h>
291 #include <sys/ddi.h>
292 #include <sys/sunddi.h>
293 #include <sys/types.h>
294 #include <sys/file.h>
295 #include <sys/errno.h>
296 #include <sys/open.h>
297 #include <sys/cred.h>
298 #include <sys/pci.h>
299 #include <sys/sysmacros.h>
300 #include <sys/avl.h>
301 #include <sys/stat.h>
302 #include <sys/policy.h>
303 
304 #include <sys/cpu_module.h>
305 #include <sys/mc.h>
306 #include <sys/mc_intel.h>
307 
308 #include "imc.h"
309 
310 /*
311  * These tables contain generational data that varies between processor
312  * generation such as the maximum number of sockets, memory controllers, and the
313  * offsets of the various registers.
314  */
315 
316 static const imc_gen_data_t imc_gen_data_snb = {
317 	.igd_max_sockets = 4,
318 	.igd_max_imcs = 2,
319 	.igd_max_channels = 4,
320 	.igd_max_dimms = 3,
321 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
322 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
323 	    IMC_REG_MC_MTR2 },
324 	.igd_mcmtr_offset = 0x7c,
325 	.igd_tolm_offset = 0x80,
326 	.igd_tohm_low_offset = 0x84,
327 	.igd_sad_dram_offset = 0x80,
328 	.igd_sad_ndram_rules = 10,
329 	.igd_sad_nodeid_offset = 0x40,
330 	.igd_tad_nrules = 12,
331 	.igd_tad_rule_offset = 0x40,
332 	.igd_tad_chan_offset = 0x90,
333 	.igd_tad_sysdef = 0x80,
334 	.igd_tad_sysdef2 = 0x84,
335 	.igd_mc_mirror = 0xac,
336 	.igd_rir_nways = 5,
337 	.igd_rir_way_offset = 0x108,
338 	.igd_rir_nileaves = 8,
339 	.igd_rir_ileave_offset = 0x120,
340 	.igd_ubox_cpubusno_offset = 0xd0,
341 };
342 
343 static const imc_gen_data_t imc_gen_data_ivb = {
344 	.igd_max_sockets = 4,
345 	.igd_max_imcs = 2,
346 	.igd_max_channels = 4,
347 	.igd_max_dimms = 3,
348 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
349 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
350 	    IMC_REG_MC_MTR2 },
351 	.igd_mcmtr_offset = 0x7c,
352 	.igd_tolm_offset = 0x80,
353 	.igd_tohm_low_offset = 0x84,
354 	.igd_sad_dram_offset = 0x60,
355 	.igd_sad_ndram_rules = 20,
356 	.igd_sad_nodeid_offset = 0x40,
357 	.igd_tad_nrules = 12,
358 	.igd_tad_rule_offset = 0x40,
359 	.igd_tad_chan_offset = 0x90,
360 	.igd_tad_sysdef = 0x80,
361 	.igd_tad_sysdef2 = 0x84,
362 	.igd_mc_mirror = 0xac,
363 	.igd_rir_nways = 5,
364 	.igd_rir_way_offset = 0x108,
365 	.igd_rir_nileaves = 8,
366 	.igd_rir_ileave_offset = 0x120,
367 	.igd_ubox_cpubusno_offset = 0xd0,
368 };
369 
370 static const imc_gen_data_t imc_gen_data_has_brd = {
371 	.igd_max_sockets = 4,
372 	.igd_max_imcs = 2,
373 	.igd_max_channels = 4,
374 	.igd_max_dimms = 3,
375 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
376 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
377 	    IMC_REG_MC_MTR2 },
378 	.igd_mcmtr_offset = 0x7c,
379 	.igd_tolm_offset = 0xd0,
380 	.igd_tohm_low_offset = 0xd4,
381 	.igd_tohm_hi_offset = 0xd8,
382 	.igd_sad_dram_offset = 0x60,
383 	.igd_sad_ndram_rules = 20,
384 	.igd_sad_nodeid_offset = 0x40,
385 	.igd_tad_nrules = 12,
386 	.igd_tad_rule_offset = 0x40,
387 	.igd_tad_chan_offset = 0x90,
388 	.igd_tad_sysdef = 0x80,
389 	.igd_tad_sysdef2 = 0x84,
390 	.igd_mc_mirror = 0xac,
391 	.igd_rir_nways = 5,
392 	.igd_rir_way_offset = 0x108,
393 	.igd_rir_nileaves = 8,
394 	.igd_rir_ileave_offset = 0x120,
395 	.igd_ubox_cpubusno_offset = 0xd0,
396 };
397 
398 static const imc_gen_data_t imc_gen_data_skx = {
399 	.igd_max_sockets = 8,
400 	.igd_max_imcs = 2,
401 	.igd_max_channels = 3,
402 	.igd_max_dimms = 2,
403 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
404 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
405 	.igd_mcmtr_offset = 0x87c,
406 	.igd_topo_offset = 0x88,
407 	.igd_tolm_offset = 0xd0,
408 	.igd_tohm_low_offset = 0xd4,
409 	.igd_tohm_hi_offset = 0xd8,
410 	.igd_sad_dram_offset = 0x60,
411 	.igd_sad_ndram_rules = 24,
412 	.igd_sad_nodeid_offset = 0xc0,
413 	.igd_tad_nrules = 8,
414 	.igd_tad_rule_offset = 0x850,
415 	.igd_tad_chan_offset = 0x90,
416 	.igd_rir_nways = 4,
417 	.igd_rir_way_offset = 0x108,
418 	.igd_rir_nileaves = 4,
419 	.igd_rir_ileave_offset = 0x120,
420 	.igd_ubox_cpubusno_offset = 0xcc,
421 };
422 
423 /*
424  * This table contains all of the devices that we're looking for from a stub
425  * perspective. These are organized by generation. Different generations behave
426  * in slightly different ways. For example, Sandy Bridge through Broadwell use
427  * unique PCI IDs for each PCI device/function combination that appears. Whereas
428  * Skylake based systems use the same PCI ID; however, different device/function
429  * values indicate that the IDs are used for different purposes.
430  */
431 /* BEGIN CSTYLED */
432 static const imc_stub_table_t imc_stub_table[] = {
433 	/* Sandy Bridge */
434 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
435 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" },
436 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
437 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
438 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
439 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
440 	{ IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
441 	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
442 	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
443 	{ IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
444 	{ IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
445 	{ IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
446 	/* Ivy Bridge */
447 	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
448 	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
449 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
450 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
451 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
452 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
453 	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
454 	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
455 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" },
456 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" },
457 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" },
458 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" },
459 	{ IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
460 	{ IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
461 	{ IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
462 	{ IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
463 	{ IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
464 	{ IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
465 	{ IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
466 	/* Haswell */
467 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
468 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
469 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
470 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
471 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
472 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
473 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
474 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
475 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
476 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
477 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
478 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
479 	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
480 	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
481 	{ IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Vritualization" },
482 	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
483 	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
484 	{ IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
485 	{ IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
486 	/* Broadwell Devices */
487 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
488 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
489 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
490 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
491 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
492 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
493 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
494 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
495 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
496 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
497 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
498 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
499 	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
500 	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
501 	{ IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Vritualization" },
502 	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
503 	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
504 	{ IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
505 	{ IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
506 	/* Skylake and Cascade Lake Devices */
507 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
508 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" },
509 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
510 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" },
511 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
512 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
513 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
514 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
515 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
516 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
517 	{ IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },
518 
519 	/*
520 	 * There is one SAD MC Route type device per core! Because of this a
521 	 * wide array of device and functions are allocated. For now, we list
522 	 * all 28 of them out.
523 	 */
524 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
525 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
526 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
527 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
528 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
529 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
530 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
531 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
532 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
533 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
534 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
535 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
536 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
537 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
538 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
539 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
540 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
541 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
542 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
543 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
544 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
545 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
546 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
547 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
548 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
549 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
550 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
551 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
552 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
553 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
554 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
555 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },
556 
557 	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
558 	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
559 };
560 /* END CSTYLED */
561 
562 #define	IMC_PCI_VENDOR_INTC	0x8086
563 
564 /*
565  * Our IMC data is global and statically set up during a combination of
566  * _init(9E) and attach(9E). While we have a module dependency between the PCI
567  * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
568  * guarantee that the imc driver has finished attaching. As such we make sure
569  * that it can operate without it being attached in any way.
570  */
571 static imc_t *imc_data = NULL;
572 
573 /*
574  * By default we should not allow the stubs to detach as we don't have a good
575  * way of forcing them to attach again. This is provided in case someone does
576  * want to allow the driver to unload.
577  */
578 int imc_allow_detach = 0;
579 
580 static void
581 imc_set_gen_data(imc_t *imc)
582 {
583 	switch (imc->imc_gen) {
584 	case IMC_GEN_SANDY:
585 		imc->imc_gen_data = &imc_gen_data_snb;
586 		break;
587 	case IMC_GEN_IVY:
588 		imc->imc_gen_data = &imc_gen_data_ivb;
589 		break;
590 	case IMC_GEN_HASWELL:
591 	case IMC_GEN_BROADWELL:
592 		imc->imc_gen_data = &imc_gen_data_has_brd;
593 		break;
594 	case IMC_GEN_SKYLAKE:
595 		imc->imc_gen_data = &imc_gen_data_skx;
596 		break;
597 	default:
598 		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
599 		    "set to unknown generation: %u", imc->imc_gen);
600 	}
601 }
602 
603 /*
604  * If our device (dev_info_t) does not have a non-zero unit address, then
605  * devfsadmd will not pay attention to us at all. Therefore we need to set the
606  * unit address below, before we create minor nodes.
607  *
608  * The rest of the system expects us to have one minor node per socket. The
609  * minor node ID should be the ID of the socket.
610  */
611 static boolean_t
612 imc_create_minors(imc_t *imc)
613 {
614 	uint_t i;
615 
616 	ddi_set_name_addr(imc->imc_dip, "1");
617 	for (i = 0; i < imc->imc_nsockets; i++) {
618 		char buf[MAXNAMELEN];
619 
620 		if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
621 		    sizeof (buf)) {
622 			goto fail;
623 		}
624 
625 		if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
626 		    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
627 			dev_err(imc->imc_dip, CE_WARN, "failed to create "
628 			    "minor node %u: %s", i, buf);
629 			goto fail;
630 		}
631 	}
632 	return (B_TRUE);
633 
634 fail:
635 	ddi_remove_minor_node(imc->imc_dip, NULL);
636 	return (B_FALSE);
637 }
638 
639 /*
640  * Check the current MC route value for this SAD. On Skylake systems there is
641  * one per core. Every core should agree. If not, we will not trust the SAD
642  * MCROUTE values and this will cause system address decoding to fail on
643  * skylake.
644  */
645 static void
646 imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
647 {
648 	uint32_t val;
649 
650 	val = pci_config_get32(stub->istub_cfgspace,
651 	    IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
652 	if (val == PCI_EINVAL32) {
653 		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
654 		return;
655 	}
656 
657 	if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
658 		sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
659 		sad->isad_mcroute.ismc_raw_mcroute = val;
660 		return;
661 	}
662 
663 	/*
664 	 * Occasionally we see MC ROUTE table entries with a value of zero.
665 	 * We should ignore those for now.
666 	 */
667 	if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
668 		dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
669 		    "with socket. SAD has val 0x%x, system has %x\n",
670 		    val, sad->isad_mcroute.ismc_raw_mcroute);
671 		sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
672 	}
673 }
674 
675 /*
676  * On Skylake, many of the devices that we care about are on separate PCI Buses.
677  * These can be mapped together by the DECS register. However, we need to know
678  * how to map different buses together so that we can more usefully associate
679  * information. The set of buses is all present in the DECS register. We'll
680  * effectively assign sockets to buses. This is also still something that comes
681  * up on pre-Skylake systems as well.
682  */
683 static boolean_t
684 imc_map_buses(imc_t *imc)
685 {
686 	imc_stub_t *stub;
687 	uint_t nsock;
688 
689 	/*
690 	 * Find the UBOX_DECS registers so we can establish socket mappings. On
691 	 * Skylake, there are three different sets of buses that we need to
692 	 * cover all of our devices, while there are only two before that.
693 	 */
694 	for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
695 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
696 		uint32_t busno;
697 
698 		if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
699 			continue;
700 		}
701 
702 		busno = pci_config_get32(stub->istub_cfgspace,
703 		    imc->imc_gen_data->igd_ubox_cpubusno_offset);
704 		if (busno == PCI_EINVAL32) {
705 			dev_err(imc->imc_dip, CE_WARN, "failed to read "
706 			    "UBOX_DECS CPUBUSNO0: invalid PCI read");
707 			return (B_FALSE);
708 		}
709 
710 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
711 			imc->imc_sockets[nsock].isock_nbus = 3;
712 			imc->imc_sockets[nsock].isock_bus[0] =
713 			    IMC_UBOX_CPUBUSNO_0(busno);
714 			imc->imc_sockets[nsock].isock_bus[1] =
715 			    IMC_UBOX_CPUBUSNO_1(busno);
716 			imc->imc_sockets[nsock].isock_bus[2] =
717 			    IMC_UBOX_CPUBUSNO_2(busno);
718 		} else {
719 			imc->imc_sockets[nsock].isock_bus[0] =
720 			    IMC_UBOX_CPUBUSNO_0(busno);
721 			imc->imc_sockets[nsock].isock_bus[1] =
722 			    IMC_UBOX_CPUBUSNO_1(busno);
723 			imc->imc_sockets[nsock].isock_nbus = 2;
724 		}
725 		nsock++;
726 	}
727 	imc->imc_nsockets = nsock;
728 
729 	return (B_TRUE);
730 }
731 
732 /*
733  * For a given stub that we've found, map it to its corresponding socket based
734  * on the PCI bus that it has.
735  */
736 static imc_socket_t *
737 imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
738 {
739 	uint_t i;
740 
741 	for (i = 0; i < imc->imc_nsockets; i++) {
742 		uint_t bus;
743 
744 		for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
745 			if (imc->imc_sockets[i].isock_bus[bus] ==
746 			    stub->istub_bus) {
747 				return (&imc->imc_sockets[i]);
748 			}
749 		}
750 	}
751 
752 	return (NULL);
753 }
754 
755 static boolean_t
756 imc_map_stubs(imc_t *imc)
757 {
758 	imc_stub_t *stub;
759 
760 	if (!imc_map_buses(imc)) {
761 		return (B_FALSE);
762 	}
763 
764 	stub = avl_first(&imc->imc_stubs);
765 	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
766 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
767 		imc_socket_t *sock = imc_map_find_socket(imc, stub);
768 
769 		if (sock == NULL) {
770 			dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
771 			    "PCI%x,%x with bdf %u/%u/%u that does not match a "
772 			    "known PCI bus for any of %u sockets",
773 			    stub->istub_table->imcs_type, stub->istub_vid,
774 			    stub->istub_did, stub->istub_bus, stub->istub_dev,
775 			    stub->istub_func, imc->imc_nsockets);
776 			continue;
777 		}
778 
779 		/*
780 		 * We don't have to worry about duplicates here. We check to
781 		 * make sure that we have unique bdfs here.
782 		 */
783 		switch (stub->istub_table->imcs_type) {
784 		case IMC_TYPE_MC0_M2M:
785 			sock->isock_imcs[0].icn_m2m = stub;
786 			break;
787 		case IMC_TYPE_MC1_M2M:
788 			sock->isock_imcs[1].icn_m2m = stub;
789 			break;
790 		case IMC_TYPE_MC0_MAIN0:
791 			sock->isock_nimc++;
792 			sock->isock_imcs[0].icn_main0 = stub;
793 
794 			/*
795 			 * On Skylake, the MAIN0 does double duty as channel
796 			 * zero and as the TAD.
797 			 */
798 			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
799 				sock->isock_imcs[0].icn_nchannels++;
800 				sock->isock_imcs[0].icn_channels[0].ich_desc =
801 				    stub;
802 				sock->isock_tad[0].itad_stub = stub;
803 				sock->isock_ntad++;
804 			}
805 			break;
806 		case IMC_TYPE_MC0_MAIN1:
807 			sock->isock_imcs[0].icn_main1 = stub;
808 			break;
809 		case IMC_TYPE_MC1_MAIN0:
810 			sock->isock_nimc++;
811 			sock->isock_imcs[1].icn_main0 = stub;
812 
813 			/*
814 			 * On Skylake, the MAIN0 does double duty as channel
815 			 * zero and as the TAD.
816 			 */
817 			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
818 				sock->isock_imcs[1].icn_nchannels++;
819 				sock->isock_imcs[1].icn_channels[0].ich_desc =
820 				    stub;
821 				sock->isock_tad[1].itad_stub = stub;
822 				sock->isock_ntad++;
823 			}
824 			break;
825 		case IMC_TYPE_MC1_MAIN1:
826 			sock->isock_imcs[1].icn_main1 = stub;
827 			break;
828 		case IMC_TYPE_MC0_CHANNEL0:
829 			sock->isock_imcs[0].icn_nchannels++;
830 			sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
831 			break;
832 		case IMC_TYPE_MC0_CHANNEL1:
833 			sock->isock_imcs[0].icn_nchannels++;
834 			sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
835 			break;
836 		case IMC_TYPE_MC0_CHANNEL2:
837 			sock->isock_imcs[0].icn_nchannels++;
838 			sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
839 			break;
840 		case IMC_TYPE_MC0_CHANNEL3:
841 			sock->isock_imcs[0].icn_nchannels++;
842 			sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
843 			break;
844 		case IMC_TYPE_MC1_CHANNEL0:
845 			sock->isock_imcs[1].icn_nchannels++;
846 			sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
847 			break;
848 		case IMC_TYPE_MC1_CHANNEL1:
849 			sock->isock_imcs[1].icn_nchannels++;
850 			sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
851 			break;
852 		case IMC_TYPE_MC1_CHANNEL2:
853 			sock->isock_imcs[1].icn_nchannels++;
854 			sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
855 			break;
856 		case IMC_TYPE_MC1_CHANNEL3:
857 			sock->isock_imcs[1].icn_nchannels++;
858 			sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
859 			break;
860 		case IMC_TYPE_SAD_DRAM:
861 			sock->isock_sad.isad_dram = stub;
862 			break;
863 		case IMC_TYPE_SAD_MMIO:
864 			sock->isock_sad.isad_mmio = stub;
865 			break;
866 		case IMC_TYPE_SAD_MISC:
867 			sock->isock_sad.isad_tolh = stub;
868 			break;
869 		case IMC_TYPE_VTD_MISC:
870 			/*
871 			 * Some systems have multiple VT-D Misc. entry points
872 			 * in the system. In this case, only use the first one
873 			 * we find.
874 			 */
875 			if (imc->imc_gvtd_misc == NULL) {
876 				imc->imc_gvtd_misc = stub;
877 			}
878 			break;
879 		case IMC_TYPE_SAD_MCROUTE:
880 			ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
881 			imc_mcroute_check(imc, &sock->isock_sad, stub);
882 			break;
883 		case IMC_TYPE_UBOX:
884 			sock->isock_ubox = stub;
885 			break;
886 		case IMC_TYPE_HA0:
887 			sock->isock_ntad++;
888 			sock->isock_tad[0].itad_stub = stub;
889 			break;
890 		case IMC_TYPE_HA1:
891 			sock->isock_ntad++;
892 			sock->isock_tad[1].itad_stub = stub;
893 			break;
894 		case IMC_TYPE_UBOX_CPUBUSNO:
895 			sock->isock_cpubusno = stub;
896 			break;
897 		default:
898 			/*
899 			 * Attempt to still attach if we can.
900 			 */
901 			dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
902 			    "IMC type (%u) on PCI %x,%x",
903 			    stub->istub_table->imcs_type,
904 			    stub->istub_vid, stub->istub_did);
905 			break;
906 		}
907 	}
908 
909 	return (B_TRUE);
910 }
911 
912 /*
913  * Go through and fix up various aspects of the stubs mappings on systems. The
914  * following are a list of what we need to fix up:
915  *
916  *  1. On Haswell and newer systems, there is only one global VT-d device. We
917  *     need to go back and map that to all of the per-socket imc_sad_t entries.
918  */
919 static void
920 imc_fixup_stubs(imc_t *imc)
921 {
922 	if (imc->imc_gen >= IMC_GEN_HASWELL) {
923 		uint_t i;
924 
925 		for (i = 0; i < imc->imc_nsockets; i++) {
926 			ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
927 			    ==, NULL);
928 			imc->imc_sockets[i].isock_sad.isad_tolh =
929 			    imc->imc_gvtd_misc;
930 		}
931 	}
932 }
933 
934 /*
935  * In the wild we've hit a few odd cases where not all devices are exposed that
936  * we might expect by firmware. In particular we've seen and validate the
937  * following cases:
938  *
939  *  o We don't find all of the channel devices that we expect, e.g. we have the
940  *    stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW
941  *    with an E5-2630v3.
942  */
943 static boolean_t
944 imc_validate_stubs(imc_t *imc)
945 {
946 	for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
947 		imc_socket_t *socket = &imc->imc_sockets[sock];
948 
949 		for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
950 			imc_mc_t *mcp = &socket->isock_imcs[mc];
951 
952 			for (uint_t chan = 0; chan < mcp->icn_nchannels;
953 			    chan++) {
954 				if (mcp->icn_channels[chan].ich_desc == NULL) {
955 					dev_err(imc->imc_dip, CE_WARN,
956 					    "!missing device for socket %u/"
957 					    "imc %u/channel %u", sock, mc,
958 					    chan);
959 					return (B_FALSE);
960 				}
961 			}
962 		}
963 	}
964 
965 	return (B_TRUE);
966 }
967 
968 /*
969  * Attempt to map all of the discovered sockets to the corresponding APIC based
970  * socket. We do these mappings by getting the node id of the socket and
971  * adjusting it to make sure that no home agent is present in it. We use the
972  * UBOX to avoid any home agent related bits that are present in other
973  * registers.
974  */
975 static void
976 imc_map_sockets(imc_t *imc)
977 {
978 	uint_t i;
979 
980 	for (i = 0; i < imc->imc_nsockets; i++) {
981 		uint32_t nodeid;
982 		ddi_acc_handle_t h;
983 
984 		h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
985 		nodeid = pci_config_get32(h,
986 		    imc->imc_gen_data->igd_sad_nodeid_offset);
987 		if (nodeid == PCI_EINVAL32) {
988 			imc->imc_sockets[i].isock_valid |=
989 			    IMC_SOCKET_V_BAD_NODEID;
990 			continue;
991 		}
992 
993 		imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid);
994 		imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
995 	}
996 }
997 
998 /*
999  * Decode the MTR, accounting for variances between processor generations.
1000  */
1001 static void
1002 imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
1003 {
1004 	uint8_t disable;
1005 
1006 	/*
1007 	 * Check present first, before worrying about anything else.
1008 	 */
1009 	if (imc->imc_gen < IMC_GEN_SKYLAKE &&
1010 	    IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
1011 		dimm->idimm_present = B_FALSE;
1012 		return;
1013 	} else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
1014 	    IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
1015 		dimm->idimm_present = B_FALSE;
1016 		return;
1017 	}
1018 
1019 	dimm->idimm_present = B_TRUE;
1020 	dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
1021 	if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
1022 	    dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
1023 		dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
1024 	}
1025 
1026 	dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
1027 	if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
1028 	    dimm->idimm_nrows > IMC_MTR_RA_MAX) {
1029 		dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
1030 	}
1031 
1032 	/*
1033 	 * Determine Density, this information is not present on Sandy Bridge.
1034 	 */
1035 	switch (imc->imc_gen) {
1036 	case IMC_GEN_IVY:
1037 		dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
1038 		break;
1039 	case IMC_GEN_HASWELL:
1040 	case IMC_GEN_BROADWELL:
1041 		switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
1042 		case 0:
1043 		default:
1044 			dimm->idimm_density = 0;
1045 			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1046 			break;
1047 		case 1:
1048 			dimm->idimm_density = 2;
1049 			break;
1050 		case 2:
1051 			dimm->idimm_density = 4;
1052 			break;
1053 		case 3:
1054 			dimm->idimm_density = 8;
1055 			break;
1056 		}
1057 		break;
1058 	case IMC_GEN_SKYLAKE:
1059 		switch (IMC_MTR_DENSITY_SKX(mtr)) {
1060 		case 0:
1061 		default:
1062 			dimm->idimm_density = 0;
1063 			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1064 			break;
1065 		case 1:
1066 			dimm->idimm_density = 2;
1067 			break;
1068 		case 2:
1069 			dimm->idimm_density = 4;
1070 			break;
1071 		case 3:
1072 			dimm->idimm_density = 8;
1073 			break;
1074 		case 4:
1075 			dimm->idimm_density = 16;
1076 			break;
1077 		case 5:
1078 			dimm->idimm_density = 12;
1079 			break;
1080 		}
1081 		break;
1082 	case IMC_GEN_UNKNOWN:
1083 	case IMC_GEN_SANDY:
1084 		dimm->idimm_density = 0;
1085 		break;
1086 	}
1087 
1088 	/*
1089 	 * The values of width are the same on IVY->SKX, but the bits are
1090 	 * different. This doesn't exist on SNB.
1091 	 */
1092 	if (imc->imc_gen > IMC_GEN_SANDY) {
1093 		uint8_t width;
1094 
1095 		if (imc->imc_gen >= IMC_GEN_BROADWELL) {
1096 			width = IMC_MTR_WIDTH_BRD_SKX(mtr);
1097 		} else {
1098 			width = IMC_MTR_WIDTH_IVB_HAS(mtr);
1099 		}
1100 		switch (width) {
1101 		case 0:
1102 			dimm->idimm_width = 4;
1103 			break;
1104 		case 1:
1105 			dimm->idimm_width = 8;
1106 			break;
1107 		case 2:
1108 			dimm->idimm_width = 16;
1109 			break;
1110 		default:
1111 			dimm->idimm_width = 0;
1112 			dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
1113 			break;
1114 		}
1115 	} else {
1116 		dimm->idimm_width = 0;
1117 	}
1118 
1119 	dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
1120 	switch (imc->imc_gen) {
1121 	case IMC_GEN_HASWELL:
1122 	case IMC_GEN_BROADWELL:
1123 	case IMC_GEN_SKYLAKE:
1124 		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
1125 			dimm->idimm_nranks = 0;
1126 			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1127 		}
1128 		break;
1129 	default:
1130 		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
1131 			dimm->idimm_nranks = 0;
1132 			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1133 		}
1134 	}
1135 
1136 	disable = IMC_MTR_RANK_DISABLE(mtr);
1137 	dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
1138 	dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
1139 	dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
1140 	dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;
1141 
1142 	/*
1143 	 * Only Haswell and later have this information.
1144 	 */
1145 	if (imc->imc_gen >= IMC_GEN_HASWELL) {
1146 		dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
1147 		dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
1148 		dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
1149 		if (dimm->idimm_3dsranks != 0) {
1150 			dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
1151 		}
1152 	}
1153 
1154 
1155 	if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
1156 		dimm->idimm_nbanks = 16;
1157 	} else {
1158 		dimm->idimm_nbanks = 8;
1159 	}
1160 
1161 	/*
1162 	 * To calculate the DIMM size we need first take the number of rows and
1163 	 * columns. This gives us the number of slots per chip. In a given rank
1164 	 * there are nbanks of these. There are nrank entries of those. Each of
1165 	 * these slots can fit a byte.
1166 	 */
1167 	dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
1168 	    (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
1169 }
1170 
1171 static void
1172 imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
1173 {
1174 	uint_t i;
1175 
1176 	/*
1177 	 * There's one register for each DIMM that might be present, we always
1178 	 * read that information to determine information about the DIMMs.
1179 	 */
1180 	chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
1181 	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1182 		uint32_t mtr;
1183 		imc_dimm_t *dimm = &chan->ich_dimms[i];
1184 
1185 		bzero(dimm, sizeof (imc_dimm_t));
1186 		mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
1187 		    imc->imc_gen_data->igd_mtr_offsets[i]);
1188 		dimm->idimm_mtr = mtr;
1189 		/*
1190 		 * We don't really expect to get a bad PCIe read. However, if we
1191 		 * do, treat that for the moment as though the DIMM is bad.
1192 		 */
1193 		if (mtr == PCI_EINVAL32) {
1194 			dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
1195 			continue;
1196 		}
1197 
1198 		imc_decode_mtr(imc, icn, dimm, mtr);
1199 	}
1200 }
1201 
1202 static boolean_t
1203 imc_fill_controller(imc_t *imc, imc_mc_t *icn)
1204 {
1205 	uint32_t mcmtr;
1206 
1207 	mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
1208 	    imc->imc_gen_data->igd_mcmtr_offset);
1209 	if (mcmtr == PCI_EINVAL32) {
1210 		icn->icn_invalid = B_TRUE;
1211 		return (B_FALSE);
1212 	}
1213 
1214 	icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
1215 	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1216 		icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
1217 	} else {
1218 		icn->icn_lockstep = B_FALSE;
1219 	}
1220 
1221 	icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;
1222 
1223 	/*
1224 	 * SNB and IVB only support DDR3. Haswell and Broadwell may support
1225 	 * DDR4, depends on the SKU. Skylake only supports DDR4.
1226 	 */
1227 	switch (imc->imc_gen) {
1228 	case IMC_GEN_SANDY:
1229 	case IMC_GEN_IVY:
1230 		icn->icn_dimm_type = IMC_DIMM_DDR3;
1231 		break;
1232 	case IMC_GEN_HASWELL:
1233 	case IMC_GEN_BROADWELL:
1234 		if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
1235 			icn->icn_dimm_type = IMC_DIMM_DDR4;
1236 		} else {
1237 			icn->icn_dimm_type = IMC_DIMM_DDR3;
1238 		}
1239 		break;
1240 	default:
1241 		/*
1242 		 * Skylake and on are all DDR4.
1243 		 */
1244 		icn->icn_dimm_type = IMC_DIMM_DDR4;
1245 		break;
1246 	}
1247 
1248 	if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
1249 		icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
1250 		    imc->imc_gen_data->igd_topo_offset);
1251 	}
1252 
1253 	return (B_TRUE);
1254 }
1255 
1256 /*
1257  * Walk the IMC data and fill in the information on DIMMs and the memory
1258  * controller configurations.
1259  */
1260 static void
1261 imc_fill_data(imc_t *imc)
1262 {
1263 	uint_t csock, cmc, cchan;
1264 
1265 	for (csock = 0; csock < imc->imc_nsockets; csock++) {
1266 		imc_socket_t *sock = &imc->imc_sockets[csock];
1267 
1268 		for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
1269 			imc_mc_t *icn = &sock->isock_imcs[cmc];
1270 
1271 			if (!imc_fill_controller(imc, icn))
1272 				continue;
1273 
1274 			for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
1275 				imc_fill_dimms(imc, icn,
1276 				    &icn->icn_channels[cchan]);
1277 			}
1278 		}
1279 	}
1280 }
1281 
1282 static nvlist_t *
1283 imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
1284 {
1285 	nvlist_t *nvl;
1286 
1287 	nvl = fnvlist_alloc();
1288 	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
1289 	    dimm->idimm_present);
1290 	if (!dimm->idimm_present) {
1291 		return (nvl);
1292 	}
1293 
1294 	fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
1295 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
1296 	    dimm->idimm_ncolumns);
1297 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
1298 	    dimm->idimm_nrows);
1299 
1300 	if (imc->imc_gen > IMC_GEN_SANDY) {
1301 		fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
1302 		    dimm->idimm_density * (1ULL << 30));
1303 		fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
1304 		    dimm->idimm_width);
1305 	}
1306 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
1307 	    dimm->idimm_nranks);
1308 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
1309 	    dimm->idimm_nbanks);
1310 	fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
1311 	    dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);
1312 
1313 	if (imc->imc_gen >= IMC_GEN_HASWELL) {
1314 		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
1315 		    dimm->idimm_hdrl);
1316 		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
1317 		    dimm->idimm_hdrl_parity);
1318 		if (dimm->idimm_3dsranks > 0) {
1319 			fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
1320 			    dimm->idimm_3dsranks);
1321 		}
1322 	}
1323 
1324 	return (nvl);
1325 }
1326 
1327 static nvlist_t *
1328 imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
1329 {
1330 	nvlist_t *nvl;
1331 	nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
1332 	uint_t i;
1333 
1334 	nvl = fnvlist_alloc();
1335 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
1336 	    imc->imc_gen_data->igd_max_dimms);
1337 	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1338 		dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
1339 	}
1340 
1341 	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
1342 	    dimms, i);
1343 
1344 	for (; i > 0; i--) {
1345 		nvlist_free(dimms[i-1]);
1346 	}
1347 
1348 	return (nvl);
1349 }
1350 
1351 static nvlist_t *
1352 imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
1353 {
1354 	nvlist_t *nvl;
1355 	nvlist_t *channels[IMC_MAX_CHANPERMC];
1356 	uint_t i;
1357 
1358 	nvl = fnvlist_alloc();
1359 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
1360 	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
1361 	    icn->icn_ecc);
1362 	if (icn->icn_lockstep) {
1363 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1364 		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
1365 	} else {
1366 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1367 		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
1368 
1369 	}
1370 
1371 	if (icn->icn_closed) {
1372 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1373 		    MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
1374 	} else {
1375 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1376 		    MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
1377 	}
1378 
1379 	for (i = 0; i < icn->icn_nchannels; i++) {
1380 		channels[i] = imc_nvl_create_channel(imc,
1381 		    &icn->icn_channels[i]);
1382 	}
1383 	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
1384 	    channels, icn->icn_nchannels);
1385 	for (i = 0; i < icn->icn_nchannels; i++) {
1386 		nvlist_free(channels[i]);
1387 	}
1388 
1389 	return (nvl);
1390 }
1391 
1392 static void
1393 imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
1394 {
1395 	char *buf = NULL;
1396 	size_t len = 0;
1397 	int kmflag;
1398 
1399 	if (sock->isock_nvl == NULL)
1400 		return;
1401 
1402 	if (sock->isock_buf != NULL)
1403 		return;
1404 
1405 	if (sleep) {
1406 		kmflag = KM_SLEEP;
1407 	} else {
1408 		kmflag = KM_NOSLEEP | KM_NORMALPRI;
1409 	}
1410 
1411 	if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
1412 	    kmflag) != 0) {
1413 		return;
1414 	}
1415 
1416 	sock->isock_buf = buf;
1417 	sock->isock_buflen = len;
1418 	sock->isock_gen++;
1419 }
1420 
1421 static void
1422 imc_decoder_pack(imc_t *imc)
1423 {
1424 	char *buf = NULL;
1425 	size_t len = 0;
1426 
1427 	if (imc->imc_decoder_buf != NULL)
1428 		return;
1429 
1430 	if (imc->imc_decoder_dump == NULL) {
1431 		imc->imc_decoder_dump = imc_dump_decoder(imc);
1432 	}
1433 
1434 	if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
1435 	    KM_NOSLEEP | KM_NORMALPRI) != 0) {
1436 		return;
1437 	}
1438 
1439 	imc->imc_decoder_buf = buf;
1440 	imc->imc_decoder_len = len;
1441 }
1442 
1443 static void
1444 imc_nvl_create(imc_t *imc)
1445 {
1446 	uint_t csock;
1447 	for (csock = 0; csock < imc->imc_nsockets; csock++) {
1448 		uint_t i;
1449 		nvlist_t *nvl;
1450 		nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
1451 		imc_socket_t *sock = &imc->imc_sockets[csock];
1452 
1453 		nvl = fnvlist_alloc();
1454 		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
1455 		    MCINTEL_NVLIST_VERS1);
1456 		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
1457 		    sock->isock_nimc);
1458 
1459 		for (i = 0; i < sock->isock_nimc; i++) {
1460 			mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
1461 		}
1462 
1463 		fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
1464 		    mcs, sock->isock_nimc);
1465 
1466 		for (i = 0; i < sock->isock_nimc; i++) {
1467 			nvlist_free(mcs[i]);
1468 		}
1469 
1470 		sock->isock_nvl = nvl;
1471 		imc_nvl_pack(sock, B_TRUE);
1472 	}
1473 }
1474 
1475 /*
1476  * Determine the top of low and high memory. These determine whether transaction
1477  * addresses target main memory or not. Unfortunately, the way that these are
1478  * stored and fetched changes with different generations.
1479  */
1480 static void
1481 imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
1482 {
1483 	uint32_t tolm, tohm_low, tohm_hi;
1484 
1485 	tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1486 	    imc->imc_gen_data->igd_tolm_offset);
1487 	tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1488 	    imc->imc_gen_data->igd_tohm_low_offset);
1489 	if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
1490 		tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1491 		    imc->imc_gen_data->igd_tohm_hi_offset);
1492 	} else {
1493 		tohm_hi = 0;
1494 	}
1495 
1496 	if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
1497 	    tohm_hi == PCI_EINVAL32) {
1498 		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1499 		return;
1500 	}
1501 
1502 	switch (imc->imc_gen) {
1503 	case IMC_GEN_SANDY:
1504 	case IMC_GEN_IVY:
1505 		sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
1506 		    IMC_TOLM_SNB_IVY_SHIFT;
1507 		sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
1508 		    IMC_TOLM_SNB_IVY_SHIFT;
1509 		break;
1510 	case IMC_GEN_HASWELL:
1511 	case IMC_GEN_BROADWELL:
1512 	case IMC_GEN_SKYLAKE:
1513 		sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
1514 		sad->isad_tohm = ((uint64_t)tohm_low &
1515 		    IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);
1516 
1517 		/*
1518 		 * Adjust the values to turn them into an exclusive range.
1519 		 */
1520 		sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
1521 		sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
1522 		break;
1523 	default:
1524 		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
1525 		    "set to unknown generation: %u", imc->imc_gen);
1526 		return;
1527 	}
1528 }
1529 
1530 static void
1531 imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
1532     uint32_t raw)
1533 {
1534 	uint_t attr;
1535 	uint64_t limit;
1536 	bzero(rule, sizeof (imc_sad_rule_t));
1537 
1538 	rule->isr_raw_dram = raw;
1539 	rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
1540 	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1541 		switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
1542 		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
1543 			rule->isr_imode = IMC_SAD_IMODE_8t6;
1544 			break;
1545 		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
1546 			rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
1547 			break;
1548 		}
1549 	} else {
1550 		switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
1551 		case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
1552 			rule->isr_imode = IMC_SAD_IMODE_8t6;
1553 			break;
1554 		case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
1555 			rule->isr_imode = IMC_SAD_IMODE_10t8;
1556 			break;
1557 		case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
1558 			rule->isr_imode = IMC_SAD_IMODE_14t12;
1559 			break;
1560 		case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
1561 			rule->isr_imode = IMC_SAD_IMODE_32t30;
1562 			break;
1563 		}
1564 	}
1565 
1566 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1567 		attr = IMC_SAD_DRAM_ATTR_SKX(raw);
1568 	} else {
1569 		attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
1570 	}
1571 
1572 	switch (attr) {
1573 	case IMC_SAD_DRAM_ATTR_DRAM:
1574 		rule->isr_type = IMC_SAD_TYPE_DRAM;
1575 		break;
1576 	case IMC_SAD_DRAM_ATTR_MMCFG:
1577 		rule->isr_type = IMC_SAD_TYPE_MMCFG;
1578 		break;
1579 	case IMC_SAD_DRAM_ATTR_NXM:
1580 		if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1581 			sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1582 		}
1583 		rule->isr_type = IMC_SAD_TYPE_NXM;
1584 		break;
1585 	default:
1586 		sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1587 		break;
1588 	}
1589 
1590 	/*
1591 	 * Fetch the limit which represents bits 45:26 and then adjust this so
1592 	 * that it is exclusive.
1593 	 */
1594 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1595 		limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
1596 	} else {
1597 		limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
1598 	}
1599 	rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
1600 	    IMC_SAD_DRAM_LIMIT_EXCLUSIVE;
1601 
1602 	/*
1603 	 * The rest of this does not apply to Sandy Bridge.
1604 	 */
1605 	if (imc->imc_gen == IMC_GEN_SANDY)
1606 		return;
1607 
1608 	if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
1609 		rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
1610 		return;
1611 	}
1612 
1613 	switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
1614 	case IMC_SAD_DRAM_MOD23_MOD3:
1615 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
1616 		break;
1617 	case IMC_SAD_DRAM_MOD23_MOD2_C01:
1618 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
1619 		break;
1620 	case IMC_SAD_DRAM_MOD23_MOD2_C12:
1621 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
1622 		break;
1623 	case IMC_SAD_DRAM_MOD23_MOD2_C02:
1624 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
1625 		break;
1626 	}
1627 
1628 	rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
1629 	switch (IMC_SAD_DRAM_MOD3_SKX(raw)) {
1630 	case IMC_SAD_DRAM_MOD3_MODE_45t6:
1631 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
1632 		break;
1633 	case IMC_SAD_DRAM_MOD3_MODE_45t8:
1634 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
1635 		break;
1636 	case IMC_SAD_DRAM_MOD3_MODE_45t12:
1637 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
1638 		break;
1639 	default:
1640 		sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
1641 		break;
1642 	}
1643 }
1644 
1645 static void
1646 imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1647 {
1648 	uint_t i;
1649 	uint32_t mlen, mbase, skipbits, skipafter;
1650 
1651 	rule->isr_raw_interleave = raw;
1652 
1653 	/*
1654 	 * Right now all architectures always have the maximum number of SAD
1655 	 * interleave targets.
1656 	 */
1657 	rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1658 
1659 	/*
1660 	 * Sandy Bridge has a gap in the interleave list due to the fact that it
1661 	 * uses a smaller length.
1662 	 */
1663 	if (imc->imc_gen > IMC_GEN_SANDY) {
1664 		mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1665 		mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1666 		skipbits = skipafter = 0;
1667 	} else {
1668 		mlen = IMC_SAD_ILEAVE_SNB_LEN;
1669 		mbase = IMC_SAD_ILEAVE_SNB_MASK;
1670 		skipbits = 2;
1671 		skipafter = 4;
1672 	}
1673 
1674 	for (i = 0; i < rule->isr_ntargets; i++) {
1675 		uint32_t mask, shift;
1676 
1677 		shift = i * mlen;
1678 		if (i >= skipafter)
1679 			shift += skipbits;
1680 		mask = mbase << shift;
1681 		rule->isr_targets[i] = (raw & mask) >> shift;
1682 	}
1683 }
1684 
1685 static void
1686 imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1687 {
1688 	uint_t i;
1689 	off_t off;
1690 
1691 	sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1692 	for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1693 	    i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1694 		uint32_t dram, interleave;
1695 		imc_sad_rule_t *rule = &sad->isad_rules[i];
1696 
1697 		dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1698 		interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1699 		    off + 4);
1700 
1701 		if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1702 			sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1703 			return;
1704 		}
1705 
1706 		imc_sad_fill_rule(imc, sad, rule, dram);
1707 		imc_sad_fill_rule_interleave(imc, rule, interleave);
1708 	}
1709 }
1710 
1711 static void
1712 imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1713 {
1714 	uint_t i;
1715 	imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1716 
1717 	if (imc->imc_gen < IMC_GEN_SKYLAKE)
1718 		return;
1719 	if (sad->isad_valid != 0)
1720 		return;
1721 
1722 	mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1723 	for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1724 		uint_t chanoff, ringoff;
1725 
1726 		ringoff = i * IMC_MC_ROUTE_RING_BITS;
1727 		chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1728 
1729 		mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1730 		    ringoff) & IMC_MC_ROUTE_RING_MASK;
1731 		mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1732 		    chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1733 	}
1734 }
1735 
1736 /*
1737  * Initialize the SAD. To do this we have to do a few different things:
1738  *
1739  * 1. Determine where the top of low and high memory is.
1740  * 2. Read and decode all of the rules for the SAD
1741  * 3. On systems with a route table, decode the raw routes
1742  *
1743  * At this point in time, we treat TOLM and TOHM as a per-socket construct, even
1744  * though it really should be global, this just makes life a bit simpler.
1745  */
1746 static void
1747 imc_decoder_init_sad(imc_t *imc)
1748 {
1749 	uint_t i;
1750 
1751 	for (i = 0; i < imc->imc_nsockets; i++) {
1752 		imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1753 		imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1754 		imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1755 	}
1756 }
1757 
1758 static void
1759 imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
1760     imc_tad_rule_t *rule, uint32_t val)
1761 {
1762 	uint64_t limit;
1763 
1764 	limit = IMC_TAD_LIMIT(val);
1765 	rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
1766 	    IMC_TAD_LIMIT_EXCLUSIVE;
1767 	rule->itr_raw = val;
1768 
1769 	switch (IMC_TAD_SOCK_WAY(val)) {
1770 	case IMC_TAD_SOCK_WAY_1:
1771 		rule->itr_sock_way = 1;
1772 		break;
1773 	case IMC_TAD_SOCK_WAY_2:
1774 		rule->itr_sock_way = 2;
1775 		break;
1776 	case IMC_TAD_SOCK_WAY_4:
1777 		rule->itr_sock_way = 4;
1778 		break;
1779 	case IMC_TAD_SOCK_WAY_8:
1780 		rule->itr_sock_way = 8;
1781 		break;
1782 	}
1783 
1784 	rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
1785 	rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1786 	rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1787 
1788 	/*
1789 	 * Starting with Skylake the targets that are used are no longer part of
1790 	 * the TAD. Those come from the IMC route table.
1791 	 */
1792 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1793 		rule->itr_ntargets = 0;
1794 		return;
1795 	}
1796 
1797 	rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
1798 	rule->itr_targets[0] = IMC_TAD_TARG0(val);
1799 	rule->itr_targets[1] = IMC_TAD_TARG1(val);
1800 	rule->itr_targets[2] = IMC_TAD_TARG2(val);
1801 	rule->itr_targets[3] = IMC_TAD_TARG3(val);
1802 
1803 	if (prev == NULL) {
1804 		rule->itr_base = 0;
1805 	} else {
1806 		rule->itr_base = prev->itr_limit + 1;
1807 	}
1808 }
1809 
1810 static void
1811 imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1812     uint32_t val)
1813 {
1814 	uint64_t base;
1815 
1816 	rule->itr_raw_gran = val;
1817 	base = IMC_TAD_BASE_BASE(val);
1818 	rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1819 
1820 	switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
1821 	case IMC_TAD_BASE_CHAN_GRAN_64B:
1822 		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1823 		break;
1824 	case IMC_TAD_BASE_CHAN_GRAN_256B:
1825 		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1826 		break;
1827 	case IMC_TAD_BASE_CHAN_GRAN_4KB:
1828 		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1829 		break;
1830 	default:
1831 		tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
1832 		return;
1833 	}
1834 
1835 	switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1836 	case IMC_TAD_BASE_SOCK_GRAN_64B:
1837 		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1838 		break;
1839 	case IMC_TAD_BASE_SOCK_GRAN_256B:
1840 		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1841 		break;
1842 	case IMC_TAD_BASE_SOCK_GRAN_4KB:
1843 		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1844 		break;
1845 	case IMC_TAD_BASE_SOCK_GRAN_1GB:
1846 		rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1847 		break;
1848 	}
1849 }
1850 
1851 /*
1852  * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's
1853  * suggested that the channel wayness will take this into account and therefore
1854  * should be accurately reflected.
1855  */
1856 static void
1857 imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
1858 {
1859 	uint_t i;
1860 	off_t baseoff;
1861 	imc_tad_rule_t *prev;
1862 
1863 	tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
1864 	for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
1865 	    prev = NULL; i < tad->itad_nrules;
1866 	    i++, baseoff += sizeof (uint32_t)) {
1867 		uint32_t val;
1868 		off_t off;
1869 		imc_tad_rule_t *rule = &tad->itad_rules[i];
1870 
1871 		/*
1872 		 * On Skylake, the TAD rules are split among two registers. The
1873 		 * latter set mimics what exists on pre-Skylake.
1874 		 */
1875 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1876 			off = baseoff + IMC_SKX_WAYNESS_OFFSET;
1877 		} else {
1878 			off = baseoff;
1879 		}
1880 
1881 		val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
1882 		if (val == PCI_EINVAL32) {
1883 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1884 			return;
1885 		}
1886 
1887 		imc_tad_fill_rule(imc, tad, prev, rule, val);
1888 		prev = rule;
1889 		if (imc->imc_gen < IMC_GEN_SKYLAKE)
1890 			continue;
1891 
1892 		val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
1893 		if (val == PCI_EINVAL32) {
1894 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1895 			return;
1896 		}
1897 
1898 		imc_tad_fill_skx(imc, tad, rule, val);
1899 	}
1900 }
1901 
1902 /*
1903  * Check for features which change how decoding works.
1904  */
1905 static void
1906 imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1907 {
1908 	uint32_t val;
1909 
1910 	/*
1911 	 * Determine whether or not lockstep mode or mirroring are enabled.
1912 	 * These change the behavior of how we're supposed to interpret channel
1913 	 * wayness. Lockstep is available in the TAD's features. Mirroring is
1914 	 * available on the IMC's features. This isn't present in Skylake+. On
1915 	 * Skylake Mirorring is a property of the SAD rule and there is no
1916 	 * lockstep.
1917 	 */
1918 	switch (imc->imc_gen) {
1919 	case IMC_GEN_SANDY:
1920 	case IMC_GEN_IVY:
1921 	case IMC_GEN_HASWELL:
1922 	case IMC_GEN_BROADWELL:
1923 		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1924 		    imc->imc_gen_data->igd_tad_sysdef);
1925 		if (val == PCI_EINVAL32) {
1926 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1927 			return;
1928 		}
1929 		if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1930 			tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1931 		}
1932 
1933 		val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1934 		    imc->imc_gen_data->igd_mc_mirror);
1935 		if (val == PCI_EINVAL32) {
1936 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1937 			return;
1938 		}
1939 		if (IMC_MC_MIRROR_SNB_BRD(val)) {
1940 			tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1941 		}
1942 		break;
1943 	default:
1944 		break;
1945 	}
1946 
1947 	/*
1948 	 * Now, go through and look at values that'll change how we do the
1949 	 * channel index and adddress calculation. These are only present
1950 	 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1951 	 * and they don't exist on Skylake+.
1952 	 */
1953 	switch (imc->imc_gen) {
1954 	case IMC_GEN_IVY:
1955 	case IMC_GEN_HASWELL:
1956 	case IMC_GEN_BROADWELL:
1957 		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1958 		    imc->imc_gen_data->igd_tad_sysdef2);
1959 		if (val == PCI_EINVAL32) {
1960 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1961 			return;
1962 		}
1963 		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1964 			tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
1965 		}
1966 		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1967 			tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
1968 		}
1969 		break;
1970 	default:
1971 		break;
1972 	}
1973 }
1974 
1975 /*
1976  * Read the IMC channel interleave records
1977  */
1978 static void
1979 imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1980 {
1981 	uint_t i;
1982 	off_t off;
1983 
1984 	chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1985 	for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1986 	    i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1987 		uint32_t val;
1988 		uint64_t offset;
1989 
1990 		val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1991 		    off);
1992 		if (val == PCI_EINVAL32) {
1993 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1994 			return;
1995 		}
1996 
1997 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1998 			offset = IMC_TADCHAN_OFFSET_SKX(val);
1999 		} else {
2000 			offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2001 		}
2002 
2003 		chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2004 		chan->ich_tad_offsets_raw[i] = val;
2005 	}
2006 }
2007 
2008 static void
2009 imc_decoder_init_tad(imc_t *imc)
2010 {
2011 	uint_t i;
2012 
2013 	for (i = 0; i < imc->imc_nsockets; i++) {
2014 		uint_t j;
2015 
2016 		for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2017 			imc_tad_read_features(imc,
2018 			    &imc->imc_sockets[i].isock_tad[j],
2019 			    &imc->imc_sockets[i].isock_imcs[j]);
2020 			imc_tad_read_rules(imc,
2021 			    &imc->imc_sockets[i].isock_tad[j]);
2022 		}
2023 	}
2024 
2025 	for (i = 0; i < imc->imc_nsockets; i++) {
2026 		uint_t j;
2027 		imc_socket_t *sock = &imc->imc_sockets[i];
2028 
2029 		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2030 			uint_t k;
2031 			imc_mc_t *mc = &sock->isock_imcs[j];
2032 
2033 			for (k = 0; k < mc->icn_nchannels; k++) {
2034 				imc_channel_t *chan = &mc->icn_channels[k];
2035 				imc_tad_read_interleave(imc, chan);
2036 			}
2037 		}
2038 	}
2039 }
2040 
2041 static void
2042 imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
2043     imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
2044 {
2045 	uint_t i;
2046 	off_t off, incr;
2047 
2048 	/*
2049 	 * Rank interleave offset registers come in two forms. Either they are
2050 	 * contiguous for a given wayness, meaning that all of the entries for
2051 	 * wayness zero are contiguous, or they are sparse, meaning that there
2052 	 * is a bank for entry zero for all wayness, then entry one for all
2053 	 * wayness, etc.
2054 	 */
2055 	if (contig) {
2056 		off = imc->imc_gen_data->igd_rir_ileave_offset +
2057 		    (rirno * imc->imc_gen_data->igd_rir_nileaves *
2058 		    sizeof (uint32_t));
2059 		incr = sizeof (uint32_t);
2060 	} else {
2061 		off = imc->imc_gen_data->igd_rir_ileave_offset +
2062 		    (rirno * sizeof (uint32_t));
2063 		incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t);
2064 	}
2065 	for (i = 0; i < rank->irle_nentries; i++, off += incr) {
2066 		uint32_t val;
2067 		uint64_t offset;
2068 		imc_rank_ileave_entry_t *ent = &rank->irle_entries[i];
2069 
2070 		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2071 		if (val == PCI_EINVAL32) {
2072 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2073 			return;
2074 		}
2075 
2076 		switch (imc->imc_gen) {
2077 		case IMC_GEN_BROADWELL:
2078 			ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val);
2079 			break;
2080 		default:
2081 			ent->irle_target = IMC_RIR_OFFSET_TARGET(val);
2082 			break;
2083 		}
2084 		if (imc->imc_gen >= IMC_GEN_HASWELL) {
2085 			offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val);
2086 		} else {
2087 			offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val);
2088 		}
2089 		ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT;
2090 	}
2091 }
2092 
2093 static void
2094 imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan)
2095 {
2096 	uint_t i;
2097 	off_t off;
2098 
2099 	chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways;
2100 	for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset;
2101 	    i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) {
2102 		uint32_t val;
2103 		uint64_t lim;
2104 		imc_rank_ileave_t *ent = &chan->ich_rankileaves[i];
2105 
2106 		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2107 		if (val == PCI_EINVAL32) {
2108 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2109 			return;
2110 		}
2111 
2112 		ent->irle_raw = val;
2113 		ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0;
2114 		ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val);
2115 		ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val);
2116 		if (imc->imc_gen >= IMC_GEN_HASWELL) {
2117 			lim = IMC_RIR_LIMIT_HAS_SKX(val);
2118 		} else {
2119 			lim = IMC_RIR_LIMIT_SNB_IVB(val);
2120 		}
2121 
2122 		ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) +
2123 		    IMC_RIR_LIMIT_EXCLUSIVE;
2124 
2125 		ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves;
2126 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
2127 			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE);
2128 		} else {
2129 			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE);
2130 		}
2131 	}
2132 }
2133 
2134 static void
2135 imc_decoder_init_rir(imc_t *imc)
2136 {
2137 	uint_t i;
2138 
2139 	for (i = 0; i < imc->imc_nsockets; i++) {
2140 		uint_t j;
2141 		imc_socket_t *sock = &imc->imc_sockets[i];
2142 
2143 		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2144 			uint_t k;
2145 			imc_mc_t *mc = &sock->isock_imcs[j];
2146 
2147 			for (k = 0; k < mc->icn_nchannels; k++) {
2148 				imc_channel_t *chan = &mc->icn_channels[k];
2149 				imc_rir_read_wayness(imc, chan);
2150 			}
2151 		}
2152 	}
2153 }
2154 
2155 static cmi_errno_t
2156 imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo,
2157     uint32_t synd, int syndtype, mc_unum_t *unump)
2158 {
2159 	imc_t *imc = arg;
2160 	uint_t i;
2161 	imc_decode_state_t dec;
2162 
2163 	bzero(&dec, sizeof (dec));
2164 	if (!imc_decode_pa(imc, pa, &dec)) {
2165 		switch (dec.ids_fail) {
2166 		case IMC_DECODE_F_LEGACY_RANGE:
2167 		case IMC_DECODE_F_OUTSIDE_DRAM:
2168 			return (CMIERR_MC_NOTDIMMADDR);
2169 		default:
2170 			return (CMIERR_MC_BADSTATE);
2171 		}
2172 	}
2173 
2174 	unump->unum_board = 0;
2175 	/*
2176 	 * The chip id needs to be in the order that the OS expects it, which
2177 	 * may not be our order.
2178 	 */
2179 	for (i = 0; i < imc->imc_nsockets; i++) {
2180 		if (imc->imc_spointers[i] == dec.ids_socket)
2181 			break;
2182 	}
2183 	if (i == imc->imc_nsockets) {
2184 		return (CMIERR_MC_BADSTATE);
2185 	}
2186 	unump->unum_chip = i;
2187 	unump->unum_mc = dec.ids_tadid;
2188 	unump->unum_chan = dec.ids_channelid;
2189 	unump->unum_cs = dec.ids_dimmid;
2190 	unump->unum_rank = dec.ids_rankid;
2191 	unump->unum_offset = dec.ids_rankaddr;
2192 	for (i = 0; i < MC_UNUM_NDIMM; i++) {
2193 		unump->unum_dimms[i] = MC_INVALNUM;
2194 	}
2195 
2196 	return (CMI_SUCCESS);
2197 }
2198 
2199 static cmi_errno_t
2200 imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
2201 {
2202 	return (CMIERR_UNKNOWN);
2203 }
2204 
2205 static const cmi_mc_ops_t imc_mc_ops = {
2206 	.cmi_mc_patounum = imc_mc_patounum,
2207 	.cmi_mc_unumtopa = imc_mc_unumtopa
2208 };
2209 
2210 /*
2211  * This is where we really finish attaching and become open for business. This
2212  * occurs once we have all of the expected stubs attached. Here's where all of
2213  * the real fun begins.
2214  */
2215 static void
2216 imc_attach_complete(void *arg)
2217 {
2218 	imc_t *imc = arg;
2219 	cmi_errno_t err;
2220 
2221 	imc_set_gen_data(imc);
2222 
2223 	/*
2224 	 * On SKX and newer, we can fail to map PCI buses at this point due to
2225 	 * bad PCIe reads.
2226 	 */
2227 	if (!imc_map_stubs(imc)) {
2228 		goto done;
2229 	}
2230 
2231 	if (!imc_validate_stubs(imc)) {
2232 		imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2233 		goto done;
2234 	}
2235 
2236 	imc_fixup_stubs(imc);
2237 	imc_map_sockets(imc);
2238 
2239 	if (!imc_create_minors(imc)) {
2240 		goto done;
2241 	}
2242 
2243 	imc_fill_data(imc);
2244 	imc_nvl_create(imc);
2245 
2246 	/*
2247 	 * Gather additional information that we need so that we can properly
2248 	 * initialize the memory decoder and encoder.
2249 	 */
2250 	imc_decoder_init_sad(imc);
2251 	imc_decoder_init_tad(imc);
2252 	imc_decoder_init_rir(imc);
2253 
2254 	/*
2255 	 * Register decoder functions. This may fail. If so, try and complain
2256 	 * loudly, but stay active to allow other data to be useful. Register a
2257 	 * global handle.
2258 	 */
2259 	if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2260 		imc->imc_flags |= IMC_F_MCREG_FAILED;
2261 		dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2262 		    "decoding operations: 0x%x", err);
2263 	}
2264 
2265 done:
2266 	mutex_enter(&imc->imc_lock);
2267 	imc->imc_flags &= IMC_F_ATTACH_DISPATCHED;
2268 	imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2269 	mutex_exit(&imc->imc_lock);
2270 }
2271 
2272 static int
2273 imc_stub_comparator(const void *l, const void *r)
2274 {
2275 	const imc_stub_t *sl = l, *sr = r;
2276 	if (sl->istub_bus > sr->istub_bus)
2277 		return (1);
2278 	if (sl->istub_bus < sr->istub_bus)
2279 		return (-1);
2280 	if (sl->istub_dev > sr->istub_dev)
2281 		return (1);
2282 	if (sl->istub_dev < sr->istub_dev)
2283 		return (-1);
2284 	if (sl->istub_func > sr->istub_func)
2285 		return (1);
2286 	if (sl->istub_func < sr->istub_func)
2287 		return (-1);
2288 	return (0);
2289 }
2290 
2291 static int
2292 imc_stub_scan_cb(dev_info_t *dip, void *arg)
2293 {
2294 	int vid, did;
2295 	const imc_stub_table_t *table;
2296 	imc_t *imc = arg;
2297 	int *regs;
2298 	uint_t i, nregs;
2299 
2300 	if (dip == ddi_root_node()) {
2301 		return (DDI_WALK_CONTINUE);
2302 	}
2303 
2304 	/*
2305 	 * Get the dev info name. PCI devices will always be children of PCI
2306 	 * devices today on x86. If we reach something that has a device name
2307 	 * that's not PCI, then we can prune it's children.
2308 	 */
2309 	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2310 		return (DDI_WALK_PRUNECHILD);
2311 	}
2312 
2313 	/*
2314 	 * Get the device and vendor ID and see if this is something the imc
2315 	 * knows about or cares about.
2316 	 */
2317 	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2318 	    "vendor-id", PCI_EINVAL16);
2319 	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2320 	    "device-id", PCI_EINVAL16);
2321 	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2322 		return (DDI_WALK_CONTINUE);
2323 	}
2324 
2325 	if (vid != IMC_PCI_VENDOR_INTC) {
2326 		return (DDI_WALK_PRUNECHILD);
2327 	}
2328 
2329 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2330 	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2331 		return (DDI_WALK_CONTINUE);
2332 	}
2333 
2334 	if (nregs == 0) {
2335 		ddi_prop_free(regs);
2336 		return (DDI_WALK_CONTINUE);
2337 	}
2338 
2339 
2340 	table = NULL;
2341 	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2342 		if (imc_stub_table[i].imcs_devid == did &&
2343 		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2344 		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2345 			table = &imc_stub_table[i];
2346 			break;
2347 		}
2348 	}
2349 	ddi_prop_free(regs);
2350 
2351 	/*
2352 	 * Not a match, not interesting.
2353 	 */
2354 	if (table == NULL) {
2355 		return (DDI_WALK_CONTINUE);
2356 	}
2357 
2358 	mutex_enter(&imc->imc_lock);
2359 	imc->imc_nscanned++;
2360 	mutex_exit(&imc->imc_lock);
2361 
2362 	return (DDI_WALK_CONTINUE);
2363 }
2364 
2365 /*
2366  * From here, go through and see how many of the devices that we know about.
2367  */
2368 static void
2369 imc_stub_scan(void *arg)
2370 {
2371 	imc_t *imc = arg;
2372 	boolean_t dispatch = B_FALSE;
2373 
2374 	/*
2375 	 * Zero out the scan results in case we've been detached and reattached.
2376 	 */
2377 	mutex_enter(&imc->imc_lock);
2378 	imc->imc_nscanned = 0;
2379 	mutex_exit(&imc->imc_lock);
2380 
2381 	ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2382 
2383 	mutex_enter(&imc->imc_lock);
2384 	imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2385 	imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2386 
2387 	/*
2388 	 * If the scan found no nodes, then that means that we're on a hardware
2389 	 * platform that we don't support. Therefore, there's no reason to do
2390 	 * anything here.
2391 	 */
2392 	if (imc->imc_nscanned == 0) {
2393 		imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2394 		mutex_exit(&imc->imc_lock);
2395 		return;
2396 	}
2397 
2398 	if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2399 		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2400 		dispatch = B_TRUE;
2401 	}
2402 
2403 	mutex_exit(&imc->imc_lock);
2404 
2405 	if (dispatch) {
2406 		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2407 		    imc, DDI_SLEEP);
2408 	}
2409 }
2410 
2411 /*
2412  * By default, refuse to allow stubs to detach.
2413  */
2414 int
2415 imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2416 {
2417 	imc_stub_t *stub;
2418 	imc_t *imc = imc_data;
2419 
2420 	mutex_enter(&imc->imc_lock);
2421 
2422 	/*
2423 	 * By default, we do not allow stubs to detach. However, if the driver
2424 	 * has attached to devices on a platform it doesn't recognize or
2425 	 * support or if the override flag has been set, then allow detach to
2426 	 * proceed.
2427 	 */
2428 	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2429 	    imc_allow_detach == 0) {
2430 		mutex_exit(&imc->imc_lock);
2431 		return (DDI_FAILURE);
2432 	}
2433 
2434 	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2435 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2436 		if (stub->istub_dip == dip) {
2437 			break;
2438 		}
2439 	}
2440 
2441 	/*
2442 	 * A device was attached to us that we somehow don't know about. Allow
2443 	 * this to proceed.
2444 	 */
2445 	if (stub == NULL) {
2446 		mutex_exit(&imc->imc_lock);
2447 		return (DDI_SUCCESS);
2448 	}
2449 
2450 	pci_config_teardown(&stub->istub_cfgspace);
2451 	avl_remove(&imc->imc_stubs, stub);
2452 	kmem_free(stub, sizeof (imc_stub_t));
2453 	mutex_exit(&imc->imc_lock);
2454 
2455 	return (DDI_SUCCESS);
2456 }
2457 
2458 int
2459 imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460 {
2461 	imc_stub_t *stub, *lookup;
2462 	int did, vid, *regs;
2463 	uint_t i, nregs;
2464 	const imc_stub_table_t *table;
2465 	avl_index_t idx;
2466 	boolean_t dispatch = B_FALSE;
2467 	imc_t *imc = imc_data;
2468 
2469 	if (cmd != DDI_ATTACH) {
2470 		return (DDI_FAILURE);
2471 	}
2472 
2473 	/*
2474 	 * We've been asked to attach a stub. First, determine if this is even a
2475 	 * PCI device that we should care about. Then, append it to our global
2476 	 * list and kick off the configuration task. Note that we do this
2477 	 * configuration task in a taskq so that we don't interfere with the
2478 	 * normal attach / detach path processing.
2479 	 */
2480 	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2481 		return (DDI_FAILURE);
2482 	}
2483 
2484 	/*
2485 	 * Get the device and vendor ID and see if this is something the imc
2486 	 * knows about or cares about.
2487 	 */
2488 	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2489 	    "vendor-id", PCI_EINVAL16);
2490 	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2491 	    "device-id", PCI_EINVAL16);
2492 	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2493 		return (DDI_FAILURE);
2494 	}
2495 
2496 	/*
2497 	 * Only accept INTC parts on the imc driver.
2498 	 */
2499 	if (vid != IMC_PCI_VENDOR_INTC) {
2500 		return (DDI_FAILURE);
2501 	}
2502 
2503 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2504 	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2505 		return (DDI_FAILURE);
2506 	}
2507 
2508 	if (nregs == 0) {
2509 		ddi_prop_free(regs);
2510 		return (DDI_FAILURE);
2511 	}
2512 
2513 	/*
2514 	 * Determine if this matches a known device.
2515 	 */
2516 	table = NULL;
2517 	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2518 		if (imc_stub_table[i].imcs_devid == did &&
2519 		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2520 		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2521 			table = &imc_stub_table[i];
2522 			break;
2523 		}
2524 	}
2525 
2526 	if (i == ARRAY_SIZE(imc_stub_table)) {
2527 		ddi_prop_free(regs);
2528 		return (DDI_FAILURE);
2529 	}
2530 
2531 	/*
2532 	 * We've found something. Make sure the generation matches our current
2533 	 * one. If it does, construct the entry and append it to the list.
2534 	 */
2535 	mutex_enter(&imc->imc_lock);
2536 	if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2537 	    table->imcs_gen) {
2538 		mutex_exit(&imc->imc_lock);
2539 		ddi_prop_free(regs);
2540 		dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
2541 		    "that has different hardware generation (%u) from current "
2542 		    "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen);
2543 		return (DDI_FAILURE);
2544 	} else {
2545 		imc->imc_gen = table->imcs_gen;
2546 	}
2547 	mutex_exit(&imc->imc_lock);
2548 
2549 	stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2550 	stub->istub_dip = dip;
2551 	stub->istub_vid = vid;
2552 	stub->istub_did = did;
2553 	stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2554 	stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2555 	stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2556 	ddi_prop_free(regs);
2557 	stub->istub_table = table;
2558 
2559 	if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
2560 		kmem_free(stub, sizeof (stub));
2561 		dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2562 		    "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2563 		    vid, did);
2564 		return (DDI_FAILURE);
2565 	}
2566 
2567 	mutex_enter(&imc->imc_lock);
2568 	if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2569 		dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2570 		    "bdf %u/%u/%u with %s (%u/%u), not attaching",
2571 		    ddi_node_name(imc->imc_dip), vid, did,
2572 		    stub->istub_bus, stub->istub_dev, stub->istub_func,
2573 		    ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2574 		    lookup->istub_did);
2575 		mutex_exit(&imc->imc_lock);
2576 		pci_config_teardown(&stub->istub_cfgspace);
2577 		kmem_free(stub, sizeof (stub));
2578 
2579 		return (DDI_FAILURE);
2580 	}
2581 	avl_insert(&imc->imc_stubs, stub, idx);
2582 
2583 	if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2584 	    avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2585 		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2586 		dispatch = B_TRUE;
2587 	}
2588 	mutex_exit(&imc->imc_lock);
2589 
2590 	if (dispatch) {
2591 		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2592 		    imc, DDI_SLEEP);
2593 	}
2594 
2595 	return (DDI_SUCCESS);
2596 }
2597 
2598 static int
2599 imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2600 {
2601 	imc_t *imc = imc_data;
2602 
2603 	if ((flag & (FEXCL | FNDELAY)) != 0)
2604 		return (EINVAL);
2605 
2606 	if (otyp != OTYP_CHR)
2607 		return (EINVAL);
2608 
2609 	mutex_enter(&imc->imc_lock);
2610 
2611 	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2612 		mutex_exit(&imc->imc_lock);
2613 		return (ENOTSUP);
2614 	}
2615 
2616 	/*
2617 	 * It's possible that someone has come in during the window between when
2618 	 * we've created the minor node and when we've finished doing work.
2619 	 */
2620 	if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2621 		mutex_exit(&imc->imc_lock);
2622 		return (EAGAIN);
2623 	}
2624 
2625 	/*
2626 	 * It's not clear how someone would get a minor that we didn't create.
2627 	 * But be paranoid and make sure.
2628 	 */
2629 	if (getminor(*devp) >= imc->imc_nsockets) {
2630 		mutex_exit(&imc->imc_lock);
2631 		return (EINVAL);
2632 	}
2633 
2634 	/*
2635 	 * Make sure this socket entry has been filled in.
2636 	 */
2637 	if (imc->imc_spointers[getminor(*devp)] == NULL) {
2638 		mutex_exit(&imc->imc_lock);
2639 		return (EINVAL);
2640 	}
2641 
2642 	mutex_exit(&imc->imc_lock);
2643 
2644 	return (0);
2645 }
2646 
2647 static void
2648 imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2649 {
2650 	imc_decode_state_t dec;
2651 	uint_t i;
2652 
2653 	bzero(&dec, sizeof (dec));
2654 	if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2655 		encode->mcei_err = (uint32_t)dec.ids_fail;
2656 		encode->mcei_errdata = dec.ids_fail_data;
2657 		return;
2658 	}
2659 
2660 	encode->mcei_errdata = 0;
2661 	encode->mcei_err = 0;
2662 	encode->mcei_board = 0;
2663 	for (i = 0; i < imc->imc_nsockets; i++) {
2664 		if (imc->imc_spointers[i] == dec.ids_socket)
2665 			break;
2666 	}
2667 	encode->mcei_chip = i;
2668 	encode->mcei_mc = dec.ids_tadid;
2669 	encode->mcei_chan = dec.ids_channelid;
2670 	encode->mcei_dimm = dec.ids_dimmid;
2671 	encode->mcei_rank_addr = dec.ids_rankaddr;
2672 	encode->mcei_rank = dec.ids_rankid;
2673 	encode->mcei_row = UINT32_MAX;
2674 	encode->mcei_column = UINT32_MAX;
2675 	encode->mcei_pad = 0;
2676 }
2677 
2678 static int
2679 imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2680     int *rvalp)
2681 {
2682 	int ret;
2683 	minor_t m;
2684 	mc_snapshot_info_t info;
2685 	mc_encode_ioc_t encode;
2686 	imc_t *imc = imc_data;
2687 	imc_socket_t *sock;
2688 
2689 	mutex_enter(&imc->imc_lock);
2690 	m = getminor(dev);
2691 	if (m >= imc->imc_nsockets) {
2692 		ret = EINVAL;
2693 		goto done;
2694 	}
2695 	sock = imc->imc_spointers[m];
2696 	if (sock == NULL) {
2697 		ret = EINVAL;
2698 		goto done;
2699 	}
2700 
2701 	/*
2702 	 * Note, other memory controller drivers don't check mode for reading
2703 	 * data nor do they care who can read it from a credential perspective.
2704 	 * As such we don't either at this time.
2705 	 */
2706 	switch (cmd) {
2707 	case MC_IOC_SNAPSHOT_INFO:
2708 		imc_nvl_pack(sock, B_FALSE);
2709 		if (sock->isock_buf == NULL) {
2710 			ret = EIO;
2711 			break;
2712 		}
2713 
2714 		info.mcs_size = sock->isock_buflen;
2715 		info.mcs_gen = sock->isock_gen;
2716 
2717 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2718 			ret = EFAULT;
2719 			break;
2720 		}
2721 
2722 		ret = 0;
2723 		break;
2724 	case MC_IOC_SNAPSHOT:
2725 		imc_nvl_pack(sock, B_FALSE);
2726 		if (sock->isock_buf == NULL) {
2727 			ret = EIO;
2728 			break;
2729 		}
2730 
2731 		if (ddi_copyout(sock->isock_buf, (void *)arg,
2732 		    sock->isock_buflen, mode) != 0) {
2733 			ret = EFAULT;
2734 			break;
2735 		}
2736 
2737 		ret = 0;
2738 		break;
2739 	case MC_IOC_DECODE_SNAPSHOT_INFO:
2740 		imc_decoder_pack(imc);
2741 		if (imc->imc_decoder_buf == NULL) {
2742 			ret = EIO;
2743 			break;
2744 		}
2745 
2746 		info.mcs_size = imc->imc_decoder_len;
2747 		info.mcs_gen = imc->imc_spointers[0]->isock_gen;
2748 
2749 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2750 			ret = EFAULT;
2751 			break;
2752 		}
2753 
2754 		ret = 0;
2755 		break;
2756 	case MC_IOC_DECODE_SNAPSHOT:
2757 		imc_decoder_pack(imc);
2758 		if (imc->imc_decoder_buf == NULL) {
2759 			ret = EIO;
2760 			break;
2761 		}
2762 
2763 		if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
2764 		    imc->imc_decoder_len, mode) != 0) {
2765 			ret = EFAULT;
2766 			break;
2767 		}
2768 
2769 		ret = 0;
2770 		break;
2771 	case MC_IOC_DECODE_PA:
2772 		if (crgetzoneid(credp) != GLOBAL_ZONEID ||
2773 		    drv_priv(credp) != 0) {
2774 			ret = EPERM;
2775 			break;
2776 		}
2777 
2778 		if (ddi_copyin((void *)arg, &encode, sizeof (encode),
2779 		    mode & FKIOCTL) != 0) {
2780 			ret = EPERM;
2781 			break;
2782 		}
2783 
2784 		imc_ioctl_decode(imc, &encode);
2785 		ret = 0;
2786 
2787 		if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
2788 		    mode & FKIOCTL) != 0) {
2789 			ret = EPERM;
2790 			break;
2791 		}
2792 		break;
2793 	default:
2794 		ret = EINVAL;
2795 		goto done;
2796 	}
2797 
2798 done:
2799 	mutex_exit(&imc->imc_lock);
2800 	return (ret);
2801 }
2802 
2803 static int
2804 imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
2805 {
2806 	return (0);
2807 }
2808 
2809 static int
2810 imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2811 {
2812 	if (cmd != DDI_ATTACH) {
2813 		return (DDI_FAILURE);
2814 	}
2815 
2816 	if (imc_data == NULL || imc_data->imc_dip != NULL) {
2817 		return (DDI_FAILURE);
2818 	}
2819 
2820 	mutex_enter(&imc_data->imc_lock);
2821 	if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2822 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
2823 		mutex_exit(&imc_data->imc_lock);
2824 		return (DDI_FAILURE);
2825 	}
2826 
2827 	imc_data->imc_dip = dip;
2828 	imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2829 	mutex_exit(&imc_data->imc_lock);
2830 
2831 	(void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2832 	    DDI_SLEEP);
2833 
2834 	return (DDI_SUCCESS);
2835 }
2836 
2837 /*
2838  * We only export a single instance.
2839  */
2840 static int
2841 imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2842 {
2843 	/*
2844 	 * getinfo(9E) shouldn't be called if we're not attached. But be
2845 	 * paranoid.
2846 	 */
2847 	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2848 		return (DDI_FAILURE);
2849 	}
2850 
2851 	switch (infocmd) {
2852 	case DDI_INFO_DEVT2DEVINFO:
2853 		*resultp = imc_data->imc_dip;
2854 		break;
2855 	case DDI_INFO_DEVT2INSTANCE:
2856 		*resultp = (void *)0;
2857 		break;
2858 	default:
2859 		return (DDI_FAILURE);
2860 	}
2861 
2862 	return (DDI_SUCCESS);
2863 }
2864 
2865 static int
2866 imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2867 {
2868 	if (cmd != DDI_DETACH) {
2869 		return (DDI_FAILURE);
2870 	}
2871 
2872 	if (imc_data == NULL || imc_data->imc_dip) {
2873 		return (DDI_FAILURE);
2874 	}
2875 
2876 	mutex_enter(&imc_data->imc_lock);
2877 
2878 	/*
2879 	 * While a scan or attach is outstanding, don't allow us to detach.
2880 	 */
2881 	if ((imc_data->imc_flags &
2882 	    (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2883 		mutex_exit(&imc_data->imc_lock);
2884 		return (DDI_FAILURE);
2885 	}
2886 
2887 	/*
2888 	 * Because the stub driver depends on the imc driver, we shouldn't be
2889 	 * able to have any entries in this list when we detach. However, we
2890 	 * check just to make sure.
2891 	 */
2892 	if (!avl_is_empty(&imc_data->imc_stubs)) {
2893 		mutex_exit(&imc_data->imc_lock);
2894 		return (DDI_FAILURE);
2895 	}
2896 
2897 	nvlist_free(imc_data->imc_decoder_dump);
2898 	imc_data->imc_decoder_dump = NULL;
2899 	if (imc_data->imc_decoder_buf != NULL) {
2900 		kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2901 		imc_data->imc_decoder_buf = NULL;
2902 		imc_data->imc_decoder_len = 0;
2903 	}
2904 
2905 	ddi_remove_minor_node(imc_data->imc_dip, NULL);
2906 	imc_data->imc_dip = NULL;
2907 	mutex_exit(&imc_data->imc_lock);
2908 
2909 	ddi_taskq_wait(imc_data->imc_taskq);
2910 	ddi_taskq_destroy(imc_data->imc_taskq);
2911 	imc_data->imc_taskq = NULL;
2912 
2913 	return (DDI_SUCCESS);
2914 }
2915 
2916 static void
2917 imc_free(void)
2918 {
2919 	if (imc_data == NULL) {
2920 		return;
2921 	}
2922 
2923 	VERIFY(avl_is_empty(&imc_data->imc_stubs));
2924 	avl_destroy(&imc_data->imc_stubs);
2925 	mutex_destroy(&imc_data->imc_lock);
2926 	kmem_free(imc_data, sizeof (imc_t));
2927 	imc_data = NULL;
2928 }
2929 
2930 static void
2931 imc_alloc(void)
2932 {
2933 	imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);
2934 
2935 	mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
2936 	avl_create(&imc_data->imc_stubs, imc_stub_comparator,
2937 	    sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
2938 }
2939 
2940 static struct cb_ops imc_cb_ops = {
2941 	.cb_open = imc_open,
2942 	.cb_close = imc_close,
2943 	.cb_strategy = nodev,
2944 	.cb_print = nodev,
2945 	.cb_dump = nodev,
2946 	.cb_read = nodev,
2947 	.cb_write = nodev,
2948 	.cb_ioctl = imc_ioctl,
2949 	.cb_devmap = nodev,
2950 	.cb_mmap = nodev,
2951 	.cb_segmap = nodev,
2952 	.cb_chpoll = nochpoll,
2953 	.cb_prop_op = ddi_prop_op,
2954 	.cb_flag = D_MP,
2955 	.cb_rev = CB_REV,
2956 	.cb_aread = nodev,
2957 	.cb_awrite = nodev
2958 };
2959 
2960 static struct dev_ops imc_dev_ops = {
2961 	.devo_rev = DEVO_REV,
2962 	.devo_refcnt = 0,
2963 	.devo_getinfo = imc_getinfo,
2964 	.devo_identify = nulldev,
2965 	.devo_probe = nulldev,
2966 	.devo_attach = imc_attach,
2967 	.devo_detach = imc_detach,
2968 	.devo_reset = nodev,
2969 	.devo_cb_ops = &imc_cb_ops,
2970 	.devo_quiesce = ddi_quiesce_not_needed
2971 };
2972 
2973 static struct modldrv imc_modldrv = {
2974 	.drv_modops = &mod_driverops,
2975 	.drv_linkinfo = "Intel Integrated Memory Controller Driver",
2976 	.drv_dev_ops = &imc_dev_ops
2977 };
2978 
2979 static struct modlinkage imc_modlinkage = {
2980 	.ml_rev = MODREV_1,
2981 	.ml_linkage = { &imc_modldrv, NULL }
2982 };
2983 
2984 int
2985 _init(void)
2986 {
2987 	int ret;
2988 
2989 	if ((ret = mod_install(&imc_modlinkage)) == 0) {
2990 		imc_alloc();
2991 	}
2992 
2993 	return (ret);
2994 }
2995 
2996 int
2997 _info(struct modinfo *modinfop)
2998 {
2999 	return (mod_info(&imc_modlinkage, modinfop));
3000 }
3001 
3002 int
3003 _fini(void)
3004 {
3005 	int ret;
3006 
3007 	if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3008 		imc_free();
3009 	}
3010 	return (ret);
3011 }
3012