xref: /illumos-gate/usr/src/uts/intel/io/imc/imc.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2022 Oxide Computer Company
15  */
16 
17 /*
18  * Generic Intel Integrated Memory Controller (IMC) Driver
19  *
20  * This driver talks to the CPU's IMC to understand the detailed topology of the
 * processor and to determine how to map from physical addresses to the
22  * corresponding DIMM. This driver supports the following generations of Intel
23  * chips:
24  *
25  *  - Sandy Bridge
26  *  - Ivy Bridge
27  *  - Haswell
28  *  - Broadwell
29  *  - Skylake / Cascade Lake
30  *
31  * Memory Decoding
32  * ---------------
33  *
34  * For more detailed summaries of the memory decoding process, please refer to
35  * the Intel External Design Specifications for the corresponding processor.
36  * What follows is a rough overview of how the memory decoding system works.
37  *
38  * First, we'd like to define the following concepts:
39  *
40  * SYSTEM ADDRESS
41  *
42  *	This is a physical address that the operating system normally uses. This
43  *	address may refer to DRAM, it may refer to memory mapped PCI
44  *	configuration space or device registers, or it may refer to other parts
45  *	of the system's memory map, such as the extended advanced programmable
46  *	interrupt controller (xAPIC), etc.
47  *
48  * DIMM
49  *
50  *	Dual-inline memory module. This refers to a physical stick of volatile
51  *	memory that is inserted into a slot on the motherboard.
52  *
53  * RANK
54  *
55  *	A potential sub-division of a DIMM. A DIMM's memory capacity is divided
56  *	into a number of equal sized ranks. For example, an 8 GiB DIMM, may have
57  *	1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks.
58  *
59  * RANK ADDRESS
60  *
61  *	An address that exists in the context of a given rank on a DIMM. All
62  *	ranks have overlapping addresses, so the address 0x400 exists on all
63  *	ranks on a given DIMM.
64  *
65  * CHANNEL
66  *
67  *	Multiple DIMMs may be combined into a single channel. The channel
68  *	represents the combined memory of all the DIMMs. A given channel only
69  *	ever exists on a socket and is bound to a single memory controller.
70  *
71  * CHANNEL ADDRESS
72  *
73  *	This is an address that exists logically on a channel. Each address on a
74  *	channel maps to a corresponding DIMM that exists on that channel. The
75  *	address space on one channel is independent from that on another. This
76  *	means that address 0x1000 can exist on each memory channel in the
77  *	system.
78  *
79  * INTERLEAVE
80  *
81  *	There are several different cases where interleaving occurs on the
82  *	system. For example, addresses may be interleaved across sockets,
83  *	memory channels, or DIMM ranks. When addresses are interleaved, then
84  *	some number of bits in an address are used to select which target to go
85  *	to (usually through a look up table). The effect of interleaving is that
86  *	addresses that are next to one another may not all go to the same
87  *	device. The following image shows a non-interleaving case.
88  *
89  *	0x0fff +-----+             +-----+ 0x7ff
90  *	       |     |\___________/|     |
91  *	       |     |  __________ | (b) |
92  *	       |     | /          \|     |
93  *	0x0800 |=====|=            +-----+ 0x000       +-----+ 0x7ff
94  *	       |     | \______________________________/|     |
95  *	       |     | _______________________________ | (a) |
96  *	       |     |/                               \|     |
97  *	0x0000 +-----+                                 +-----+ 0x000
98  *
99  *	In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
 *	device (a), while addresses 0x0800 to 0x0fff go to device (b).
101  *	However, each range is divided into the same number of components.
102  *
103  *	If instead, we were to look at that with interleaving, what we might say
104  *	is that rather than splitting the range in half, we might say that if
105  *	the address has bit 8 set (0x100), then it goes to (b), otherwise it
106  *	goes to (a). This means that addresses 0x000 to 0x0ff, would go to (a).
107  *	0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a)
 *	again, and then 0x300 to 0x3ff would go back to (b). This would continue
109  *	for a while. This would instead look something more like:
110  *
111  *
112  *      0x0fff +-----+       A: 0x7ff +---------+   B: 0x7ff +---------+
113  *             | (b) |                | e00-eff |            | f00-fff |
114  *      0x0f00 |-----|          0x700 +---------+      0x700 +---------+
115  *             | (a) |                | c00-cff |            | d00-dff |
116  *      0x0e00 ~~~~~~~          0x600 +---------+      0x600 +---------+
117  *               ***                  | a00-aff |            | b00-bff |
118  *      0x0400 ~~~~~~~          0x500 +---------+      0x500 +---------+
119  *             | (b) |                | 800-8ff |            | 900-9ff |
120  *      0x0300 |-----|          0x400 +---------+      0x400 +---------+
121  *             | (a) |                | 600-6ff |            | 700-7ff |
122  *      0x0200 |-----|          0x300 +---------+      0x300 +---------+
123  *             | (b) |                | 400-4ff |            | 500-5ff |
124  *      0x0100 |-----|          0x200 +---------+      0x200 +---------+
125  *             | (a) |                | 200-2ff |            | 300-3ff |
126  *      0x0000 +-----+          0x100 +---------+      0x100 +---------+
127  *                                    | 000-0ff |            | 100-1ff |
128  *                              0x000 +---------+      0x000 +---------+
129  *
130  *	In this example we've performed two-way interleaving. The number of ways
131  *	that something can interleave varies based on what we're interleaving
132  *	between.
133  *
134  * MEMORY CONTROLLER
135  *
 *	A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
 *	memory controllers, usually one or two. Each memory controller supports
 *	a given number of DIMMs, which are divided across multiple channels.
139  *
140  * TARGET ADDRESS DECODER
141  *
142  *	The target address decoder (TAD) is responsible for taking a system
143  *	address and transforming it into a channel address based on the rules
144  *	that are present. Each memory controller has a corresponding TAD. The
145  *	TAD is often contained in a device called a 'Home Agent'.
146  *
147  * SYSTEM ADDRESS DECODER
148  *
149  *	The system address decoder (SAD) is responsible for taking a system
150  *	address and directing it to the right place, whether this be memory or
151  *	otherwise. There is a single memory controller per socket (see
152  *	uts/i86pc/os/cpuid.c) that is shared between all the cores currently.
153  *
154  * NODE IDENTIFIER
155  *
156  *	The node identifier is used to uniquely identify an element in the
157  *	various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
158  *	definition of 'die'). One can roughly think about this as a unique
159  *	identifier for the socket itself. In general, the primary node ID for a
160  *	socket should map to the socket APIC ID.
161  *
162  * Finding Devices
163  * ---------------
164  *
165  * There is a bit of a chicken and egg problem on Intel systems and in the
166  * device driver interface. The information that we need in the system is spread
167  * out amongst a large number of different PCI devices that the processor
168  * exposes. The number of such devices can vary based on the processor
169  * generation and the specific SKU in the processor. To deal with this, we break
170  * the driver into two different components: a stub driver and the full driver.
171  *
172  * The stub driver has aliases for all known PCI devices that we might attach to
173  * in a given generation on the system. This driver is called 'imcstub'. When a
174  * stub attaches, it just registers itself with the main driver, upon which it
175  * has a module dependency.
176  *
177  * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it
178  * kicks off a scan of the device tree which takes place in a task queue. Once
179  * there, it determines the number of devices that it expects to exist by
180  * walking the tree and comparing it against the generation-specific table.
181  *
182  * If all devices are found, we'll go ahead and read through all the devices and
183  * build a map of all the information we need to understand the topology of the
184  * system and to be able to decode addresses. We do this here, because we can be
185  * asked to perform decoding in dangerous contexts (after taking an MCE, panic,
186  * etc) where we don't want to have to rely on the broader kernel functioning at
187  * this point in time.
188  *
189  * Once our topology is built, we'll create minor nodes which are used by the
190  * fault management architecture to query for information and register our
191  * decoding functionality with the kernel.
192  *
193  * PCI Numbering
194  * -------------
195  *
196  * For each device that we care about, Intel defines the device and function
197  * that we can expect to find the information and PCI configuration space
198  * registers that we care about at. However, the PCI bus is not well defined.
199  * Devices that are on the same socket use the same set of bus numbers; however,
 * some sockets have multiple bus numbers that they'll use to represent
201  * different classes. These bus numbers are programmed by systems firmware as
202  * part of powering on the system. This means, that we need the ability to
203  * map together these disparate ranges ourselves.
204  *
205  * There is a device called a utility box (UBOX), which exists per-socket and
206  * maps the different sockets together. We use this to determine which devices
207  * correspond to which sockets.
208  *
209  * Mapping Sockets
210  * ---------------
211  *
212  * Another wrinkle is that the way that the OS sees the numbering of the CPUs is
213  * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
214  * information). However, to map to the corresponding socket, we need to look at
215  * the socket's node ID. The order of PCI buses in the system is not required to
216  * have any relation to the socket ID. Therefore, we have to have yet another
217  * indirection table in the imc_t.
218  *
219  * Exposing Data
220  * -------------
221  *
222  * We expose topology data to FMA using the OS-private memory controller
223  * interfaces. By creating minor nodes of the type, 'ddi_mem_ctrl', there are a
224  * number of specific interfaces that we can then implement. The ioctl API asks
225  * us for a snapshot of data, which basically has us go through and send an
226  * nvlist_t to userland. This nvlist_t is constructed as part of the scan
227  * process. This nvlist uses the version 1 format, which more explicitly encodes
228  * the topology in a series of nested nvlists.
229  *
230  * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
231  * decoder and ask it to perform decoding.
232  *
233  * Decoding Addresses
234  * ------------------
235  *
236  * The decoding logic can be found in common/imc/imc_decode.c. This file is
237  * shared between the kernel and userland to allow for easier testing and
238  * additional flexibility in operation. The decoding process happens in a few
239  * different phases.
240  *
241  * The first phase, is to determine which memory controller on which socket is
242  * responsible for this data. To determine this, we use the system address
243  * decoder and walk the rules, looking for the correct target. There are various
244  * manipulations to the address that exist which are used to determine which
245  * index we use. The way that we interpret the output of the rule varies
246  * somewhat based on the generation. Sandy Bridge just has a node ID which
247  * points us to the socket with its single IMC. On Ivy Bridge through Broadwell,
248  * the memory controller to use is also encoded in part of the node ID. Finally,
249  * on Skylake, the SAD tells us which socket to look at. The socket in question
250  * then has a routing table which tells us which channel on which memory
251  * controller that is local to that socket.
252  *
253  * Once we have the target memory controller, we walk the list of target address
254  * decoder rules. These rules can help tell us which channel we care about
255  * (which is required on Sandy Bridge through Broadwell) and then describe some
256  * amount of the interleaving rules which are used to turn the system address
257  * into a channel address.
258  *
259  * Once we know the channel and the channel address, we walk the rank interleave
260  * rules which help us determine which DIMM and the corresponding rank on it
261  * that the corresponding channel address is on. It also has logic that we need
262  * to use to determine how to transform a channel address into an address on
263  * that specific rank. Once we have that, then the initial decoding is done.
264  *
265  * The logic in imc_decode.c is abstracted away from the broader kernel CMI
266  * logic.  This is on purpose and allows us not only an easier time unit testing
267  * the logic, but also allows us to express more high fidelity errors that are
268  * translated into a much smaller subset. This logic is exercised in the
269  * 'imc_test' program which is built in 'test/os-tests/tests/imc'.
270  *
271  * Limitations
272  * -----------
273  *
274  * Currently, this driver has the following limitations:
275  *
276  *  o It doesn't decode the row and column addresses.
277  *  o It doesn't encode from a DIMM address to a system address.
278  *  o It doesn't properly support lockstep and mirroring modes on Sandy Bridge -
279  *    Broadwell platforms.
280  *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
281  *    platforms.
282  *  o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs.
283  *  o It doesn't know how to decode three way channel interleaving.
284  *
285  * None of these are intrinsic problems to the driver, it's mostly a matter of
286  * having proper documentation and testing.
287  */
288 
289 #include <sys/modctl.h>
290 #include <sys/conf.h>
291 #include <sys/devops.h>
292 #include <sys/ddi.h>
293 #include <sys/sunddi.h>
294 #include <sys/types.h>
295 #include <sys/file.h>
296 #include <sys/errno.h>
297 #include <sys/open.h>
298 #include <sys/cred.h>
299 #include <sys/pci.h>
300 #include <sys/sysmacros.h>
301 #include <sys/avl.h>
302 #include <sys/stat.h>
303 #include <sys/policy.h>
304 
305 #include <sys/cpu_module.h>
306 #include <sys/mc.h>
307 #include <sys/mc_intel.h>
308 
309 #include "imc.h"
310 
311 /*
312  * These tables contain generational data that varies between processor
313  * generation such as the maximum number of sockets, memory controllers, and the
314  * offsets of the various registers.
315  */
316 
317 static const imc_gen_data_t imc_gen_data_snb = {
318 	.igd_max_sockets = 4,
319 	.igd_max_imcs = 2,
320 	.igd_max_channels = 4,
321 	.igd_max_dimms = 3,
322 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
323 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
324 	    IMC_REG_MC_MTR2 },
325 	.igd_mcmtr_offset = 0x7c,
326 	.igd_tolm_offset = 0x80,
327 	.igd_tohm_low_offset = 0x84,
328 	.igd_sad_dram_offset = 0x80,
329 	.igd_sad_ndram_rules = 10,
330 	.igd_sad_nodeid_offset = 0x40,
331 	.igd_tad_nrules = 12,
332 	.igd_tad_rule_offset = 0x40,
333 	.igd_tad_chan_offset = 0x90,
334 	.igd_tad_sysdef = 0x80,
335 	.igd_tad_sysdef2 = 0x84,
336 	.igd_mc_mirror = 0xac,
337 	.igd_rir_nways = 5,
338 	.igd_rir_way_offset = 0x108,
339 	.igd_rir_nileaves = 8,
340 	.igd_rir_ileave_offset = 0x120,
341 	.igd_ubox_cpubusno_offset = 0xd0,
342 };
343 
344 static const imc_gen_data_t imc_gen_data_ivb = {
345 	.igd_max_sockets = 4,
346 	.igd_max_imcs = 2,
347 	.igd_max_channels = 4,
348 	.igd_max_dimms = 3,
349 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
350 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
351 	    IMC_REG_MC_MTR2 },
352 	.igd_mcmtr_offset = 0x7c,
353 	.igd_tolm_offset = 0x80,
354 	.igd_tohm_low_offset = 0x84,
355 	.igd_sad_dram_offset = 0x60,
356 	.igd_sad_ndram_rules = 20,
357 	.igd_sad_nodeid_offset = 0x40,
358 	.igd_tad_nrules = 12,
359 	.igd_tad_rule_offset = 0x40,
360 	.igd_tad_chan_offset = 0x90,
361 	.igd_tad_sysdef = 0x80,
362 	.igd_tad_sysdef2 = 0x84,
363 	.igd_mc_mirror = 0xac,
364 	.igd_rir_nways = 5,
365 	.igd_rir_way_offset = 0x108,
366 	.igd_rir_nileaves = 8,
367 	.igd_rir_ileave_offset = 0x120,
368 	.igd_ubox_cpubusno_offset = 0xd0,
369 };
370 
371 static const imc_gen_data_t imc_gen_data_has_brd = {
372 	.igd_max_sockets = 4,
373 	.igd_max_imcs = 2,
374 	.igd_max_channels = 4,
375 	.igd_max_dimms = 3,
376 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
377 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
378 	    IMC_REG_MC_MTR2 },
379 	.igd_mcmtr_offset = 0x7c,
380 	.igd_tolm_offset = 0xd0,
381 	.igd_tohm_low_offset = 0xd4,
382 	.igd_tohm_hi_offset = 0xd8,
383 	.igd_sad_dram_offset = 0x60,
384 	.igd_sad_ndram_rules = 20,
385 	.igd_sad_nodeid_offset = 0x40,
386 	.igd_tad_nrules = 12,
387 	.igd_tad_rule_offset = 0x40,
388 	.igd_tad_chan_offset = 0x90,
389 	.igd_tad_sysdef = 0x80,
390 	.igd_tad_sysdef2 = 0x84,
391 	.igd_mc_mirror = 0xac,
392 	.igd_rir_nways = 5,
393 	.igd_rir_way_offset = 0x108,
394 	.igd_rir_nileaves = 8,
395 	.igd_rir_ileave_offset = 0x120,
396 	.igd_ubox_cpubusno_offset = 0xd0,
397 };
398 
399 static const imc_gen_data_t imc_gen_data_skx = {
400 	.igd_max_sockets = 8,
401 	.igd_max_imcs = 2,
402 	.igd_max_channels = 3,
403 	.igd_max_dimms = 2,
404 	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
405 	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
406 	.igd_mcmtr_offset = 0x87c,
407 	.igd_topo_offset = 0x88,
408 	.igd_tolm_offset = 0xd0,
409 	.igd_tohm_low_offset = 0xd4,
410 	.igd_tohm_hi_offset = 0xd8,
411 	.igd_sad_dram_offset = 0x60,
412 	.igd_sad_ndram_rules = 24,
413 	.igd_sad_nodeid_offset = 0xc0,
414 	.igd_tad_nrules = 8,
415 	.igd_tad_rule_offset = 0x850,
416 	.igd_tad_chan_offset = 0x90,
417 	.igd_rir_nways = 4,
418 	.igd_rir_way_offset = 0x108,
419 	.igd_rir_nileaves = 4,
420 	.igd_rir_ileave_offset = 0x120,
421 	.igd_ubox_cpubusno_offset = 0xcc,
422 };
423 
424 /*
425  * This table contains all of the devices that we're looking for from a stub
426  * perspective. These are organized by generation. Different generations behave
427  * in slightly different ways. For example, Sandy Bridge through Broadwell use
428  * unique PCI IDs for each PCI device/function combination that appears. Whereas
429  * Skylake based systems use the same PCI ID; however, different device/function
430  * values indicate that the IDs are used for different purposes.
431  */
432 /* BEGIN CSTYLED */
433 static const imc_stub_table_t imc_stub_table[] = {
434 	/* Sandy Bridge */
435 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
436 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" },
437 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
438 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
439 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
440 	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
441 	{ IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
442 	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
443 	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
444 	{ IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
445 	{ IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
446 	{ IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
447 	/* Ivy Bridge */
448 	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
449 	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
450 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
451 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
452 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
453 	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
454 	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
455 	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
456 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" },
457 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" },
458 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" },
459 	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" },
460 	{ IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
461 	{ IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
462 	{ IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
463 	{ IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
464 	{ IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
465 	{ IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
466 	{ IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
467 	/* Haswell */
468 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
469 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
470 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
471 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
472 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
473 	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
474 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
475 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
476 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
477 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
478 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
479 	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
480 	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
481 	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
482 	{ IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Vritualization" },
483 	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
484 	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
485 	{ IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
486 	{ IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
487 	/* Broadwell Devices */
488 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
489 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
490 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
491 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
492 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
493 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
494 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
495 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
496 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
497 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
498 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
499 	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
500 	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
501 	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
502 	{ IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Vritualization" },
503 	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
504 	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
505 	{ IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
506 	{ IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
507 	/* Skylake and Cascade Lake Devices */
508 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
509 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" },
510 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
511 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" },
512 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
513 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
514 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
515 	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
516 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
517 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
518 	{ IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },
519 
520 	/*
521 	 * There is one SAD MC Route type device per core! Because of this a
522 	 * wide array of device and functions are allocated. For now, we list
523 	 * all 28 of them out.
524 	 */
525 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
526 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
527 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
528 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
529 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
530 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
531 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
532 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
533 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
534 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
535 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
536 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
537 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
538 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
539 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
540 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
541 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
542 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
543 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
544 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
545 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
546 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
547 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
548 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
549 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
550 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
551 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
552 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
553 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
554 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
555 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
556 	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },
557 
558 	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
559 	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
560 };
561 /* END CSTYLED */
562 
563 #define	IMC_PCI_VENDOR_INTC	0x8086
564 
565 /*
566  * Our IMC data is global and statically set up during a combination of
567  * _init(9E) and attach(9E). While we have a module dependency between the PCI
568  * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
569  * guarantee that the imc driver has finished attaching. As such we make sure
570  * that it can operate without it being attached in any way.
571  */
572 static imc_t *imc_data = NULL;
573 
574 /*
575  * By default we should not allow the stubs to detach as we don't have a good
576  * way of forcing them to attach again. This is provided in case someone does
577  * want to allow the driver to unload.
578  */
579 int imc_allow_detach = 0;
580 
581 static void
582 imc_set_gen_data(imc_t *imc)
583 {
584 	switch (imc->imc_gen) {
585 	case IMC_GEN_SANDY:
586 		imc->imc_gen_data = &imc_gen_data_snb;
587 		break;
588 	case IMC_GEN_IVY:
589 		imc->imc_gen_data = &imc_gen_data_ivb;
590 		break;
591 	case IMC_GEN_HASWELL:
592 	case IMC_GEN_BROADWELL:
593 		imc->imc_gen_data = &imc_gen_data_has_brd;
594 		break;
595 	case IMC_GEN_SKYLAKE:
596 		imc->imc_gen_data = &imc_gen_data_skx;
597 		break;
598 	default:
599 		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
600 		    "set to unknown generation: %u", imc->imc_gen);
601 	}
602 }
603 
604 /*
605  * If our device (dev_info_t) does not have a non-zero unit address, then
606  * devfsadmd will not pay attention to us at all. Therefore we need to set the
607  * unit address below, before we create minor nodes.
608  *
609  * The rest of the system expects us to have one minor node per socket. The
610  * minor node ID should be the ID of the socket.
611  */
612 static boolean_t
613 imc_create_minors(imc_t *imc)
614 {
615 	uint_t i;
616 
617 	ddi_set_name_addr(imc->imc_dip, "1");
618 	for (i = 0; i < imc->imc_nsockets; i++) {
619 		char buf[MAXNAMELEN];
620 
621 		if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
622 		    sizeof (buf)) {
623 			goto fail;
624 		}
625 
626 		if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
627 		    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
628 			dev_err(imc->imc_dip, CE_WARN, "failed to create "
629 			    "minor node %u: %s", i, buf);
630 			goto fail;
631 		}
632 	}
633 	return (B_TRUE);
634 
635 fail:
636 	ddi_remove_minor_node(imc->imc_dip, NULL);
637 	return (B_FALSE);
638 }
639 
640 /*
641  * Check the current MC route value for this SAD. On Skylake systems there is
642  * one per core. Every core should agree. If not, we will not trust the SAD
643  * MCROUTE values and this will cause system address decoding to fail on
644  * skylake.
645  */
646 static void
647 imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
648 {
649 	uint32_t val;
650 
651 	val = pci_config_get32(stub->istub_cfgspace,
652 	    IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
653 	if (val == PCI_EINVAL32) {
654 		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
655 		return;
656 	}
657 
658 	if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
659 		sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
660 		sad->isad_mcroute.ismc_raw_mcroute = val;
661 		return;
662 	}
663 
664 	/*
665 	 * Occasionally we see MC ROUTE table entries with a value of zero.
666 	 * We should ignore those for now.
667 	 */
668 	if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
669 		dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
670 		    "with socket. SAD has val 0x%x, system has %x\n",
671 		    val, sad->isad_mcroute.ismc_raw_mcroute);
672 		sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
673 	}
674 }
675 
676 /*
677  * On Skylake, many of the devices that we care about are on separate PCI Buses.
678  * These can be mapped together by the DECS register. However, we need to know
679  * how to map different buses together so that we can more usefully associate
680  * information. The set of buses is all present in the DECS register. We'll
681  * effectively assign sockets to buses. This is also still something that comes
682  * up on pre-Skylake systems as well.
683  */
684 static boolean_t
685 imc_map_buses(imc_t *imc)
686 {
687 	imc_stub_t *stub;
688 	uint_t nsock;
689 
690 	/*
691 	 * Find the UBOX_DECS registers so we can establish socket mappings. On
692 	 * Skylake, there are three different sets of buses that we need to
693 	 * cover all of our devices, while there are only two before that.
694 	 */
695 	for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
696 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
697 		uint32_t busno;
698 
699 		if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
700 			continue;
701 		}
702 
703 		busno = pci_config_get32(stub->istub_cfgspace,
704 		    imc->imc_gen_data->igd_ubox_cpubusno_offset);
705 		if (busno == PCI_EINVAL32) {
706 			dev_err(imc->imc_dip, CE_WARN, "failed to read "
707 			    "UBOX_DECS CPUBUSNO0: invalid PCI read");
708 			return (B_FALSE);
709 		}
710 
711 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
712 			imc->imc_sockets[nsock].isock_nbus = 3;
713 			imc->imc_sockets[nsock].isock_bus[0] =
714 			    IMC_UBOX_CPUBUSNO_0(busno);
715 			imc->imc_sockets[nsock].isock_bus[1] =
716 			    IMC_UBOX_CPUBUSNO_1(busno);
717 			imc->imc_sockets[nsock].isock_bus[2] =
718 			    IMC_UBOX_CPUBUSNO_2(busno);
719 		} else {
720 			imc->imc_sockets[nsock].isock_bus[0] =
721 			    IMC_UBOX_CPUBUSNO_0(busno);
722 			imc->imc_sockets[nsock].isock_bus[1] =
723 			    IMC_UBOX_CPUBUSNO_1(busno);
724 			imc->imc_sockets[nsock].isock_nbus = 2;
725 		}
726 		nsock++;
727 	}
728 	imc->imc_nsockets = nsock;
729 
730 	return (B_TRUE);
731 }
732 
733 /*
734  * For a given stub that we've found, map it to its corresponding socket based
735  * on the PCI bus that it has.
736  */
737 static imc_socket_t *
738 imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
739 {
740 	uint_t i;
741 
742 	for (i = 0; i < imc->imc_nsockets; i++) {
743 		uint_t bus;
744 
745 		for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
746 			if (imc->imc_sockets[i].isock_bus[bus] ==
747 			    stub->istub_bus) {
748 				return (&imc->imc_sockets[i]);
749 			}
750 		}
751 	}
752 
753 	return (NULL);
754 }
755 
756 static boolean_t
757 imc_map_stubs(imc_t *imc)
758 {
759 	imc_stub_t *stub;
760 
761 	if (!imc_map_buses(imc)) {
762 		return (B_FALSE);
763 	}
764 
765 	stub = avl_first(&imc->imc_stubs);
766 	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
767 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
768 		imc_socket_t *sock = imc_map_find_socket(imc, stub);
769 
770 		if (sock == NULL) {
771 			dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
772 			    "PCI%x,%x with bdf %u/%u/%u that does not match a "
773 			    "known PCI bus for any of %u sockets",
774 			    stub->istub_table->imcs_type, stub->istub_vid,
775 			    stub->istub_did, stub->istub_bus, stub->istub_dev,
776 			    stub->istub_func, imc->imc_nsockets);
777 			continue;
778 		}
779 
780 		/*
781 		 * We don't have to worry about duplicates here. We check to
782 		 * make sure that we have unique bdfs here.
783 		 */
784 		switch (stub->istub_table->imcs_type) {
785 		case IMC_TYPE_MC0_M2M:
786 			sock->isock_imcs[0].icn_m2m = stub;
787 			break;
788 		case IMC_TYPE_MC1_M2M:
789 			sock->isock_imcs[1].icn_m2m = stub;
790 			break;
791 		case IMC_TYPE_MC0_MAIN0:
792 			sock->isock_nimc++;
793 			sock->isock_imcs[0].icn_main0 = stub;
794 
795 			/*
796 			 * On Skylake, the MAIN0 does double duty as channel
797 			 * zero and as the TAD.
798 			 */
799 			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
800 				sock->isock_imcs[0].icn_nchannels++;
801 				sock->isock_imcs[0].icn_channels[0].ich_desc =
802 				    stub;
803 				sock->isock_tad[0].itad_stub = stub;
804 				sock->isock_ntad++;
805 			}
806 			break;
807 		case IMC_TYPE_MC0_MAIN1:
808 			sock->isock_imcs[0].icn_main1 = stub;
809 			break;
810 		case IMC_TYPE_MC1_MAIN0:
811 			sock->isock_nimc++;
812 			sock->isock_imcs[1].icn_main0 = stub;
813 
814 			/*
815 			 * On Skylake, the MAIN0 does double duty as channel
816 			 * zero and as the TAD.
817 			 */
818 			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
819 				sock->isock_imcs[1].icn_nchannels++;
820 				sock->isock_imcs[1].icn_channels[0].ich_desc =
821 				    stub;
822 				sock->isock_tad[1].itad_stub = stub;
823 				sock->isock_ntad++;
824 			}
825 			break;
826 		case IMC_TYPE_MC1_MAIN1:
827 			sock->isock_imcs[1].icn_main1 = stub;
828 			break;
829 		case IMC_TYPE_MC0_CHANNEL0:
830 			sock->isock_imcs[0].icn_nchannels++;
831 			sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
832 			break;
833 		case IMC_TYPE_MC0_CHANNEL1:
834 			sock->isock_imcs[0].icn_nchannels++;
835 			sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
836 			break;
837 		case IMC_TYPE_MC0_CHANNEL2:
838 			sock->isock_imcs[0].icn_nchannels++;
839 			sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
840 			break;
841 		case IMC_TYPE_MC0_CHANNEL3:
842 			sock->isock_imcs[0].icn_nchannels++;
843 			sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
844 			break;
845 		case IMC_TYPE_MC1_CHANNEL0:
846 			sock->isock_imcs[1].icn_nchannels++;
847 			sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
848 			break;
849 		case IMC_TYPE_MC1_CHANNEL1:
850 			sock->isock_imcs[1].icn_nchannels++;
851 			sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
852 			break;
853 		case IMC_TYPE_MC1_CHANNEL2:
854 			sock->isock_imcs[1].icn_nchannels++;
855 			sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
856 			break;
857 		case IMC_TYPE_MC1_CHANNEL3:
858 			sock->isock_imcs[1].icn_nchannels++;
859 			sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
860 			break;
861 		case IMC_TYPE_SAD_DRAM:
862 			sock->isock_sad.isad_dram = stub;
863 			break;
864 		case IMC_TYPE_SAD_MMIO:
865 			sock->isock_sad.isad_mmio = stub;
866 			break;
867 		case IMC_TYPE_SAD_MISC:
868 			sock->isock_sad.isad_tolh = stub;
869 			break;
870 		case IMC_TYPE_VTD_MISC:
871 			/*
872 			 * Some systems have multiple VT-D Misc. entry points
873 			 * in the system. In this case, only use the first one
874 			 * we find.
875 			 */
876 			if (imc->imc_gvtd_misc == NULL) {
877 				imc->imc_gvtd_misc = stub;
878 			}
879 			break;
880 		case IMC_TYPE_SAD_MCROUTE:
881 			ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
882 			imc_mcroute_check(imc, &sock->isock_sad, stub);
883 			break;
884 		case IMC_TYPE_UBOX:
885 			sock->isock_ubox = stub;
886 			break;
887 		case IMC_TYPE_HA0:
888 			sock->isock_ntad++;
889 			sock->isock_tad[0].itad_stub = stub;
890 			break;
891 		case IMC_TYPE_HA1:
892 			sock->isock_ntad++;
893 			sock->isock_tad[1].itad_stub = stub;
894 			break;
895 		case IMC_TYPE_UBOX_CPUBUSNO:
896 			sock->isock_cpubusno = stub;
897 			break;
898 		default:
899 			/*
900 			 * Attempt to still attach if we can.
901 			 */
902 			dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
903 			    "IMC type (%u) on PCI %x,%x",
904 			    stub->istub_table->imcs_type,
905 			    stub->istub_vid, stub->istub_did);
906 			break;
907 		}
908 	}
909 
910 	return (B_TRUE);
911 }
912 
913 /*
914  * Go through and fix up various aspects of the stubs mappings on systems. The
915  * following are a list of what we need to fix up:
916  *
917  *  1. On Haswell and newer systems, there is only one global VT-d device. We
918  *     need to go back and map that to all of the per-socket imc_sad_t entries.
919  */
920 static void
921 imc_fixup_stubs(imc_t *imc)
922 {
923 	if (imc->imc_gen >= IMC_GEN_HASWELL) {
924 		uint_t i;
925 
926 		for (i = 0; i < imc->imc_nsockets; i++) {
927 			ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
928 			    ==, NULL);
929 			imc->imc_sockets[i].isock_sad.isad_tolh =
930 			    imc->imc_gvtd_misc;
931 		}
932 	}
933 }
934 
935 /*
936  * In the wild we've hit a few odd cases where not all devices are exposed that
937  * we might expect by firmware. In particular we've seen and validate the
938  * following cases:
939  *
940  *  o We don't find all of the channel devices that we expect, e.g. we have the
941  *    stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW
942  *    with an E5-2630v3.
943  */
944 static boolean_t
945 imc_validate_stubs(imc_t *imc)
946 {
947 	for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
948 		imc_socket_t *socket = &imc->imc_sockets[sock];
949 
950 		for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
951 			imc_mc_t *mcp = &socket->isock_imcs[mc];
952 
953 			for (uint_t chan = 0; chan < mcp->icn_nchannels;
954 			    chan++) {
955 				if (mcp->icn_channels[chan].ich_desc == NULL) {
956 					dev_err(imc->imc_dip, CE_WARN,
957 					    "!missing device for socket %u/"
958 					    "imc %u/channel %u", sock, mc,
959 					    chan);
960 					return (B_FALSE);
961 				}
962 			}
963 		}
964 	}
965 
966 	return (B_TRUE);
967 }
968 
969 /*
970  * Attempt to map all of the discovered sockets to the corresponding APIC based
971  * socket. We do these mappings by getting the node id of the socket and
972  * adjusting it to make sure that no home agent is present in it. We use the
973  * UBOX to avoid any home agent related bits that are present in other
974  * registers.
975  */
976 static void
977 imc_map_sockets(imc_t *imc)
978 {
979 	uint_t i;
980 
981 	for (i = 0; i < imc->imc_nsockets; i++) {
982 		uint32_t nodeid;
983 		ddi_acc_handle_t h;
984 
985 		h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
986 		nodeid = pci_config_get32(h,
987 		    imc->imc_gen_data->igd_sad_nodeid_offset);
988 		if (nodeid == PCI_EINVAL32) {
989 			imc->imc_sockets[i].isock_valid |=
990 			    IMC_SOCKET_V_BAD_NODEID;
991 			continue;
992 		}
993 
994 		imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid);
995 		imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
996 	}
997 }
998 
999 /*
1000  * Decode the MTR, accounting for variances between processor generations.
1001  */
1002 static void
1003 imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
1004 {
1005 	uint8_t disable;
1006 
1007 	/*
1008 	 * Check present first, before worrying about anything else.
1009 	 */
1010 	if (imc->imc_gen < IMC_GEN_SKYLAKE &&
1011 	    IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
1012 		dimm->idimm_present = B_FALSE;
1013 		return;
1014 	} else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
1015 	    IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
1016 		dimm->idimm_present = B_FALSE;
1017 		return;
1018 	}
1019 
1020 	dimm->idimm_present = B_TRUE;
1021 	dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
1022 	if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
1023 	    dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
1024 		dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
1025 	}
1026 
1027 	dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
1028 	if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
1029 	    dimm->idimm_nrows > IMC_MTR_RA_MAX) {
1030 		dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
1031 	}
1032 
1033 	/*
1034 	 * Determine Density, this information is not present on Sandy Bridge.
1035 	 */
1036 	switch (imc->imc_gen) {
1037 	case IMC_GEN_IVY:
1038 		dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
1039 		break;
1040 	case IMC_GEN_HASWELL:
1041 	case IMC_GEN_BROADWELL:
1042 		switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
1043 		case 0:
1044 		default:
1045 			dimm->idimm_density = 0;
1046 			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1047 			break;
1048 		case 1:
1049 			dimm->idimm_density = 2;
1050 			break;
1051 		case 2:
1052 			dimm->idimm_density = 4;
1053 			break;
1054 		case 3:
1055 			dimm->idimm_density = 8;
1056 			break;
1057 		}
1058 		break;
1059 	case IMC_GEN_SKYLAKE:
1060 		switch (IMC_MTR_DENSITY_SKX(mtr)) {
1061 		case 0:
1062 		default:
1063 			dimm->idimm_density = 0;
1064 			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
1065 			break;
1066 		case 1:
1067 			dimm->idimm_density = 2;
1068 			break;
1069 		case 2:
1070 			dimm->idimm_density = 4;
1071 			break;
1072 		case 3:
1073 			dimm->idimm_density = 8;
1074 			break;
1075 		case 4:
1076 			dimm->idimm_density = 16;
1077 			break;
1078 		case 5:
1079 			dimm->idimm_density = 12;
1080 			break;
1081 		}
1082 		break;
1083 	case IMC_GEN_UNKNOWN:
1084 	case IMC_GEN_SANDY:
1085 		dimm->idimm_density = 0;
1086 		break;
1087 	}
1088 
1089 	/*
1090 	 * The values of width are the same on IVY->SKX, but the bits are
1091 	 * different. This doesn't exist on SNB.
1092 	 */
1093 	if (imc->imc_gen > IMC_GEN_SANDY) {
1094 		uint8_t width;
1095 
1096 		if (imc->imc_gen >= IMC_GEN_BROADWELL) {
1097 			width = IMC_MTR_WIDTH_BRD_SKX(mtr);
1098 		} else {
1099 			width = IMC_MTR_WIDTH_IVB_HAS(mtr);
1100 		}
1101 		switch (width) {
1102 		case 0:
1103 			dimm->idimm_width = 4;
1104 			break;
1105 		case 1:
1106 			dimm->idimm_width = 8;
1107 			break;
1108 		case 2:
1109 			dimm->idimm_width = 16;
1110 			break;
1111 		default:
1112 			dimm->idimm_width = 0;
1113 			dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
1114 			break;
1115 		}
1116 	} else {
1117 		dimm->idimm_width = 0;
1118 	}
1119 
1120 	dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
1121 	switch (imc->imc_gen) {
1122 	case IMC_GEN_HASWELL:
1123 	case IMC_GEN_BROADWELL:
1124 	case IMC_GEN_SKYLAKE:
1125 		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
1126 			dimm->idimm_nranks = 0;
1127 			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1128 		}
1129 		break;
1130 	default:
1131 		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
1132 			dimm->idimm_nranks = 0;
1133 			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
1134 		}
1135 	}
1136 
1137 	disable = IMC_MTR_RANK_DISABLE(mtr);
1138 	dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
1139 	dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
1140 	dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
1141 	dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;
1142 
1143 	/*
1144 	 * Only Haswell and later have this information.
1145 	 */
1146 	if (imc->imc_gen >= IMC_GEN_HASWELL) {
1147 		dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
1148 		dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
1149 		dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
1150 		if (dimm->idimm_3dsranks != 0) {
1151 			dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
1152 		}
1153 	}
1154 
1155 
1156 	if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
1157 		dimm->idimm_nbanks = 16;
1158 	} else {
1159 		dimm->idimm_nbanks = 8;
1160 	}
1161 
1162 	/*
1163 	 * To calculate the DIMM size we need first take the number of rows and
1164 	 * columns. This gives us the number of slots per chip. In a given rank
1165 	 * there are nbanks of these. There are nrank entries of those. Each of
1166 	 * these slots can fit a byte.
1167 	 */
1168 	dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
1169 	    (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
1170 }
1171 
1172 static void
1173 imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
1174 {
1175 	uint_t i;
1176 
1177 	/*
1178 	 * There's one register for each DIMM that might be present, we always
1179 	 * read that information to determine information about the DIMMs.
1180 	 */
1181 	chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
1182 	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1183 		uint32_t mtr;
1184 		imc_dimm_t *dimm = &chan->ich_dimms[i];
1185 
1186 		bzero(dimm, sizeof (imc_dimm_t));
1187 		mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
1188 		    imc->imc_gen_data->igd_mtr_offsets[i]);
1189 		dimm->idimm_mtr = mtr;
1190 		/*
1191 		 * We don't really expect to get a bad PCIe read. However, if we
1192 		 * do, treat that for the moment as though the DIMM is bad.
1193 		 */
1194 		if (mtr == PCI_EINVAL32) {
1195 			dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
1196 			continue;
1197 		}
1198 
1199 		imc_decode_mtr(imc, icn, dimm, mtr);
1200 	}
1201 }
1202 
1203 static boolean_t
1204 imc_fill_controller(imc_t *imc, imc_mc_t *icn)
1205 {
1206 	uint32_t mcmtr;
1207 
1208 	mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
1209 	    imc->imc_gen_data->igd_mcmtr_offset);
1210 	if (mcmtr == PCI_EINVAL32) {
1211 		icn->icn_invalid = B_TRUE;
1212 		return (B_FALSE);
1213 	}
1214 
1215 	icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
1216 	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1217 		icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
1218 	} else {
1219 		icn->icn_lockstep = B_FALSE;
1220 	}
1221 
1222 	icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;
1223 
1224 	/*
1225 	 * SNB and IVB only support DDR3. Haswell and Broadwell may support
1226 	 * DDR4, depends on the SKU. Skylake only supports DDR4.
1227 	 */
1228 	switch (imc->imc_gen) {
1229 	case IMC_GEN_SANDY:
1230 	case IMC_GEN_IVY:
1231 		icn->icn_dimm_type = IMC_DIMM_DDR3;
1232 		break;
1233 	case IMC_GEN_HASWELL:
1234 	case IMC_GEN_BROADWELL:
1235 		if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
1236 			icn->icn_dimm_type = IMC_DIMM_DDR4;
1237 		} else {
1238 			icn->icn_dimm_type = IMC_DIMM_DDR3;
1239 		}
1240 		break;
1241 	default:
1242 		/*
1243 		 * Skylake and on are all DDR4.
1244 		 */
1245 		icn->icn_dimm_type = IMC_DIMM_DDR4;
1246 		break;
1247 	}
1248 
1249 	if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
1250 		icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
1251 		    imc->imc_gen_data->igd_topo_offset);
1252 	}
1253 
1254 	return (B_TRUE);
1255 }
1256 
1257 /*
1258  * Walk the IMC data and fill in the information on DIMMs and the memory
1259  * controller configurations.
1260  */
1261 static void
1262 imc_fill_data(imc_t *imc)
1263 {
1264 	uint_t csock, cmc, cchan;
1265 
1266 	for (csock = 0; csock < imc->imc_nsockets; csock++) {
1267 		imc_socket_t *sock = &imc->imc_sockets[csock];
1268 
1269 		for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
1270 			imc_mc_t *icn = &sock->isock_imcs[cmc];
1271 
1272 			if (!imc_fill_controller(imc, icn))
1273 				continue;
1274 
1275 			for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
1276 				imc_fill_dimms(imc, icn,
1277 				    &icn->icn_channels[cchan]);
1278 			}
1279 		}
1280 	}
1281 }
1282 
1283 static nvlist_t *
1284 imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
1285 {
1286 	nvlist_t *nvl;
1287 
1288 	nvl = fnvlist_alloc();
1289 	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
1290 	    dimm->idimm_present);
1291 	if (!dimm->idimm_present) {
1292 		return (nvl);
1293 	}
1294 
1295 	fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
1296 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
1297 	    dimm->idimm_ncolumns);
1298 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
1299 	    dimm->idimm_nrows);
1300 
1301 	if (imc->imc_gen > IMC_GEN_SANDY) {
1302 		fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
1303 		    dimm->idimm_density * (1ULL << 30));
1304 		fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
1305 		    dimm->idimm_width);
1306 	}
1307 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
1308 	    dimm->idimm_nranks);
1309 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
1310 	    dimm->idimm_nbanks);
1311 	fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
1312 	    dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);
1313 
1314 	if (imc->imc_gen >= IMC_GEN_HASWELL) {
1315 		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
1316 		    dimm->idimm_hdrl);
1317 		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
1318 		    dimm->idimm_hdrl_parity);
1319 		if (dimm->idimm_3dsranks > 0) {
1320 			fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
1321 			    dimm->idimm_3dsranks);
1322 		}
1323 	}
1324 
1325 	return (nvl);
1326 }
1327 
1328 static nvlist_t *
1329 imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
1330 {
1331 	nvlist_t *nvl;
1332 	nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
1333 	uint_t i;
1334 
1335 	nvl = fnvlist_alloc();
1336 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
1337 	    imc->imc_gen_data->igd_max_dimms);
1338 	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
1339 		dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
1340 	}
1341 
1342 	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
1343 	    dimms, i);
1344 
1345 	for (; i > 0; i--) {
1346 		nvlist_free(dimms[i-1]);
1347 	}
1348 
1349 	return (nvl);
1350 }
1351 
1352 static nvlist_t *
1353 imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
1354 {
1355 	nvlist_t *nvl;
1356 	nvlist_t *channels[IMC_MAX_CHANPERMC];
1357 	uint_t i;
1358 
1359 	nvl = fnvlist_alloc();
1360 	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
1361 	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
1362 	    icn->icn_ecc);
1363 	if (icn->icn_lockstep) {
1364 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1365 		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
1366 	} else {
1367 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
1368 		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
1369 
1370 	}
1371 
1372 	if (icn->icn_closed) {
1373 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1374 		    MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
1375 	} else {
1376 		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
1377 		    MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
1378 	}
1379 
1380 	for (i = 0; i < icn->icn_nchannels; i++) {
1381 		channels[i] = imc_nvl_create_channel(imc,
1382 		    &icn->icn_channels[i]);
1383 	}
1384 	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
1385 	    channels, icn->icn_nchannels);
1386 	for (i = 0; i < icn->icn_nchannels; i++) {
1387 		nvlist_free(channels[i]);
1388 	}
1389 
1390 	return (nvl);
1391 }
1392 
1393 static void
1394 imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
1395 {
1396 	char *buf = NULL;
1397 	size_t len = 0;
1398 	int kmflag;
1399 
1400 	if (sock->isock_nvl == NULL)
1401 		return;
1402 
1403 	if (sock->isock_buf != NULL)
1404 		return;
1405 
1406 	if (sleep) {
1407 		kmflag = KM_SLEEP;
1408 	} else {
1409 		kmflag = KM_NOSLEEP_LAZY;
1410 	}
1411 
1412 	if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
1413 	    kmflag) != 0) {
1414 		return;
1415 	}
1416 
1417 	sock->isock_buf = buf;
1418 	sock->isock_buflen = len;
1419 	sock->isock_gen++;
1420 }
1421 
1422 static void
1423 imc_decoder_pack(imc_t *imc)
1424 {
1425 	char *buf = NULL;
1426 	size_t len = 0;
1427 
1428 	if (imc->imc_decoder_buf != NULL)
1429 		return;
1430 
1431 	if (imc->imc_decoder_dump == NULL) {
1432 		imc->imc_decoder_dump = imc_dump_decoder(imc);
1433 	}
1434 
1435 	if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
1436 	    KM_NOSLEEP_LAZY) != 0) {
1437 		return;
1438 	}
1439 
1440 	imc->imc_decoder_buf = buf;
1441 	imc->imc_decoder_len = len;
1442 }
1443 
1444 static void
1445 imc_nvl_create(imc_t *imc)
1446 {
1447 	uint_t csock;
1448 	for (csock = 0; csock < imc->imc_nsockets; csock++) {
1449 		uint_t i;
1450 		nvlist_t *nvl;
1451 		nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
1452 		imc_socket_t *sock = &imc->imc_sockets[csock];
1453 
1454 		nvl = fnvlist_alloc();
1455 		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
1456 		    MCINTEL_NVLIST_VERS1);
1457 		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
1458 		    sock->isock_nimc);
1459 
1460 		for (i = 0; i < sock->isock_nimc; i++) {
1461 			mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
1462 		}
1463 
1464 		fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
1465 		    mcs, sock->isock_nimc);
1466 
1467 		for (i = 0; i < sock->isock_nimc; i++) {
1468 			nvlist_free(mcs[i]);
1469 		}
1470 
1471 		sock->isock_nvl = nvl;
1472 		imc_nvl_pack(sock, B_TRUE);
1473 	}
1474 }
1475 
1476 /*
1477  * Determine the top of low and high memory. These determine whether transaction
1478  * addresses target main memory or not. Unfortunately, the way that these are
1479  * stored and fetched changes with different generations.
1480  */
1481 static void
1482 imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
1483 {
1484 	uint32_t tolm, tohm_low, tohm_hi;
1485 
1486 	tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1487 	    imc->imc_gen_data->igd_tolm_offset);
1488 	tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1489 	    imc->imc_gen_data->igd_tohm_low_offset);
1490 	if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
1491 		tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1492 		    imc->imc_gen_data->igd_tohm_hi_offset);
1493 	} else {
1494 		tohm_hi = 0;
1495 	}
1496 
1497 	if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
1498 	    tohm_hi == PCI_EINVAL32) {
1499 		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1500 		return;
1501 	}
1502 
1503 	switch (imc->imc_gen) {
1504 	case IMC_GEN_SANDY:
1505 	case IMC_GEN_IVY:
1506 		sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
1507 		    IMC_TOLM_SNB_IVY_SHIFT;
1508 		sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
1509 		    IMC_TOLM_SNB_IVY_SHIFT;
1510 		break;
1511 	case IMC_GEN_HASWELL:
1512 	case IMC_GEN_BROADWELL:
1513 	case IMC_GEN_SKYLAKE:
1514 		sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
1515 		sad->isad_tohm = ((uint64_t)tohm_low &
1516 		    IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);
1517 
1518 		/*
1519 		 * Adjust the values to turn them into an exclusive range.
1520 		 */
1521 		sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
1522 		sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
1523 		break;
1524 	default:
1525 		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
1526 		    "set to unknown generation: %u", imc->imc_gen);
1527 		return;
1528 	}
1529 }
1530 
1531 static void
1532 imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
1533     uint32_t raw)
1534 {
1535 	uint_t attr;
1536 	uint64_t limit;
1537 	bzero(rule, sizeof (imc_sad_rule_t));
1538 
1539 	rule->isr_raw_dram = raw;
1540 	rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
1541 	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1542 		switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
1543 		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
1544 			rule->isr_imode = IMC_SAD_IMODE_8t6;
1545 			break;
1546 		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
1547 			rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
1548 			break;
1549 		}
1550 	} else {
1551 		switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
1552 		case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
1553 			rule->isr_imode = IMC_SAD_IMODE_8t6;
1554 			break;
1555 		case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
1556 			rule->isr_imode = IMC_SAD_IMODE_10t8;
1557 			break;
1558 		case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
1559 			rule->isr_imode = IMC_SAD_IMODE_14t12;
1560 			break;
1561 		case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
1562 			rule->isr_imode = IMC_SAD_IMODE_32t30;
1563 			break;
1564 		}
1565 	}
1566 
1567 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1568 		attr = IMC_SAD_DRAM_ATTR_SKX(raw);
1569 	} else {
1570 		attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
1571 	}
1572 
1573 	switch (attr) {
1574 	case IMC_SAD_DRAM_ATTR_DRAM:
1575 		rule->isr_type = IMC_SAD_TYPE_DRAM;
1576 		break;
1577 	case IMC_SAD_DRAM_ATTR_MMCFG:
1578 		rule->isr_type = IMC_SAD_TYPE_MMCFG;
1579 		break;
1580 	case IMC_SAD_DRAM_ATTR_NXM:
1581 		if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1582 			sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1583 		}
1584 		rule->isr_type = IMC_SAD_TYPE_NXM;
1585 		break;
1586 	default:
1587 		sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1588 		break;
1589 	}
1590 
1591 	/*
1592 	 * Fetch the limit which represents bits 45:26 and then adjust this so
1593 	 * that it is exclusive.
1594 	 */
1595 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1596 		limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
1597 	} else {
1598 		limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
1599 	}
1600 	rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
1601 	    IMC_SAD_DRAM_LIMIT_EXCLUSIVE;
1602 
1603 	/*
1604 	 * The rest of this does not apply to Sandy Bridge.
1605 	 */
1606 	if (imc->imc_gen == IMC_GEN_SANDY)
1607 		return;
1608 
1609 	if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
1610 		rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
1611 		return;
1612 	}
1613 
1614 	switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
1615 	case IMC_SAD_DRAM_MOD23_MOD3:
1616 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
1617 		break;
1618 	case IMC_SAD_DRAM_MOD23_MOD2_C01:
1619 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
1620 		break;
1621 	case IMC_SAD_DRAM_MOD23_MOD2_C12:
1622 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
1623 		break;
1624 	case IMC_SAD_DRAM_MOD23_MOD2_C02:
1625 		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
1626 		break;
1627 	}
1628 
1629 	rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
1630 	switch (IMC_SAD_DRAM_MOD3_SKX(raw)) {
1631 	case IMC_SAD_DRAM_MOD3_MODE_45t6:
1632 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
1633 		break;
1634 	case IMC_SAD_DRAM_MOD3_MODE_45t8:
1635 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
1636 		break;
1637 	case IMC_SAD_DRAM_MOD3_MODE_45t12:
1638 		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
1639 		break;
1640 	default:
1641 		sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
1642 		break;
1643 	}
1644 }
1645 
1646 static void
1647 imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1648 {
1649 	uint_t i;
1650 	uint32_t mlen, mbase, skipbits, skipafter;
1651 
1652 	rule->isr_raw_interleave = raw;
1653 
1654 	/*
1655 	 * Right now all architectures always have the maximum number of SAD
1656 	 * interleave targets.
1657 	 */
1658 	rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1659 
1660 	/*
1661 	 * Sandy Bridge has a gap in the interleave list due to the fact that it
1662 	 * uses a smaller length.
1663 	 */
1664 	if (imc->imc_gen > IMC_GEN_SANDY) {
1665 		mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1666 		mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1667 		skipbits = skipafter = 0;
1668 	} else {
1669 		mlen = IMC_SAD_ILEAVE_SNB_LEN;
1670 		mbase = IMC_SAD_ILEAVE_SNB_MASK;
1671 		skipbits = 2;
1672 		skipafter = 4;
1673 	}
1674 
1675 	for (i = 0; i < rule->isr_ntargets; i++) {
1676 		uint32_t mask, shift;
1677 
1678 		shift = i * mlen;
1679 		if (i >= skipafter)
1680 			shift += skipbits;
1681 		mask = mbase << shift;
1682 		rule->isr_targets[i] = (raw & mask) >> shift;
1683 	}
1684 }
1685 
1686 static void
1687 imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1688 {
1689 	uint_t i;
1690 	off_t off;
1691 
1692 	sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1693 	for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1694 	    i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1695 		uint32_t dram, interleave;
1696 		imc_sad_rule_t *rule = &sad->isad_rules[i];
1697 
1698 		dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1699 		interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1700 		    off + 4);
1701 
1702 		if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1703 			sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1704 			return;
1705 		}
1706 
1707 		imc_sad_fill_rule(imc, sad, rule, dram);
1708 		imc_sad_fill_rule_interleave(imc, rule, interleave);
1709 	}
1710 }
1711 
1712 static void
1713 imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1714 {
1715 	uint_t i;
1716 	imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1717 
1718 	if (imc->imc_gen < IMC_GEN_SKYLAKE)
1719 		return;
1720 	if (sad->isad_valid != 0)
1721 		return;
1722 
1723 	mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1724 	for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1725 		uint_t chanoff, ringoff;
1726 
1727 		ringoff = i * IMC_MC_ROUTE_RING_BITS;
1728 		chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1729 
1730 		mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1731 		    ringoff) & IMC_MC_ROUTE_RING_MASK;
1732 		mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1733 		    chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1734 	}
1735 }
1736 
1737 /*
1738  * Initialize the SAD. To do this we have to do a few different things:
1739  *
1740  * 1. Determine where the top of low and high memory is.
1741  * 2. Read and decode all of the rules for the SAD
1742  * 3. On systems with a route table, decode the raw routes
1743  *
1744  * At this point in time, we treat TOLM and TOHM as a per-socket construct, even
1745  * though it really should be global, this just makes life a bit simpler.
1746  */
1747 static void
1748 imc_decoder_init_sad(imc_t *imc)
1749 {
1750 	uint_t i;
1751 
1752 	for (i = 0; i < imc->imc_nsockets; i++) {
1753 		imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1754 		imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1755 		imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1756 	}
1757 }
1758 
1759 static void
1760 imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
1761     imc_tad_rule_t *rule, uint32_t val)
1762 {
1763 	uint64_t limit;
1764 
1765 	limit = IMC_TAD_LIMIT(val);
1766 	rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
1767 	    IMC_TAD_LIMIT_EXCLUSIVE;
1768 	rule->itr_raw = val;
1769 
1770 	switch (IMC_TAD_SOCK_WAY(val)) {
1771 	case IMC_TAD_SOCK_WAY_1:
1772 		rule->itr_sock_way = 1;
1773 		break;
1774 	case IMC_TAD_SOCK_WAY_2:
1775 		rule->itr_sock_way = 2;
1776 		break;
1777 	case IMC_TAD_SOCK_WAY_4:
1778 		rule->itr_sock_way = 4;
1779 		break;
1780 	case IMC_TAD_SOCK_WAY_8:
1781 		rule->itr_sock_way = 8;
1782 		break;
1783 	}
1784 
1785 	rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
1786 	rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1787 	rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1788 
1789 	/*
1790 	 * Starting with Skylake the targets that are used are no longer part of
1791 	 * the TAD. Those come from the IMC route table.
1792 	 */
1793 	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1794 		rule->itr_ntargets = 0;
1795 		return;
1796 	}
1797 
1798 	rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
1799 	rule->itr_targets[0] = IMC_TAD_TARG0(val);
1800 	rule->itr_targets[1] = IMC_TAD_TARG1(val);
1801 	rule->itr_targets[2] = IMC_TAD_TARG2(val);
1802 	rule->itr_targets[3] = IMC_TAD_TARG3(val);
1803 
1804 	if (prev == NULL) {
1805 		rule->itr_base = 0;
1806 	} else {
1807 		rule->itr_base = prev->itr_limit + 1;
1808 	}
1809 }
1810 
1811 static void
1812 imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1813     uint32_t val)
1814 {
1815 	uint64_t base;
1816 
1817 	rule->itr_raw_gran = val;
1818 	base = IMC_TAD_BASE_BASE(val);
1819 	rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1820 
1821 	switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
1822 	case IMC_TAD_BASE_CHAN_GRAN_64B:
1823 		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1824 		break;
1825 	case IMC_TAD_BASE_CHAN_GRAN_256B:
1826 		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1827 		break;
1828 	case IMC_TAD_BASE_CHAN_GRAN_4KB:
1829 		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1830 		break;
1831 	default:
1832 		tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
1833 		return;
1834 	}
1835 
1836 	switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1837 	case IMC_TAD_BASE_SOCK_GRAN_64B:
1838 		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1839 		break;
1840 	case IMC_TAD_BASE_SOCK_GRAN_256B:
1841 		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1842 		break;
1843 	case IMC_TAD_BASE_SOCK_GRAN_4KB:
1844 		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1845 		break;
1846 	case IMC_TAD_BASE_SOCK_GRAN_1GB:
1847 		rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1848 		break;
1849 	}
1850 }
1851 
1852 /*
1853  * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's
1854  * suggested that the channel wayness will take this into account and therefore
1855  * should be accurately reflected.
1856  */
1857 static void
1858 imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
1859 {
1860 	uint_t i;
1861 	off_t baseoff;
1862 	imc_tad_rule_t *prev;
1863 
1864 	tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
1865 	for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
1866 	    prev = NULL; i < tad->itad_nrules;
1867 	    i++, baseoff += sizeof (uint32_t)) {
1868 		uint32_t val;
1869 		off_t off;
1870 		imc_tad_rule_t *rule = &tad->itad_rules[i];
1871 
1872 		/*
1873 		 * On Skylake, the TAD rules are split among two registers. The
1874 		 * latter set mimics what exists on pre-Skylake.
1875 		 */
1876 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1877 			off = baseoff + IMC_SKX_WAYNESS_OFFSET;
1878 		} else {
1879 			off = baseoff;
1880 		}
1881 
1882 		val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
1883 		if (val == PCI_EINVAL32) {
1884 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1885 			return;
1886 		}
1887 
1888 		imc_tad_fill_rule(imc, tad, prev, rule, val);
1889 		prev = rule;
1890 		if (imc->imc_gen < IMC_GEN_SKYLAKE)
1891 			continue;
1892 
1893 		val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
1894 		if (val == PCI_EINVAL32) {
1895 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1896 			return;
1897 		}
1898 
1899 		imc_tad_fill_skx(imc, tad, rule, val);
1900 	}
1901 }
1902 
1903 /*
1904  * Check for features which change how decoding works.
1905  */
1906 static void
1907 imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1908 {
1909 	uint32_t val;
1910 
1911 	/*
1912 	 * Determine whether or not lockstep mode or mirroring are enabled.
1913 	 * These change the behavior of how we're supposed to interpret channel
1914 	 * wayness. Lockstep is available in the TAD's features. Mirroring is
1915 	 * available on the IMC's features. This isn't present in Skylake+. On
1916 	 * Skylake Mirorring is a property of the SAD rule and there is no
1917 	 * lockstep.
1918 	 */
1919 	switch (imc->imc_gen) {
1920 	case IMC_GEN_SANDY:
1921 	case IMC_GEN_IVY:
1922 	case IMC_GEN_HASWELL:
1923 	case IMC_GEN_BROADWELL:
1924 		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1925 		    imc->imc_gen_data->igd_tad_sysdef);
1926 		if (val == PCI_EINVAL32) {
1927 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1928 			return;
1929 		}
1930 		if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1931 			tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1932 		}
1933 
1934 		val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1935 		    imc->imc_gen_data->igd_mc_mirror);
1936 		if (val == PCI_EINVAL32) {
1937 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1938 			return;
1939 		}
1940 		if (IMC_MC_MIRROR_SNB_BRD(val)) {
1941 			tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1942 		}
1943 		break;
1944 	default:
1945 		break;
1946 	}
1947 
1948 	/*
1949 	 * Now, go through and look at values that'll change how we do the
1950 	 * channel index and adddress calculation. These are only present
1951 	 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1952 	 * and they don't exist on Skylake+.
1953 	 */
1954 	switch (imc->imc_gen) {
1955 	case IMC_GEN_IVY:
1956 	case IMC_GEN_HASWELL:
1957 	case IMC_GEN_BROADWELL:
1958 		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1959 		    imc->imc_gen_data->igd_tad_sysdef2);
1960 		if (val == PCI_EINVAL32) {
1961 			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1962 			return;
1963 		}
1964 		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1965 			tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
1966 		}
1967 		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
1968 			tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
1969 		}
1970 		break;
1971 	default:
1972 		break;
1973 	}
1974 }
1975 
1976 /*
1977  * Read the IMC channel interleave records
1978  */
1979 static void
1980 imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1981 {
1982 	uint_t i;
1983 	off_t off;
1984 
1985 	chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1986 	for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1987 	    i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1988 		uint32_t val;
1989 		uint64_t offset;
1990 
1991 		val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1992 		    off);
1993 		if (val == PCI_EINVAL32) {
1994 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1995 			return;
1996 		}
1997 
1998 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1999 			offset = IMC_TADCHAN_OFFSET_SKX(val);
2000 		} else {
2001 			offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2002 		}
2003 
2004 		chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2005 		chan->ich_tad_offsets_raw[i] = val;
2006 	}
2007 }
2008 
2009 static void
2010 imc_decoder_init_tad(imc_t *imc)
2011 {
2012 	uint_t i;
2013 
2014 	for (i = 0; i < imc->imc_nsockets; i++) {
2015 		uint_t j;
2016 
2017 		for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2018 			imc_tad_read_features(imc,
2019 			    &imc->imc_sockets[i].isock_tad[j],
2020 			    &imc->imc_sockets[i].isock_imcs[j]);
2021 			imc_tad_read_rules(imc,
2022 			    &imc->imc_sockets[i].isock_tad[j]);
2023 		}
2024 	}
2025 
2026 	for (i = 0; i < imc->imc_nsockets; i++) {
2027 		uint_t j;
2028 		imc_socket_t *sock = &imc->imc_sockets[i];
2029 
2030 		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2031 			uint_t k;
2032 			imc_mc_t *mc = &sock->isock_imcs[j];
2033 
2034 			for (k = 0; k < mc->icn_nchannels; k++) {
2035 				imc_channel_t *chan = &mc->icn_channels[k];
2036 				imc_tad_read_interleave(imc, chan);
2037 			}
2038 		}
2039 	}
2040 }
2041 
2042 static void
2043 imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
2044     imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
2045 {
2046 	uint_t i;
2047 	off_t off, incr;
2048 
2049 	/*
2050 	 * Rank interleave offset registers come in two forms. Either they are
2051 	 * contiguous for a given wayness, meaning that all of the entries for
2052 	 * wayness zero are contiguous, or they are sparse, meaning that there
2053 	 * is a bank for entry zero for all wayness, then entry one for all
2054 	 * wayness, etc.
2055 	 */
2056 	if (contig) {
2057 		off = imc->imc_gen_data->igd_rir_ileave_offset +
2058 		    (rirno * imc->imc_gen_data->igd_rir_nileaves *
2059 		    sizeof (uint32_t));
2060 		incr = sizeof (uint32_t);
2061 	} else {
2062 		off = imc->imc_gen_data->igd_rir_ileave_offset +
2063 		    (rirno * sizeof (uint32_t));
2064 		incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t);
2065 	}
2066 	for (i = 0; i < rank->irle_nentries; i++, off += incr) {
2067 		uint32_t val;
2068 		uint64_t offset;
2069 		imc_rank_ileave_entry_t *ent = &rank->irle_entries[i];
2070 
2071 		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2072 		if (val == PCI_EINVAL32) {
2073 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2074 			return;
2075 		}
2076 
2077 		switch (imc->imc_gen) {
2078 		case IMC_GEN_BROADWELL:
2079 			ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val);
2080 			break;
2081 		default:
2082 			ent->irle_target = IMC_RIR_OFFSET_TARGET(val);
2083 			break;
2084 		}
2085 		if (imc->imc_gen >= IMC_GEN_HASWELL) {
2086 			offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val);
2087 		} else {
2088 			offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val);
2089 		}
2090 		ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT;
2091 	}
2092 }
2093 
2094 static void
2095 imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan)
2096 {
2097 	uint_t i;
2098 	off_t off;
2099 
2100 	chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways;
2101 	for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset;
2102 	    i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) {
2103 		uint32_t val;
2104 		uint64_t lim;
2105 		imc_rank_ileave_t *ent = &chan->ich_rankileaves[i];
2106 
2107 		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2108 		if (val == PCI_EINVAL32) {
2109 			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2110 			return;
2111 		}
2112 
2113 		ent->irle_raw = val;
2114 		ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0;
2115 		ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val);
2116 		ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val);
2117 		if (imc->imc_gen >= IMC_GEN_HASWELL) {
2118 			lim = IMC_RIR_LIMIT_HAS_SKX(val);
2119 		} else {
2120 			lim = IMC_RIR_LIMIT_SNB_IVB(val);
2121 		}
2122 
2123 		ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) +
2124 		    IMC_RIR_LIMIT_EXCLUSIVE;
2125 
2126 		ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves;
2127 		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
2128 			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE);
2129 		} else {
2130 			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE);
2131 		}
2132 	}
2133 }
2134 
2135 static void
2136 imc_decoder_init_rir(imc_t *imc)
2137 {
2138 	uint_t i;
2139 
2140 	for (i = 0; i < imc->imc_nsockets; i++) {
2141 		uint_t j;
2142 		imc_socket_t *sock = &imc->imc_sockets[i];
2143 
2144 		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2145 			uint_t k;
2146 			imc_mc_t *mc = &sock->isock_imcs[j];
2147 
2148 			for (k = 0; k < mc->icn_nchannels; k++) {
2149 				imc_channel_t *chan = &mc->icn_channels[k];
2150 				imc_rir_read_wayness(imc, chan);
2151 			}
2152 		}
2153 	}
2154 }
2155 
2156 static cmi_errno_t
2157 imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo,
2158     uint32_t synd, int syndtype, mc_unum_t *unump)
2159 {
2160 	imc_t *imc = arg;
2161 	uint_t i;
2162 	imc_decode_state_t dec;
2163 
2164 	bzero(&dec, sizeof (dec));
2165 	if (!imc_decode_pa(imc, pa, &dec)) {
2166 		switch (dec.ids_fail) {
2167 		case IMC_DECODE_F_LEGACY_RANGE:
2168 		case IMC_DECODE_F_OUTSIDE_DRAM:
2169 			return (CMIERR_MC_NOTDIMMADDR);
2170 		default:
2171 			return (CMIERR_MC_BADSTATE);
2172 		}
2173 	}
2174 
2175 	unump->unum_board = 0;
2176 	/*
2177 	 * The chip id needs to be in the order that the OS expects it, which
2178 	 * may not be our order.
2179 	 */
2180 	for (i = 0; i < imc->imc_nsockets; i++) {
2181 		if (imc->imc_spointers[i] == dec.ids_socket)
2182 			break;
2183 	}
2184 	if (i == imc->imc_nsockets) {
2185 		return (CMIERR_MC_BADSTATE);
2186 	}
2187 	unump->unum_chip = i;
2188 	unump->unum_mc = dec.ids_tadid;
2189 	unump->unum_chan = dec.ids_channelid;
2190 	unump->unum_cs = dec.ids_dimmid;
2191 	unump->unum_rank = dec.ids_rankid;
2192 	unump->unum_offset = dec.ids_rankaddr;
2193 	for (i = 0; i < MC_UNUM_NDIMM; i++) {
2194 		unump->unum_dimms[i] = MC_INVALNUM;
2195 	}
2196 
2197 	return (CMI_SUCCESS);
2198 }
2199 
2200 static cmi_errno_t
2201 imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
2202 {
2203 	return (CMIERR_UNKNOWN);
2204 }
2205 
2206 static const cmi_mc_ops_t imc_mc_ops = {
2207 	.cmi_mc_patounum = imc_mc_patounum,
2208 	.cmi_mc_unumtopa = imc_mc_unumtopa
2209 };
2210 
2211 /*
2212  * This is where we really finish attaching and become open for business. This
2213  * occurs once we have all of the expected stubs attached. Here's where all of
2214  * the real fun begins.
2215  */
2216 static void
2217 imc_attach_complete(void *arg)
2218 {
2219 	imc_t *imc = arg;
2220 	cmi_errno_t err;
2221 
2222 	imc_set_gen_data(imc);
2223 
2224 	/*
2225 	 * On SKX and newer, we can fail to map PCI buses at this point due to
2226 	 * bad PCIe reads.
2227 	 */
2228 	if (!imc_map_stubs(imc)) {
2229 		goto done;
2230 	}
2231 
2232 	if (!imc_validate_stubs(imc)) {
2233 		imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2234 		goto done;
2235 	}
2236 
2237 	imc_fixup_stubs(imc);
2238 	imc_map_sockets(imc);
2239 
2240 	if (!imc_create_minors(imc)) {
2241 		goto done;
2242 	}
2243 
2244 	imc_fill_data(imc);
2245 	imc_nvl_create(imc);
2246 
2247 	/*
2248 	 * Gather additional information that we need so that we can properly
2249 	 * initialize the memory decoder and encoder.
2250 	 */
2251 	imc_decoder_init_sad(imc);
2252 	imc_decoder_init_tad(imc);
2253 	imc_decoder_init_rir(imc);
2254 
2255 	/*
2256 	 * Register decoder functions. This may fail. If so, try and complain
2257 	 * loudly, but stay active to allow other data to be useful. Register a
2258 	 * global handle.
2259 	 */
2260 	if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2261 		imc->imc_flags |= IMC_F_MCREG_FAILED;
2262 		dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2263 		    "decoding operations: 0x%x", err);
2264 	}
2265 
2266 done:
2267 	mutex_enter(&imc->imc_lock);
2268 	imc->imc_flags &= IMC_F_ATTACH_DISPATCHED;
2269 	imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2270 	mutex_exit(&imc->imc_lock);
2271 }
2272 
2273 static int
2274 imc_stub_comparator(const void *l, const void *r)
2275 {
2276 	const imc_stub_t *sl = l, *sr = r;
2277 	if (sl->istub_bus > sr->istub_bus)
2278 		return (1);
2279 	if (sl->istub_bus < sr->istub_bus)
2280 		return (-1);
2281 	if (sl->istub_dev > sr->istub_dev)
2282 		return (1);
2283 	if (sl->istub_dev < sr->istub_dev)
2284 		return (-1);
2285 	if (sl->istub_func > sr->istub_func)
2286 		return (1);
2287 	if (sl->istub_func < sr->istub_func)
2288 		return (-1);
2289 	return (0);
2290 }
2291 
2292 static int
2293 imc_stub_scan_cb(dev_info_t *dip, void *arg)
2294 {
2295 	int vid, did;
2296 	const imc_stub_table_t *table;
2297 	imc_t *imc = arg;
2298 	int *regs;
2299 	uint_t i, nregs;
2300 
2301 	if (dip == ddi_root_node()) {
2302 		return (DDI_WALK_CONTINUE);
2303 	}
2304 
2305 	/*
2306 	 * Get the dev info name. PCI devices will always be children of PCI
2307 	 * devices today on x86. If we reach something that has a device name
2308 	 * that's not PCI, then we can prune it's children.
2309 	 */
2310 	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2311 		return (DDI_WALK_PRUNECHILD);
2312 	}
2313 
2314 	/*
2315 	 * Get the device and vendor ID and see if this is something the imc
2316 	 * knows about or cares about.
2317 	 */
2318 	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2319 	    "vendor-id", PCI_EINVAL16);
2320 	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2321 	    "device-id", PCI_EINVAL16);
2322 	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2323 		return (DDI_WALK_CONTINUE);
2324 	}
2325 
2326 	if (vid != IMC_PCI_VENDOR_INTC) {
2327 		return (DDI_WALK_PRUNECHILD);
2328 	}
2329 
2330 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2331 	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2332 		return (DDI_WALK_CONTINUE);
2333 	}
2334 
2335 	if (nregs == 0) {
2336 		ddi_prop_free(regs);
2337 		return (DDI_WALK_CONTINUE);
2338 	}
2339 
2340 
2341 	table = NULL;
2342 	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2343 		if (imc_stub_table[i].imcs_devid == did &&
2344 		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2345 		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2346 			table = &imc_stub_table[i];
2347 			break;
2348 		}
2349 	}
2350 	ddi_prop_free(regs);
2351 
2352 	/*
2353 	 * Not a match, not interesting.
2354 	 */
2355 	if (table == NULL) {
2356 		return (DDI_WALK_CONTINUE);
2357 	}
2358 
2359 	mutex_enter(&imc->imc_lock);
2360 	imc->imc_nscanned++;
2361 	mutex_exit(&imc->imc_lock);
2362 
2363 	return (DDI_WALK_CONTINUE);
2364 }
2365 
2366 /*
2367  * From here, go through and see how many of the devices that we know about.
2368  */
2369 static void
2370 imc_stub_scan(void *arg)
2371 {
2372 	imc_t *imc = arg;
2373 	boolean_t dispatch = B_FALSE;
2374 
2375 	/*
2376 	 * Zero out the scan results in case we've been detached and reattached.
2377 	 */
2378 	mutex_enter(&imc->imc_lock);
2379 	imc->imc_nscanned = 0;
2380 	mutex_exit(&imc->imc_lock);
2381 
2382 	ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2383 
2384 	mutex_enter(&imc->imc_lock);
2385 	imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2386 	imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2387 
2388 	/*
2389 	 * If the scan found no nodes, then that means that we're on a hardware
2390 	 * platform that we don't support. Therefore, there's no reason to do
2391 	 * anything here.
2392 	 */
2393 	if (imc->imc_nscanned == 0) {
2394 		imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2395 		mutex_exit(&imc->imc_lock);
2396 		return;
2397 	}
2398 
2399 	if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2400 		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2401 		dispatch = B_TRUE;
2402 	}
2403 
2404 	mutex_exit(&imc->imc_lock);
2405 
2406 	if (dispatch) {
2407 		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2408 		    imc, DDI_SLEEP);
2409 	}
2410 }
2411 
2412 /*
2413  * By default, refuse to allow stubs to detach.
2414  */
2415 int
2416 imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2417 {
2418 	imc_stub_t *stub;
2419 	imc_t *imc = imc_data;
2420 
2421 	mutex_enter(&imc->imc_lock);
2422 
2423 	/*
2424 	 * By default, we do not allow stubs to detach. However, if the driver
2425 	 * has attached to devices on a platform it doesn't recognize or
2426 	 * support or if the override flag has been set, then allow detach to
2427 	 * proceed.
2428 	 */
2429 	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2430 	    imc_allow_detach == 0) {
2431 		mutex_exit(&imc->imc_lock);
2432 		return (DDI_FAILURE);
2433 	}
2434 
2435 	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2436 	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2437 		if (stub->istub_dip == dip) {
2438 			break;
2439 		}
2440 	}
2441 
2442 	/*
2443 	 * A device was attached to us that we somehow don't know about. Allow
2444 	 * this to proceed.
2445 	 */
2446 	if (stub == NULL) {
2447 		mutex_exit(&imc->imc_lock);
2448 		return (DDI_SUCCESS);
2449 	}
2450 
2451 	pci_config_teardown(&stub->istub_cfgspace);
2452 	avl_remove(&imc->imc_stubs, stub);
2453 	kmem_free(stub, sizeof (imc_stub_t));
2454 	mutex_exit(&imc->imc_lock);
2455 
2456 	return (DDI_SUCCESS);
2457 }
2458 
2459 int
2460 imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2461 {
2462 	imc_stub_t *stub, *lookup;
2463 	int did, vid, *regs;
2464 	uint_t i, nregs;
2465 	const imc_stub_table_t *table;
2466 	avl_index_t idx;
2467 	boolean_t dispatch = B_FALSE;
2468 	imc_t *imc = imc_data;
2469 
2470 	if (cmd != DDI_ATTACH) {
2471 		return (DDI_FAILURE);
2472 	}
2473 
2474 	/*
2475 	 * We've been asked to attach a stub. First, determine if this is even a
2476 	 * PCI device that we should care about. Then, append it to our global
2477 	 * list and kick off the configuration task. Note that we do this
2478 	 * configuration task in a taskq so that we don't interfere with the
2479 	 * normal attach / detach path processing.
2480 	 */
2481 	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2482 		return (DDI_FAILURE);
2483 	}
2484 
2485 	/*
2486 	 * Get the device and vendor ID and see if this is something the imc
2487 	 * knows about or cares about.
2488 	 */
2489 	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2490 	    "vendor-id", PCI_EINVAL16);
2491 	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2492 	    "device-id", PCI_EINVAL16);
2493 	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2494 		return (DDI_FAILURE);
2495 	}
2496 
2497 	/*
2498 	 * Only accept INTC parts on the imc driver.
2499 	 */
2500 	if (vid != IMC_PCI_VENDOR_INTC) {
2501 		return (DDI_FAILURE);
2502 	}
2503 
2504 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2505 	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2506 		return (DDI_FAILURE);
2507 	}
2508 
2509 	if (nregs == 0) {
2510 		ddi_prop_free(regs);
2511 		return (DDI_FAILURE);
2512 	}
2513 
2514 	/*
2515 	 * Determine if this matches a known device.
2516 	 */
2517 	table = NULL;
2518 	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2519 		if (imc_stub_table[i].imcs_devid == did &&
2520 		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2521 		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2522 			table = &imc_stub_table[i];
2523 			break;
2524 		}
2525 	}
2526 
2527 	if (i == ARRAY_SIZE(imc_stub_table)) {
2528 		ddi_prop_free(regs);
2529 		return (DDI_FAILURE);
2530 	}
2531 
2532 	/*
2533 	 * We've found something. Make sure the generation matches our current
2534 	 * one. If it does, construct the entry and append it to the list.
2535 	 */
2536 	mutex_enter(&imc->imc_lock);
2537 	if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2538 	    table->imcs_gen) {
2539 		mutex_exit(&imc->imc_lock);
2540 		ddi_prop_free(regs);
2541 		dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
2542 		    "that has different hardware generation (%u) from current "
2543 		    "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen);
2544 		return (DDI_FAILURE);
2545 	} else {
2546 		imc->imc_gen = table->imcs_gen;
2547 	}
2548 	mutex_exit(&imc->imc_lock);
2549 
2550 	stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2551 	stub->istub_dip = dip;
2552 	stub->istub_vid = vid;
2553 	stub->istub_did = did;
2554 	stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2555 	stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2556 	stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2557 	ddi_prop_free(regs);
2558 	stub->istub_table = table;
2559 
2560 	if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
2561 		kmem_free(stub, sizeof (stub));
2562 		dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2563 		    "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2564 		    vid, did);
2565 		return (DDI_FAILURE);
2566 	}
2567 
2568 	mutex_enter(&imc->imc_lock);
2569 	if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2570 		dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2571 		    "bdf %u/%u/%u with %s (%u/%u), not attaching",
2572 		    ddi_node_name(imc->imc_dip), vid, did,
2573 		    stub->istub_bus, stub->istub_dev, stub->istub_func,
2574 		    ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2575 		    lookup->istub_did);
2576 		mutex_exit(&imc->imc_lock);
2577 		pci_config_teardown(&stub->istub_cfgspace);
2578 		kmem_free(stub, sizeof (stub));
2579 
2580 		return (DDI_FAILURE);
2581 	}
2582 	avl_insert(&imc->imc_stubs, stub, idx);
2583 
2584 	if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2585 	    avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2586 		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2587 		dispatch = B_TRUE;
2588 	}
2589 	mutex_exit(&imc->imc_lock);
2590 
2591 	if (dispatch) {
2592 		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2593 		    imc, DDI_SLEEP);
2594 	}
2595 
2596 	return (DDI_SUCCESS);
2597 }
2598 
2599 static int
2600 imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2601 {
2602 	imc_t *imc = imc_data;
2603 
2604 	if ((flag & (FEXCL | FNDELAY)) != 0)
2605 		return (EINVAL);
2606 
2607 	if (otyp != OTYP_CHR)
2608 		return (EINVAL);
2609 
2610 	mutex_enter(&imc->imc_lock);
2611 
2612 	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2613 		mutex_exit(&imc->imc_lock);
2614 		return (ENOTSUP);
2615 	}
2616 
2617 	/*
2618 	 * It's possible that someone has come in during the window between when
2619 	 * we've created the minor node and when we've finished doing work.
2620 	 */
2621 	if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2622 		mutex_exit(&imc->imc_lock);
2623 		return (EAGAIN);
2624 	}
2625 
2626 	/*
2627 	 * It's not clear how someone would get a minor that we didn't create.
2628 	 * But be paranoid and make sure.
2629 	 */
2630 	if (getminor(*devp) >= imc->imc_nsockets) {
2631 		mutex_exit(&imc->imc_lock);
2632 		return (EINVAL);
2633 	}
2634 
2635 	/*
2636 	 * Make sure this socket entry has been filled in.
2637 	 */
2638 	if (imc->imc_spointers[getminor(*devp)] == NULL) {
2639 		mutex_exit(&imc->imc_lock);
2640 		return (EINVAL);
2641 	}
2642 
2643 	mutex_exit(&imc->imc_lock);
2644 
2645 	return (0);
2646 }
2647 
2648 static void
2649 imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2650 {
2651 	imc_decode_state_t dec;
2652 	uint_t i;
2653 
2654 	bzero(&dec, sizeof (dec));
2655 	if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2656 		encode->mcei_err = (uint32_t)dec.ids_fail;
2657 		encode->mcei_errdata = dec.ids_fail_data;
2658 		return;
2659 	}
2660 
2661 	encode->mcei_errdata = 0;
2662 	encode->mcei_err = 0;
2663 	encode->mcei_board = 0;
2664 	for (i = 0; i < imc->imc_nsockets; i++) {
2665 		if (imc->imc_spointers[i] == dec.ids_socket)
2666 			break;
2667 	}
2668 	encode->mcei_chip = i;
2669 	/*
2670 	 * These Intel platforms are all monolithic dies, so set the die to
2671 	 * zero.
2672 	 */
2673 	encode->mcei_die = 0;
2674 	encode->mcei_mc = dec.ids_tadid;
2675 	encode->mcei_chan_addr = dec.ids_chanaddr;
2676 	encode->mcei_chan = dec.ids_channelid;
2677 	encode->mcei_dimm = dec.ids_dimmid;
2678 	encode->mcei_rank_addr = dec.ids_rankaddr;
2679 	encode->mcei_rank = dec.ids_rankid;
2680 	encode->mcei_row = UINT32_MAX;
2681 	encode->mcei_column = UINT32_MAX;
2682 	encode->mcei_cs = encode->mcei_rm = UINT8_MAX;
2683 	encode->mcei_bank = encode->mcei_bank_group = UINT8_MAX;
2684 	encode->mcei_subchan = UINT8_MAX;
2685 }
2686 
2687 static int
2688 imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2689     int *rvalp)
2690 {
2691 	int ret;
2692 	minor_t m;
2693 	mc_snapshot_info_t info;
2694 	mc_encode_ioc_t encode;
2695 	imc_t *imc = imc_data;
2696 	imc_socket_t *sock;
2697 
2698 	mutex_enter(&imc->imc_lock);
2699 	m = getminor(dev);
2700 	if (m >= imc->imc_nsockets) {
2701 		ret = EINVAL;
2702 		goto done;
2703 	}
2704 	sock = imc->imc_spointers[m];
2705 	if (sock == NULL) {
2706 		ret = EINVAL;
2707 		goto done;
2708 	}
2709 
2710 	/*
2711 	 * Note, other memory controller drivers don't check mode for reading
2712 	 * data nor do they care who can read it from a credential perspective.
2713 	 * As such we don't either at this time.
2714 	 */
2715 	switch (cmd) {
2716 	case MC_IOC_SNAPSHOT_INFO:
2717 		imc_nvl_pack(sock, B_FALSE);
2718 		if (sock->isock_buf == NULL) {
2719 			ret = EIO;
2720 			break;
2721 		}
2722 
2723 		info.mcs_size = sock->isock_buflen;
2724 		info.mcs_gen = sock->isock_gen;
2725 
2726 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2727 			ret = EFAULT;
2728 			break;
2729 		}
2730 
2731 		ret = 0;
2732 		break;
2733 	case MC_IOC_SNAPSHOT:
2734 		imc_nvl_pack(sock, B_FALSE);
2735 		if (sock->isock_buf == NULL) {
2736 			ret = EIO;
2737 			break;
2738 		}
2739 
2740 		if (ddi_copyout(sock->isock_buf, (void *)arg,
2741 		    sock->isock_buflen, mode) != 0) {
2742 			ret = EFAULT;
2743 			break;
2744 		}
2745 
2746 		ret = 0;
2747 		break;
2748 	case MC_IOC_DECODE_SNAPSHOT_INFO:
2749 		imc_decoder_pack(imc);
2750 		if (imc->imc_decoder_buf == NULL) {
2751 			ret = EIO;
2752 			break;
2753 		}
2754 
2755 		info.mcs_size = imc->imc_decoder_len;
2756 		info.mcs_gen = imc->imc_spointers[0]->isock_gen;
2757 
2758 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2759 			ret = EFAULT;
2760 			break;
2761 		}
2762 
2763 		ret = 0;
2764 		break;
2765 	case MC_IOC_DECODE_SNAPSHOT:
2766 		imc_decoder_pack(imc);
2767 		if (imc->imc_decoder_buf == NULL) {
2768 			ret = EIO;
2769 			break;
2770 		}
2771 
2772 		if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
2773 		    imc->imc_decoder_len, mode) != 0) {
2774 			ret = EFAULT;
2775 			break;
2776 		}
2777 
2778 		ret = 0;
2779 		break;
2780 	case MC_IOC_DECODE_PA:
2781 		if (crgetzoneid(credp) != GLOBAL_ZONEID ||
2782 		    drv_priv(credp) != 0) {
2783 			ret = EPERM;
2784 			break;
2785 		}
2786 
2787 		if (ddi_copyin((void *)arg, &encode, sizeof (encode),
2788 		    mode & FKIOCTL) != 0) {
2789 			ret = EPERM;
2790 			break;
2791 		}
2792 
2793 		imc_ioctl_decode(imc, &encode);
2794 		ret = 0;
2795 
2796 		if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
2797 		    mode & FKIOCTL) != 0) {
2798 			ret = EPERM;
2799 			break;
2800 		}
2801 		break;
2802 	default:
2803 		ret = EINVAL;
2804 		goto done;
2805 	}
2806 
2807 done:
2808 	mutex_exit(&imc->imc_lock);
2809 	return (ret);
2810 }
2811 
2812 static int
2813 imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
2814 {
2815 	return (0);
2816 }
2817 
2818 static int
2819 imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2820 {
2821 	if (cmd != DDI_ATTACH) {
2822 		return (DDI_FAILURE);
2823 	}
2824 
2825 	if (imc_data == NULL || imc_data->imc_dip != NULL) {
2826 		return (DDI_FAILURE);
2827 	}
2828 
2829 	mutex_enter(&imc_data->imc_lock);
2830 	if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2831 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
2832 		mutex_exit(&imc_data->imc_lock);
2833 		return (DDI_FAILURE);
2834 	}
2835 
2836 	imc_data->imc_dip = dip;
2837 	imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2838 	mutex_exit(&imc_data->imc_lock);
2839 
2840 	(void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2841 	    DDI_SLEEP);
2842 
2843 	return (DDI_SUCCESS);
2844 }
2845 
2846 /*
2847  * We only export a single instance.
2848  */
2849 static int
2850 imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2851 {
2852 	/*
2853 	 * getinfo(9E) shouldn't be called if we're not attached. But be
2854 	 * paranoid.
2855 	 */
2856 	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2857 		return (DDI_FAILURE);
2858 	}
2859 
2860 	switch (infocmd) {
2861 	case DDI_INFO_DEVT2DEVINFO:
2862 		*resultp = imc_data->imc_dip;
2863 		break;
2864 	case DDI_INFO_DEVT2INSTANCE:
2865 		*resultp = (void *)0;
2866 		break;
2867 	default:
2868 		return (DDI_FAILURE);
2869 	}
2870 
2871 	return (DDI_SUCCESS);
2872 }
2873 
2874 static int
2875 imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2876 {
2877 	if (cmd != DDI_DETACH) {
2878 		return (DDI_FAILURE);
2879 	}
2880 
2881 	if (imc_data == NULL || imc_data->imc_dip) {
2882 		return (DDI_FAILURE);
2883 	}
2884 
2885 	mutex_enter(&imc_data->imc_lock);
2886 
2887 	/*
2888 	 * While a scan or attach is outstanding, don't allow us to detach.
2889 	 */
2890 	if ((imc_data->imc_flags &
2891 	    (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2892 		mutex_exit(&imc_data->imc_lock);
2893 		return (DDI_FAILURE);
2894 	}
2895 
2896 	/*
2897 	 * Because the stub driver depends on the imc driver, we shouldn't be
2898 	 * able to have any entries in this list when we detach. However, we
2899 	 * check just to make sure.
2900 	 */
2901 	if (!avl_is_empty(&imc_data->imc_stubs)) {
2902 		mutex_exit(&imc_data->imc_lock);
2903 		return (DDI_FAILURE);
2904 	}
2905 
2906 	nvlist_free(imc_data->imc_decoder_dump);
2907 	imc_data->imc_decoder_dump = NULL;
2908 	if (imc_data->imc_decoder_buf != NULL) {
2909 		kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2910 		imc_data->imc_decoder_buf = NULL;
2911 		imc_data->imc_decoder_len = 0;
2912 	}
2913 
2914 	ddi_remove_minor_node(imc_data->imc_dip, NULL);
2915 	imc_data->imc_dip = NULL;
2916 	mutex_exit(&imc_data->imc_lock);
2917 
2918 	ddi_taskq_wait(imc_data->imc_taskq);
2919 	ddi_taskq_destroy(imc_data->imc_taskq);
2920 	imc_data->imc_taskq = NULL;
2921 
2922 	return (DDI_SUCCESS);
2923 }
2924 
2925 static void
2926 imc_free(void)
2927 {
2928 	if (imc_data == NULL) {
2929 		return;
2930 	}
2931 
2932 	VERIFY(avl_is_empty(&imc_data->imc_stubs));
2933 	avl_destroy(&imc_data->imc_stubs);
2934 	mutex_destroy(&imc_data->imc_lock);
2935 	kmem_free(imc_data, sizeof (imc_t));
2936 	imc_data = NULL;
2937 }
2938 
2939 static void
2940 imc_alloc(void)
2941 {
2942 	imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);
2943 
2944 	mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
2945 	avl_create(&imc_data->imc_stubs, imc_stub_comparator,
2946 	    sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
2947 }
2948 
2949 static struct cb_ops imc_cb_ops = {
2950 	.cb_open = imc_open,
2951 	.cb_close = imc_close,
2952 	.cb_strategy = nodev,
2953 	.cb_print = nodev,
2954 	.cb_dump = nodev,
2955 	.cb_read = nodev,
2956 	.cb_write = nodev,
2957 	.cb_ioctl = imc_ioctl,
2958 	.cb_devmap = nodev,
2959 	.cb_mmap = nodev,
2960 	.cb_segmap = nodev,
2961 	.cb_chpoll = nochpoll,
2962 	.cb_prop_op = ddi_prop_op,
2963 	.cb_flag = D_MP,
2964 	.cb_rev = CB_REV,
2965 	.cb_aread = nodev,
2966 	.cb_awrite = nodev
2967 };
2968 
2969 static struct dev_ops imc_dev_ops = {
2970 	.devo_rev = DEVO_REV,
2971 	.devo_refcnt = 0,
2972 	.devo_getinfo = imc_getinfo,
2973 	.devo_identify = nulldev,
2974 	.devo_probe = nulldev,
2975 	.devo_attach = imc_attach,
2976 	.devo_detach = imc_detach,
2977 	.devo_reset = nodev,
2978 	.devo_cb_ops = &imc_cb_ops,
2979 	.devo_quiesce = ddi_quiesce_not_needed
2980 };
2981 
2982 static struct modldrv imc_modldrv = {
2983 	.drv_modops = &mod_driverops,
2984 	.drv_linkinfo = "Intel Integrated Memory Controller Driver",
2985 	.drv_dev_ops = &imc_dev_ops
2986 };
2987 
2988 static struct modlinkage imc_modlinkage = {
2989 	.ml_rev = MODREV_1,
2990 	.ml_linkage = { &imc_modldrv, NULL }
2991 };
2992 
2993 int
2994 _init(void)
2995 {
2996 	int ret;
2997 
2998 	if ((ret = mod_install(&imc_modlinkage)) == 0) {
2999 		imc_alloc();
3000 	}
3001 
3002 	return (ret);
3003 }
3004 
3005 int
3006 _info(struct modinfo *modinfop)
3007 {
3008 	return (mod_info(&imc_modlinkage, modinfop));
3009 }
3010 
3011 int
3012 _fini(void)
3013 {
3014 	int ret;
3015 
3016 	if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3017 		imc_free();
3018 	}
3019 	return (ret);
3020 }
3021