xref: /illumos-gate/usr/src/uts/intel/io/mc-amd/mcamd_drv.c (revision 327151705b7439cb7ab35c370f682cac7ef9523a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/ddi.h>
28 #include <sys/ddifm.h>
29 #include <sys/sunddi.h>
30 #include <sys/sunndi.h>
31 #include <sys/stat.h>
32 #include <sys/modctl.h>
33 #include <sys/types.h>
34 #include <sys/cpuvar.h>
35 #include <sys/cmn_err.h>
36 #include <sys/kmem.h>
37 #include <sys/cred.h>
38 #include <sys/ksynch.h>
39 #include <sys/rwlock.h>
40 #include <sys/pghw.h>
41 #include <sys/open.h>
42 #include <sys/policy.h>
43 #include <sys/x86_archext.h>
44 #include <sys/cpu_module.h>
45 #include <qsort.h>
46 #include <sys/pci_cfgspace.h>
47 #include <sys/mc.h>
48 #include <sys/mc_amd.h>
49 #include <sys/smbios.h>
50 #include <sys/pci.h>
51 #include <mcamd.h>
52 #include <mcamd_dimmcfg.h>
53 #include <mcamd_pcicfg.h>
54 #include <mcamd_api.h>
55 #include <sys/fm/cpu/AMD.h>
56 #include <sys/fm/smb/fmsmb.h>
57 #include <sys/fm/protocol.h>
58 #include <sys/fm/util.h>
59 
/*
 * Set to prevent mc-amd from attaching.
 */
int mc_no_attach = 0;

/*
 * Of the 754/939/940 packages, only socket 940 supports quadrank registered
 * dimms.  Unfortunately, no memory-controller register indicates the
 * presence of quadrank dimm support or presence (i.e., in terms of number
 * of slots per cpu, and chip-select lines per slot).  The following may be
 * set in /etc/system to indicate the presence of quadrank support on a
 * motherboard.
 *
 * There is no need to set this for F(1207) and S1g1.
 */
int mc_quadranksupport = 0;

/*
 * mc_list is the head of the list of memory controllers, linked through
 * mc_next; list walkers in this file assert that mc_lock is held.
 * NOTE(review): mc_last is presumably the list tail - confirm against the
 * attach code.
 */
mc_t *mc_list, *mc_last;
krwlock_t mc_lock;
int mc_hold_attached = 1;

/* Note that both macros evaluate their arguments more than once. */
#define	MAX(m, n) ((m) >= (n) ? (m) : (n))
#define	MIN(m, n) ((m) <= (n) ? (m) : (n))

/*
 * The following tuneable is used to determine the DRAM scrubbing rate.
 * The values range from 0x00-0x16 as described in the BKDG.  Zero
 * disables DRAM scrubbing.  Values above zero indicate rates in descending
 * order.
 *
 * The default value below is used on several Sun systems.  In the future
 * this code should assign values dynamically based on memory sizing.
 */
uint32_t mc_scrub_rate_dram = 0xd;	/* 64B every 163.8 us; 1GB per 45 min */

/*
 * Policy selecting how the scrub rate is chosen; the default is to take
 * the maximum of the system value and the tunables.
 */
enum {
	MC_SCRUB_BIOSDEFAULT,	/* retain system default value */
	MC_SCRUB_FIXED,		/* assign mc_scrub_rate_* values */
	MC_SCRUB_MAX		/* assign max of system and tunables */
} mc_scrub_policy = MC_SCRUB_MAX;
99 
100 static void
101 mc_snapshot_destroy(mc_t *mc)
102 {
103 	ASSERT(RW_LOCK_HELD(&mc_lock));
104 
105 	if (mc->mc_snapshot == NULL)
106 		return;
107 
108 	kmem_free(mc->mc_snapshot, mc->mc_snapshotsz);
109 	mc->mc_snapshot = NULL;
110 	mc->mc_snapshotsz = 0;
111 	mc->mc_snapshotgen++;
112 }
113 
114 static int
115 mc_snapshot_update(mc_t *mc)
116 {
117 	ASSERT(RW_LOCK_HELD(&mc_lock));
118 
119 	if (mc->mc_snapshot != NULL)
120 		return (0);
121 
122 	if (nvlist_pack(mc->mc_nvl, &mc->mc_snapshot, &mc->mc_snapshotsz,
123 	    NV_ENCODE_XDR, KM_SLEEP) != 0)
124 		return (-1);
125 
126 	return (0);
127 }
128 
129 static mc_t *
130 mc_lookup_by_chipid(int chipid)
131 {
132 	mc_t *mc;
133 
134 	ASSERT(RW_LOCK_HELD(&mc_lock));
135 
136 	for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
137 		if (mc->mc_props.mcp_num  == chipid)
138 			return (mc);
139 	}
140 
141 	return (NULL);
142 }
143 
144 /*
145  * Read config register pairs into the two arrays provided on the given
146  * handle and at offsets as follows:
147  *
148  *	Index	Array r1 offset			Array r2 offset
149  *	0	r1addr				r2addr
150  *	1	r1addr + incr			r2addr + incr
151  *	2	r1addr + 2 * incr		r2addr + 2 * incr
152  *	...
153  *	n - 1	r1addr + (n - 1) * incr		r2addr + (n - 1) * incr
154  *
155  * The number of registers to read into the r1 array is r1n; the number
156  * for the r2 array is r2n.
157  */
158 static void
159 mc_prop_read_pair(mc_pcicfg_hdl_t cfghdl, uint32_t *r1, off_t r1addr,
160     int r1n, uint32_t *r2, off_t r2addr, int r2n, off_t incr)
161 {
162 	int i;
163 
164 	for (i = 0; i < MAX(r1n, r2n); i++, r1addr += incr, r2addr += incr) {
165 		if (i < r1n)
166 			r1[i] = mc_pcicfg_get32(cfghdl, r1addr);
167 		if (i < r2n)
168 			r2[i] = mc_pcicfg_get32(cfghdl, r2addr);
169 	}
170 }
171 
172 /*ARGSUSED*/
173 static int
174 mc_nvl_add_socket_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
175 {
176 	uint32_t skt = *((uint32_t *)arg1);
177 	cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;
178 
179 	if (cmi_hdl_getsockettype(whdl) == skt) {
180 		cmi_hdl_hold(whdl);	/* short-term hold */
181 		*hdlp = whdl;
182 		return (CMI_HDL_WALK_DONE);
183 	} else {
184 		return (CMI_HDL_WALK_NEXT);
185 	}
186 }
187 
188 static void
189 mc_nvl_add_socket(nvlist_t *nvl, mc_t *mc)
190 {
191 	cmi_hdl_t hdl = NULL;
192 	const char *s;
193 
194 	cmi_hdl_walk(mc_nvl_add_socket_cb, (void *)&mc->mc_socket,
195 	    (void *)&hdl, NULL);
196 	if (hdl == NULL)
197 		s = "Unknown";  /* no cpu for this chipid found */
198 	else
199 		s = cmi_hdl_getsocketstr(hdl);
200 
201 	(void) nvlist_add_string(nvl, "socket", s);
202 
203 	if (hdl != NULL)
204 		cmi_hdl_rele(hdl);
205 }
206 
207 static uint32_t
208 mc_ecc_enabled(mc_t *mc)
209 {
210 	uint32_t rev = mc->mc_props.mcp_rev;
211 	union mcreg_nbcfg nbcfg;
212 
213 	MCREG_VAL32(&nbcfg) = mc->mc_cfgregs.mcr_nbcfg;
214 
215 	return (MC_REV_MATCH(rev, MC_F_REVS_BCDE) ?
216 	    MCREG_FIELD_F_preF(&nbcfg, EccEn) :
217 	    MCREG_FIELD_F_revFG(&nbcfg, EccEn));
218 }
219 
220 static uint32_t
221 mc_ck_enabled(mc_t *mc)
222 {
223 	uint32_t rev = mc->mc_props.mcp_rev;
224 	union mcreg_nbcfg nbcfg;
225 
226 	MCREG_VAL32(&nbcfg) = mc->mc_cfgregs.mcr_nbcfg;
227 
228 	return (MC_REV_MATCH(rev, MC_F_REVS_BCDE) ?
229 	    MCREG_FIELD_F_preF(&nbcfg, ChipKillEccEn) :
230 	    MCREG_FIELD_F_revFG(&nbcfg, ChipKillEccEn));
231 }
232 
233 static void
234 mc_nvl_add_ecctype(nvlist_t *nvl, mc_t *mc)
235 {
236 	(void) nvlist_add_string(nvl, "ecc-type", mc_ecc_enabled(mc) ?
237 	    (mc_ck_enabled(mc) ? "ChipKill 128/16" : "Normal 64/8") : "None");
238 }
239 
240 static void
241 mc_nvl_add_prop(nvlist_t *nvl, void *node, mcamd_propcode_t code, int reqval)
242 {
243 	int valfound;
244 	uint64_t value;
245 	const char *name = mcamd_get_propname(code);
246 
247 	valfound = mcamd_get_numprop(NULL, (mcamd_node_t *)node, code, &value);
248 
249 	ASSERT(name != NULL && valfound);
250 	if (name != NULL && valfound && (!reqval || value != MC_INVALNUM))
251 		(void) nvlist_add_uint64(nvl, name, value);
252 }
253 
254 static void
255 mc_nvl_add_cslist(nvlist_t *mcnvl, mc_t *mc)
256 {
257 	mc_cs_t *mccs = mc->mc_cslist;
258 	nvlist_t *cslist[MC_CHIP_NCS];
259 	int nelem, i;
260 
261 	for (nelem = 0; mccs != NULL; mccs = mccs->mccs_next, nelem++) {
262 		nvlist_t **csp = &cslist[nelem];
263 		char csname[MCDCFG_CSNAMELEN];
264 
265 		(void) nvlist_alloc(csp, NV_UNIQUE_NAME, KM_SLEEP);
266 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_NUM, 0);
267 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_BASE_ADDR, 0);
268 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_MASK, 0);
269 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_SIZE, 0);
270 
271 		/*
272 		 * It is possible for an mc_cs_t not to have associated
273 		 * DIMM info if mcdcfg_lookup failed.
274 		 */
275 		if (mccs->mccs_csl[0] != NULL) {
276 			mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_CSDIMM1, 1);
277 			mcdcfg_csname(mc->mc_socket, mccs->mccs_csl[0], csname,
278 			    sizeof (csname));
279 			(void) nvlist_add_string(*csp, "dimm1-csname", csname);
280 		}
281 
282 		if (mccs->mccs_csl[1] != NULL) {
283 			mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_CSDIMM2, 1);
284 			mcdcfg_csname(mc->mc_socket, mccs->mccs_csl[1], csname,
285 			    sizeof (csname));
286 			(void) nvlist_add_string(*csp, "dimm2-csname", csname);
287 		}
288 	}
289 
290 	/* Add cslist nvlist array even if zero members */
291 	(void) nvlist_add_nvlist_array(mcnvl, "cslist", cslist, nelem);
292 	for (i = 0; i < nelem; i++)
293 		nvlist_free(cslist[i]);
294 }
295 
296 static void
297 mc_nvl_add_dimmlist(nvlist_t *mcnvl, mc_t *mc)
298 {
299 	nvlist_t *dimmlist[MC_CHIP_NDIMM];
300 	mc_dimm_t *mcd;
301 	int nelem, i;
302 
303 	for (nelem = 0, mcd = mc->mc_dimmlist; mcd != NULL;
304 	    mcd = mcd->mcd_next, nelem++) {
305 		nvlist_t **dimmp = &dimmlist[nelem];
306 		uint64_t csnums[MC_CHIP_DIMMRANKMAX];
307 		char csname[4][MCDCFG_CSNAMELEN];
308 		char *csnamep[4];
309 		int ncs = 0;
310 
311 		(void) nvlist_alloc(dimmp, NV_UNIQUE_NAME, KM_SLEEP);
312 
313 		mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_NUM, 1);
314 		mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_SIZE, 1);
315 
316 		for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
317 			if (mcd->mcd_cs[i] != NULL) {
318 				csnums[ncs] =
319 				    mcd->mcd_cs[i]->mccs_props.csp_num;
320 				mcdcfg_csname(mc->mc_socket, mcd->mcd_csl[i],
321 				    csname[ncs], MCDCFG_CSNAMELEN);
322 				csnamep[ncs] = csname[ncs];
323 				ncs++;
324 			}
325 		}
326 
327 		(void) nvlist_add_uint64_array(*dimmp, "csnums", csnums, ncs);
328 		(void) nvlist_add_string_array(*dimmp, "csnames", csnamep, ncs);
329 	}
330 
331 	/* Add dimmlist nvlist array even if zero members */
332 	(void) nvlist_add_nvlist_array(mcnvl, "dimmlist", dimmlist, nelem);
333 	for (i = 0; i < nelem; i++)
334 		nvlist_free(dimmlist[i]);
335 }
336 
337 static void
338 mc_nvl_add_htconfig(nvlist_t *mcnvl, mc_t *mc)
339 {
340 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
341 	union mcreg_htroute *htrp = (union mcreg_htroute *)&mcr->mcr_htroute[0];
342 	union mcreg_nodeid *nip = (union mcreg_nodeid *)&mcr->mcr_htnodeid;
343 	union mcreg_unitid *uip = (union mcreg_unitid *)&mcr->mcr_htunitid;
344 	int ndcnt = HT_COHERENTNODES(nip);
345 	uint32_t BCRte[MC_CHIP_MAXNODES];
346 	uint32_t RPRte[MC_CHIP_MAXNODES];
347 	uint32_t RQRte[MC_CHIP_MAXNODES];
348 	nvlist_t *nvl;
349 	int i;
350 
351 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
352 
353 	(void) nvlist_add_uint32(nvl, "NodeId", MCREG_FIELD_CMN(nip, NodeId));
354 	(void) nvlist_add_uint32(nvl, "CoherentNodes", HT_COHERENTNODES(nip));
355 	(void) nvlist_add_uint32(nvl, "SbNode", MCREG_FIELD_CMN(nip, SbNode));
356 	(void) nvlist_add_uint32(nvl, "LkNode", MCREG_FIELD_CMN(nip, LkNode));
357 	(void) nvlist_add_uint32(nvl, "SystemCoreCount",
358 	    HT_SYSTEMCORECOUNT(nip));
359 
360 	(void) nvlist_add_uint32(nvl, "C0Unit", MCREG_FIELD_CMN(uip, C0Unit));
361 	(void) nvlist_add_uint32(nvl, "C1Unit", MCREG_FIELD_CMN(uip, C1Unit));
362 	(void) nvlist_add_uint32(nvl, "McUnit", MCREG_FIELD_CMN(uip, McUnit));
363 	(void) nvlist_add_uint32(nvl, "HbUnit", MCREG_FIELD_CMN(uip, HbUnit));
364 	(void) nvlist_add_uint32(nvl, "SbLink", MCREG_FIELD_CMN(uip, SbLink));
365 
366 	if (ndcnt <= MC_CHIP_MAXNODES) {
367 		for (i = 0; i < ndcnt; i++, htrp++) {
368 			BCRte[i] = MCREG_FIELD_CMN(htrp, BCRte);
369 			RPRte[i] = MCREG_FIELD_CMN(htrp, RPRte);
370 			RQRte[i] = MCREG_FIELD_CMN(htrp, RQRte);
371 		}
372 
373 		(void) nvlist_add_uint32_array(nvl, "BroadcastRoutes",
374 		    &BCRte[0], ndcnt);
375 		(void) nvlist_add_uint32_array(nvl, "ResponseRoutes",
376 		    &RPRte[0], ndcnt);
377 		(void) nvlist_add_uint32_array(nvl, "RequestRoutes",
378 		    &RQRte[0], ndcnt);
379 	}
380 
381 	(void) nvlist_add_nvlist(mcnvl, "htconfig", nvl);
382 	nvlist_free(nvl);
383 }
384 
385 static nvlist_t *
386 mc_nvl_create(mc_t *mc)
387 {
388 	nvlist_t *mcnvl;
389 
390 	(void) nvlist_alloc(&mcnvl, NV_UNIQUE_NAME, KM_SLEEP);
391 
392 	/*
393 	 * Since this nvlist is used in populating the topo tree changes
394 	 * made here may propogate through to changed property names etc
395 	 * in the topo tree.  Some properties in the topo tree will be
396 	 * contracted via ARC, so be careful what you change here.
397 	 */
398 	(void) nvlist_add_uint8(mcnvl, MC_NVLIST_VERSTR, MC_NVLIST_VERS1);
399 
400 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_NUM, 0);
401 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_REV, 0);
402 	(void) nvlist_add_string(mcnvl, "revname", mc->mc_revname);
403 	mc_nvl_add_socket(mcnvl, mc);
404 	mc_nvl_add_ecctype(mcnvl, mc);
405 
406 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BASE_ADDR, 0);
407 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_LIM_ADDR, 0);
408 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ILEN, 0);
409 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ILSEL, 0);
410 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_CSINTLVFCTR, 0);
411 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_DRAMHOLE_SIZE, 0);
412 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ACCESS_WIDTH, 0);
413 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_CSBANKMAPREG, 0);
414 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BANKSWZL, 0);
415 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_MOD64MUX, 0);
416 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_SPARECS, 1);
417 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BADCS, 1);
418 
419 	mc_nvl_add_cslist(mcnvl, mc);
420 	mc_nvl_add_dimmlist(mcnvl, mc);
421 	mc_nvl_add_htconfig(mcnvl, mc);
422 
423 	return (mcnvl);
424 }
425 
426 /*
427  * Link a dimm to its associated chip-selects and chip-select lines.
428  * Total the size of all ranks of this dimm.
429  */
430 static void
431 mc_dimm_csadd(mc_t *mc, mc_dimm_t *mcd, mc_cs_t *mccs, const mcdcfg_csl_t *csl)
432 {
433 	int factor = (mc->mc_props.mcp_accwidth == 128) ? 2 : 1;
434 	uint64_t sz = 0;
435 	int i;
436 
437 	/* Skip to first unused rank slot */
438 	for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
439 		if (mcd->mcd_cs[i] == NULL) {
440 			mcd->mcd_cs[i] = mccs;
441 			mcd->mcd_csl[i] = csl;
442 			sz += mccs->mccs_props.csp_size / factor;
443 			break;
444 		} else {
445 			sz += mcd->mcd_cs[i]->mccs_props.csp_size / factor;
446 		}
447 	}
448 
449 	ASSERT(i != MC_CHIP_DIMMRANKMAX);
450 
451 	mcd->mcd_size = sz;
452 }
453 
454 /*
455  * Create a dimm structure and call to link it to its associated chip-selects.
456  */
457 static mc_dimm_t *
458 mc_dimm_create(mc_t *mc, uint_t num)
459 {
460 	mc_dimm_t *mcd = kmem_zalloc(sizeof (mc_dimm_t), KM_SLEEP);
461 
462 	mcd->mcd_hdr.mch_type = MC_NT_DIMM;
463 	mcd->mcd_mc = mc;
464 	mcd->mcd_num = num;
465 
466 	return (mcd);
467 }
468 
469 /*
470  * The chip-select structure includes an array of dimms associated with
471  * that chip-select.  This function fills that array, and also builds
472  * the list of all dimms on this memory controller mc_dimmlist.  The
473  * caller has filled a structure with all there is to know about the
474  * associated dimm(s).
475  */
476 static void
477 mc_csdimms_create(mc_t *mc, mc_cs_t *mccs, mcdcfg_rslt_t *rsltp)
478 {
479 	mc_dimm_t *found[MC_CHIP_DIMMPERCS];
480 	mc_dimm_t *mcd;
481 	int nfound = 0;
482 	int i;
483 
484 	/*
485 	 * Has some other chip-select already created this dimm or dimms?
486 	 * If so then link to the dimm(s) from the mccs_dimm array,
487 	 * record their topo numbers in the csp_dimmnums array, and link
488 	 * the dimm(s) to the additional chip-select.
489 	 */
490 	for (mcd = mc->mc_dimmlist; mcd != NULL; mcd = mcd->mcd_next) {
491 		for (i = 0; i < rsltp->ndimm; i++) {
492 			if (mcd->mcd_num == rsltp->dimm[i].toponum)
493 				found[nfound++] = mcd;
494 		}
495 	}
496 	ASSERT(nfound == 0 || nfound == rsltp->ndimm);
497 
498 	for (i = 0; i < rsltp->ndimm; i++) {
499 		if (nfound == 0) {
500 			mcd = mc_dimm_create(mc, rsltp->dimm[i].toponum);
501 			if (mc->mc_dimmlist == NULL)
502 				mc->mc_dimmlist = mcd;
503 			else
504 				mc->mc_dimmlast->mcd_next = mcd;
505 			mc->mc_dimmlast = mcd;
506 		} else {
507 			mcd = found[i];
508 		}
509 
510 		mccs->mccs_dimm[i] = mcd;
511 		mccs->mccs_csl[i] = rsltp->dimm[i].cslp;
512 		mccs->mccs_props.csp_dimmnums[i] = mcd->mcd_num;
513 		mc_dimm_csadd(mc, mcd, mccs, rsltp->dimm[i].cslp);
514 
515 	}
516 
517 	/* The rank number is constant across all constituent dimm(s) */
518 	mccs->mccs_props.csp_dimmrank = rsltp->dimm[0].cslp->csl_rank;
519 }
520 
521 /*
522  * mc_dimmlist_create is called after we have discovered all enabled
523  * (and spare or testfailed on revs F and G) chip-selects on the
524  * given memory controller.  For each chip-select we must derive
525  * the associated dimms, remembering that a chip-select csbase/csmask
526  * pair may be associated with up to 2 chip-select lines (in 128 bit mode)
527  * and that any one dimm may be associated with 1, 2, or 4 chip-selects
528  * depending on whether it is single, dual or quadrank.
529  */
530 static void
531 mc_dimmlist_create(mc_t *mc)
532 {
533 	union mcreg_dramcfg_hi *drcfghip =
534 	    (union mcreg_dramcfg_hi *)(&mc->mc_cfgregs.mcr_dramcfghi);
535 	mc_props_t *mcp = &mc->mc_props;
536 	uint32_t rev = mcp->mcp_rev;
537 	mc_cs_t *mccs;
538 	int r4 = 0, s4 = 0;
539 
540 	/*
541 	 * Are we dealing with quadrank registered dimms?
542 	 *
543 	 * For socket 940 we can't tell and we'll assume we're not.
544 	 * This can be over-ridden by the admin in /etc/system by setting
545 	 * mc_quadranksupport nonzero.  A possible optimisation in systems
546 	 * that export an SMBIOS table would be to count the number of
547 	 * dimm slots per cpu - more than 4 would indicate no quadrank support
548 	 * and 4 or fewer would indicate that if we see any of the upper
549 	 * chip-selects enabled then a quadrank dimm is present.
550 	 *
551 	 * For socket F(1207) we can check a bit in the dram config high reg.
552 	 *
553 	 * Other socket types do not support registered dimms.
554 	 */
555 	if (mc->mc_socket == X86_SOCKET_940)
556 		r4 = mc_quadranksupport != 0;
557 	else if (mc->mc_socket == X86_SOCKET_F1207)
558 		r4 = MCREG_FIELD_F_revFG(drcfghip, FourRankRDimm);
559 
560 	/*
561 	 * Are we dealing with quadrank SO-DIMMs?  These are supported
562 	 * in AM2 and S1g1 packages only, but in all rev F/G cases we
563 	 * can detect their presence via a bit in the dram config high reg.
564 	 */
565 	if (MC_REV_MATCH(rev, MC_F_REVS_FG))
566 		s4 = MCREG_FIELD_F_revFG(drcfghip, FourRankSODimm);
567 
568 	for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
569 		mcdcfg_rslt_t rslt;
570 
571 		/*
572 		 * If lookup fails we will not create dimm structures for
573 		 * this chip-select.  In the mc_cs_t we will have both
574 		 * csp_dimmnum members set to MC_INVALNUM and patounum
575 		 * code will see from those that we do not have dimm info
576 		 * for this chip-select.
577 		 */
578 		if (mcdcfg_lookup(rev, mcp->mcp_mod64mux, mcp->mcp_accwidth,
579 		    mccs->mccs_props.csp_num, mc->mc_socket,
580 		    r4, s4, &rslt) < 0)
581 			continue;
582 
583 		mc_csdimms_create(mc, mccs, &rslt);
584 	}
585 }
586 
587 static mc_cs_t *
588 mc_cs_create(mc_t *mc, uint_t num, uint64_t base, uint64_t mask, size_t sz,
589     int csbe, int spare, int testfail)
590 {
591 	mc_cs_t *mccs = kmem_zalloc(sizeof (mc_cs_t), KM_SLEEP);
592 	mccs_props_t *csp = &mccs->mccs_props;
593 	int i;
594 
595 	mccs->mccs_hdr.mch_type = MC_NT_CS;
596 	mccs->mccs_mc = mc;
597 	csp->csp_num = num;
598 	csp->csp_base = base;
599 	csp->csp_mask = mask;
600 	csp->csp_size = sz;
601 	csp->csp_csbe = csbe;
602 	csp->csp_spare = spare;
603 	csp->csp_testfail = testfail;
604 
605 	for (i = 0; i < MC_CHIP_DIMMPERCS; i++)
606 		csp->csp_dimmnums[i] = MC_INVALNUM;
607 
608 	if (spare)
609 		mc->mc_props.mcp_sparecs = num;
610 
611 	return (mccs);
612 }
613 
614 /*
615  * For any cs# of this mc marked TestFail generate an ereport with
616  * resource identifying the associated dimm(s).
617  */
618 static void
619 mc_report_testfails(mc_t *mc)
620 {
621 	mc_unum_t unum;
622 	mc_cs_t *mccs;
623 	int i;
624 
625 	for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
626 		if (mccs->mccs_props.csp_testfail) {
627 			unum.unum_board = 0;
628 			unum.unum_chip = mc->mc_props.mcp_num;
629 			unum.unum_mc = 0;
630 			unum.unum_chan = MC_INVALNUM;
631 			unum.unum_cs = mccs->mccs_props.csp_num;
632 			unum.unum_rank = mccs->mccs_props.csp_dimmrank;
633 			unum.unum_offset = MCAMD_RC_INVALID_OFFSET;
634 			for (i = 0; i < MC_CHIP_DIMMPERCS; i++)
635 				unum.unum_dimms[i] = MC_INVALNUM;
636 
637 			mcamd_ereport_post(mc, FM_EREPORT_CPU_AMD_MC_TESTFAIL,
638 			    &unum,
639 			    FM_EREPORT_PAYLOAD_FLAGS_CPU_AMD_MC_TESTFAIL);
640 		}
641 	}
642 }
643 
644 /*
645  * Function 0 - HyperTransport Technology Configuration
646  */
647 static void
648 mc_mkprops_htcfg(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
649 {
650 	union mcreg_nodeid nodeid;
651 	off_t offset;
652 	int i;
653 
654 	mc->mc_cfgregs.mcr_htnodeid = MCREG_VAL32(&nodeid) =
655 	    mc_pcicfg_get32(cfghdl, MC_HT_REG_NODEID);
656 
657 	mc->mc_cfgregs.mcr_htunitid = mc_pcicfg_get32(cfghdl, MC_HT_REG_UNITID);
658 
659 	for (i = 0, offset = MC_HT_REG_RTBL_NODE_0;
660 	    i < HT_COHERENTNODES(&nodeid);
661 	    i++, offset += MC_HT_REG_RTBL_INCR)
662 		mc->mc_cfgregs.mcr_htroute[i] = mc_pcicfg_get32(cfghdl, offset);
663 }
664 
665 /*
666  * Function 1 Configuration - Address Map (see BKDG 3.4.4 DRAM Address Map)
667  *
668  * Read the Function 1 Address Map for each potential DRAM node.  The Base
669  * Address for a node gives the starting system address mapped at that node,
670  * and the limit gives the last valid address mapped at that node.  Regions for
671  * different nodes should not overlap, unless node-interleaving is enabled.
672  * The base register also indicates the node-interleaving settings (IntlvEn).
673  * The limit register includes IntlvSel which determines which 4K blocks will
674  * be routed to this node and the destination node ID for addresses that fall
675  * within the [base, limit] range - this must match the pair number.
676  */
677 static void
678 mc_mkprops_addrmap(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
679 {
680 	union mcreg_drambase basereg;
681 	union mcreg_dramlimit limreg;
682 	mc_props_t *mcp = &mc->mc_props;
683 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
684 	union mcreg_dramhole hole;
685 	int nodeid = mc->mc_props.mcp_num;
686 
687 	mcr->mcr_drambase = MCREG_VAL32(&basereg) = mc_pcicfg_get32(cfghdl,
688 	    MC_AM_REG_DRAMBASE_0 + nodeid * MC_AM_REG_DRAM_INCR);
689 
690 	mcr->mcr_dramlimit = MCREG_VAL32(&limreg) = mc_pcicfg_get32(cfghdl,
691 	    MC_AM_REG_DRAMLIM_0 + nodeid * MC_AM_REG_DRAM_INCR);
692 
693 	/*
694 	 * Derive some "cooked" properties for nodes that have a range of
695 	 * physical addresses that are read or write enabled and for which
696 	 * the DstNode matches the node we are attaching.
697 	 */
698 	if (MCREG_FIELD_CMN(&limreg, DRAMLimiti) != 0 &&
699 	    MCREG_FIELD_CMN(&limreg, DstNode) == nodeid &&
700 	    (MCREG_FIELD_CMN(&basereg, WE) || MCREG_FIELD_CMN(&basereg, RE))) {
701 		mcp->mcp_base = MC_DRAMBASE(&basereg);
702 		mcp->mcp_lim = MC_DRAMLIM(&limreg);
703 		mcp->mcp_ilen = MCREG_FIELD_CMN(&basereg, IntlvEn);
704 		mcp->mcp_ilsel = MCREG_FIELD_CMN(&limreg, IntlvSel);
705 	}
706 
707 	/*
708 	 * The Function 1 DRAM Hole Address Register tells us which node(s)
709 	 * own the DRAM space that is hoisted above 4GB, together with the
710 	 * hole base and offset for this node.  This was introduced in
711 	 * revision E.
712 	 */
713 	if (MC_REV_ATLEAST(mc->mc_props.mcp_rev, MC_F_REV_E)) {
714 		mcr->mcr_dramhole = MCREG_VAL32(&hole) =
715 		    mc_pcicfg_get32(cfghdl, MC_AM_REG_HOLEADDR);
716 
717 		if (MCREG_FIELD_CMN(&hole, DramHoleValid))
718 			mcp->mcp_dramhole_size = MC_DRAMHOLE_SIZE(&hole);
719 	}
720 }
721 
722 /*
723  * Read some function 3 parameters via PCI Mechanism 1 accesses (which
724  * will serialize any NB accesses).
725  */
726 static void
727 mc_getmiscctl(mc_t *mc)
728 {
729 	uint32_t rev = mc->mc_props.mcp_rev;
730 	union mcreg_nbcfg nbcfg;
731 	union mcreg_sparectl sparectl;
732 
733 	mc->mc_cfgregs.mcr_nbcfg = MCREG_VAL32(&nbcfg) =
734 	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG);
735 
736 	if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
737 		mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl) =
738 		    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
739 		    MC_CTL_REG_SPARECTL);
740 
741 		if (MCREG_FIELD_F_revFG(&sparectl, SwapDone)) {
742 			mc->mc_props.mcp_badcs =
743 			    MCREG_FIELD_F_revFG(&sparectl, BadDramCs);
744 		}
745 	}
746 }
747 
748 static int
749 csbasecmp(mc_cs_t **csapp, mc_cs_t **csbpp)
750 {
751 	uint64_t basea = (*csapp)->mccs_props.csp_base;
752 	uint64_t baseb = (*csbpp)->mccs_props.csp_base;
753 
754 	if (basea == baseb)
755 		return (0);
756 	else if (basea < baseb)
757 		return (-1);
758 	else
759 		return (1);
760 }
761 
/*
 * The following are for use in simulating TestFail for a chip-select
 * without poking at the hardware (which tends to get upset if you do
 * since the BIOS needs to restart to map a failed cs out).  For internal
 * testing only!  Note that setting these does not give the full experience -
 * the selected chip-select *is* enabled and can give errors etc and the
 * patounum logic will get confused.
 *
 * Both must be set to something other than -1 for the hook to take
 * effect; the chip-select with number testfail_csnum on the memory
 * controller with number testfail_mcnum is then treated as having
 * failed test.
 */
int testfail_mcnum = -1;
int testfail_csnum = -1;
772 
773 /*
774  * Function 2 configuration - DRAM Controller
775  */
776 static void
777 mc_mkprops_dramctl(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
778 {
779 	union mcreg_csbase base[MC_CHIP_NCS];
780 	union mcreg_csmask mask[MC_CHIP_NCS];
781 	union mcreg_dramcfg_lo drcfg_lo;
782 	union mcreg_dramcfg_hi drcfg_hi;
783 	union mcreg_drammisc drmisc;
784 	union mcreg_bankaddrmap baddrmap;
785 	mc_props_t *mcp = &mc->mc_props;
786 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
787 	int maskdivisor;
788 	int wide = 0;
789 	uint32_t rev = mc->mc_props.mcp_rev;
790 	int i;
791 	mcamd_hdl_t hdl;
792 
793 	mcamd_mkhdl(&hdl);	/* to call into common code */
794 
795 	/*
796 	 * Read Function 2 DRAM Configuration High and Low registers.  The High
797 	 * part is mostly concerned with memory clocks etc and we'll not have
798 	 * any use for that.  The Low component tells us if ECC is enabled,
799 	 * if we're in 64- or 128-bit MC mode, how the upper chip-selects
800 	 * are mapped, which chip-select pairs are using x4 parts, etc.
801 	 */
802 	MCREG_VAL32(&drcfg_lo) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGLO);
803 	MCREG_VAL32(&drcfg_hi) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGHI);
804 	mcr->mcr_dramcfglo = MCREG_VAL32(&drcfg_lo);
805 	mcr->mcr_dramcfghi = MCREG_VAL32(&drcfg_hi);
806 
807 	/*
808 	 * Note the DRAM controller width.  The 64/128 bit is in a different
809 	 * bit position for revision F and G.
810 	 */
811 	if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
812 		wide = MCREG_FIELD_F_revFG(&drcfg_lo, Width128);
813 	} else {
814 		wide = MCREG_FIELD_F_preF(&drcfg_lo, Width128);
815 	}
816 	mcp->mcp_accwidth = wide ? 128 : 64;
817 
818 	/*
819 	 * Read Function 2 DRAM Controller Miscellaenous Regsiter for those
820 	 * revs that support it.  This include the Mod64Mux indication on
821 	 * these revs - for rev E it is in DRAM config low.
822 	 */
823 	if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
824 		mcr->mcr_drammisc = MCREG_VAL32(&drmisc) =
825 		    mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMMISC);
826 		mcp->mcp_mod64mux = MCREG_FIELD_F_revFG(&drmisc, Mod64Mux);
827 	} else if (MC_REV_MATCH(rev, MC_F_REV_E)) {
828 		mcp->mcp_mod64mux = MCREG_FIELD_F_preF(&drcfg_lo, Mod64BitMux);
829 	}
830 
831 	/*
832 	 * Read Function 2 DRAM Bank Address Mapping.  This encodes the
833 	 * type of DIMM module in use for each chip-select pair.
834 	 * Prior ro revision F it also tells us whether BankSwizzle mode
835 	 * is enabled - in rev F that has moved to dram config hi register.
836 	 */
837 	mcp->mcp_csbankmapreg = MCREG_VAL32(&baddrmap) =
838 	    mc_pcicfg_get32(cfghdl, MC_DC_REG_BANKADDRMAP);
839 
840 	/*
841 	 * Determine whether bank swizzle mode is active.  Bank swizzling was
842 	 * introduced as an option in rev E,  but the bit that indicates it
843 	 * is enabled has moved in revs F/G.
844 	 */
845 	if (MC_REV_MATCH(rev, MC_F_REV_E)) {
846 		mcp->mcp_bnkswzl =
847 		    MCREG_FIELD_F_preF(&baddrmap, BankSwizzleMode);
848 	} else if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
849 		mcp->mcp_bnkswzl = MCREG_FIELD_F_revFG(&drcfg_hi,
850 		    BankSwizzleMode);
851 	}
852 
853 	/*
854 	 * Read the DRAM CS Base and DRAM CS Mask registers.  Revisions prior
855 	 * to F have an equal number of base and mask registers; revision F
856 	 * has twice as many base registers as masks.
857 	 */
858 	maskdivisor = MC_REV_MATCH(rev, MC_F_REVS_FG) ? 2 : 1;
859 
860 	mc_prop_read_pair(cfghdl,
861 	    (uint32_t *)base, MC_DC_REG_CSBASE_0, MC_CHIP_NCS,
862 	    (uint32_t *)mask, MC_DC_REG_CSMASK_0, MC_CHIP_NCS / maskdivisor,
863 	    MC_DC_REG_CS_INCR);
864 
865 	/*
866 	 * Create a cs node for each enabled chip-select as well as
867 	 * any appointed online spare chip-selects and for any that have
868 	 * failed test.
869 	 */
870 	for (i = 0; i < MC_CHIP_NCS; i++) {
871 		mc_cs_t *mccs;
872 		uint64_t csbase, csmask;
873 		size_t sz;
874 		int csbe, spare, testfail;
875 
876 		if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
877 			csbe = MCREG_FIELD_F_revFG(&base[i], CSEnable);
878 			spare = MCREG_FIELD_F_revFG(&base[i], Spare);
879 			testfail = MCREG_FIELD_F_revFG(&base[i], TestFail);
880 		} else {
881 			csbe = MCREG_FIELD_F_preF(&base[i], CSEnable);
882 			spare = 0;
883 			testfail = 0;
884 		}
885 
886 		/* Testing hook */
887 		if (testfail_mcnum != -1 && testfail_csnum != -1 &&
888 		    mcp->mcp_num == testfail_mcnum && i == testfail_csnum) {
889 			csbe = spare = 0;
890 			testfail = 1;
891 			cmn_err(CE_NOTE, "Pretending MC %d CS %d failed test",
892 			    testfail_mcnum, testfail_csnum);
893 		}
894 
895 		/*
896 		 * If the chip-select is not enabled then skip it unless
897 		 * it is a designated online spare or is marked with TestFail.
898 		 */
899 		if (!csbe && !(spare || testfail))
900 			continue;
901 
902 		/*
903 		 * For an enabled or spare chip-select the Bank Address Mapping
904 		 * register will be valid as will the chip-select mask.  The
905 		 * base will not be valid but we'll read and store it anyway.
906 		 * We will not know whether the spare is already swapped in
907 		 * until MC function 3 attaches.
908 		 */
909 		if (csbe || spare) {
910 			if (mcamd_cs_size(&hdl, (mcamd_node_t *)mc, i, &sz) < 0)
911 				continue;
912 			csbase = MC_CSBASE(&base[i], rev);
913 			csmask = MC_CSMASK(&mask[i / maskdivisor], rev);
914 		} else {
915 			sz = 0;
916 			csbase = csmask = 0;
917 		}
918 
919 		mccs = mc_cs_create(mc, i, csbase, csmask, sz,
920 		    csbe, spare, testfail);
921 
922 		if (mc->mc_cslist == NULL)
923 			mc->mc_cslist = mccs;
924 		else
925 			mc->mc_cslast->mccs_next = mccs;
926 		mc->mc_cslast = mccs;
927 
928 		mccs->mccs_cfgregs.csr_csbase = MCREG_VAL32(&base[i]);
929 		mccs->mccs_cfgregs.csr_csmask =
930 		    MCREG_VAL32(&mask[i / maskdivisor]);
931 
932 		/*
933 		 * Check for cs bank interleaving - some bits clear in the
934 		 * lower mask.  All banks must/will have the same lomask bits
935 		 * if cs interleaving is active.
936 		 */
937 		if (csbe && !mcp->mcp_csintlvfctr) {
938 			int bitno, ibits = 0;
939 			for (bitno = MC_CSMASKLO_LOBIT(rev);
940 			    bitno <= MC_CSMASKLO_HIBIT(rev); bitno++) {
941 				if (!(csmask & (1 << bitno)))
942 					ibits++;
943 			}
944 			mcp->mcp_csintlvfctr = 1 << ibits;
945 		}
946 	}
947 
948 	/*
949 	 * If there is no chip-select interleave on this node determine
950 	 * whether the chip-select ranks are contiguous or if there
951 	 * is a hole.
952 	 */
953 	if (mcp->mcp_csintlvfctr == 1) {
954 		mc_cs_t *csp[MC_CHIP_NCS];
955 		mc_cs_t *mccs;
956 		int ncsbe = 0;
957 
958 		for (mccs = mc->mc_cslist; mccs != NULL;
959 		    mccs = mccs->mccs_next) {
960 			if (mccs->mccs_props.csp_csbe)
961 				csp[ncsbe++] = mccs;
962 		}
963 
964 		if (ncsbe != 0) {
965 			qsort((void *)csp, ncsbe, sizeof (mc_cs_t *),
966 			    (int (*)(const void *, const void *))csbasecmp);
967 
968 			for (i = 1; i < ncsbe; i++) {
969 				if (csp[i]->mccs_props.csp_base !=
970 				    csp[i - 1]->mccs_props.csp_base +
971 				    csp[i - 1]->mccs_props.csp_size)
972 					mc->mc_csdiscontig = 1;
973 			}
974 		}
975 	}
976 
977 
978 	/*
979 	 * Since we do not attach to MC function 3 go ahead and read some
980 	 * config parameters from it now.
981 	 */
982 	mc_getmiscctl(mc);
983 
984 	/*
985 	 * Now that we have discovered all enabled/spare/testfail chip-selects
986 	 * we divine the associated DIMM configuration.
987 	 */
988 	mc_dimmlist_create(mc);
989 }
990 
/*
 * One entry of the table that maps a device binding name to the memory
 * controller PCI function it represents.  mc_attach() searches the table
 * by binding name and, when bm_mkprops is non-NULL, calls it with a config
 * space handle set up for bm_func.
 */
typedef struct mc_bind_map {
	const char *bm_bindnm;	 /* attachment binding name */
	enum mc_funcnum bm_func; /* PCI config space function number for bind */
	const char *bm_model;	 /* value for device node model property */
	/* optional hook invoked at attach time for this function */
	void (*bm_mkprops)(mc_pcicfg_hdl_t, mc_t *);
} mc_bind_map_t;
997 
998 /*
999  * Do not attach to MC function 3 - agpgart already attaches to that.
1000  * Function 3 may be a good candidate for a nexus driver to fan it out
1001  * into virtual devices by functionality.  We will use pci_mech1_getl
1002  * to retrieve the function 3 parameters we require.
1003  */
1004 
static const mc_bind_map_t mc_bind_map[] = {
	{ MC_FUNC_HTCONFIG_BINDNM, MC_FUNC_HTCONFIG,
	    "AMD Memory Controller (HT Configuration)", mc_mkprops_htcfg },
	{ MC_FUNC_ADDRMAP_BINDNM, MC_FUNC_ADDRMAP,
	    "AMD Memory Controller (Address Map)", mc_mkprops_addrmap },
	{ MC_FUNC_DRAMCTL_BINDNM, MC_FUNC_DRAMCTL,
	    "AMD Memory Controller (DRAM Controller & HT Trace)",
	    mc_mkprops_dramctl },
	/*
	 * Terminator: an entry whose bm_bindnm is NULL ends the search
	 * performed by mc_attach().
	 */
	NULL
};
1015 
1016 /*ARGSUSED*/
1017 static int
1018 mc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1019 {
1020 	if (otyp != OTYP_CHR)
1021 		return (EINVAL);
1022 
1023 	rw_enter(&mc_lock, RW_READER);
1024 	if (mc_lookup_by_chipid(getminor(*devp)) == NULL) {
1025 		rw_exit(&mc_lock);
1026 		return (EINVAL);
1027 	}
1028 	rw_exit(&mc_lock);
1029 
1030 	return (0);
1031 }
1032 
/*
 * close(9E) entry point.  mc_open() keeps no per-open state, so there is
 * nothing to release here; always returns 0.
 */
/*ARGSUSED*/
static int
mc_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}
1039 
1040 /*
1041  * Enable swap from chip-select csnum to the spare chip-select on this
1042  * memory controller (if any).
1043  */
1044 
1045 int mc_swapdonetime = 30;	/* max number of seconds to wait for SwapDone */
1046 
1047 static int
1048 mc_onlinespare(mc_t *mc, int csnum)
1049 {
1050 	mc_props_t *mcp = &mc->mc_props;
1051 	union mcreg_sparectl sparectl;
1052 	union mcreg_scrubctl scrubctl;
1053 	mc_cs_t *mccs;
1054 	hrtime_t tmax;
1055 	int i = 0;
1056 
1057 	ASSERT(RW_WRITE_HELD(&mc_lock));
1058 
1059 	if (!MC_REV_MATCH(mcp->mcp_rev, MC_F_REVS_FG))
1060 		return (ENOTSUP);	/* MC rev does not offer online spare */
1061 	else if (mcp->mcp_sparecs == MC_INVALNUM)
1062 		return (ENODEV);	/* Supported, but no spare configured */
1063 	else if (mcp->mcp_badcs != MC_INVALNUM)
1064 		return (EBUSY);		/* Spare already swapped in */
1065 	else if (csnum == mcp->mcp_sparecs)
1066 		return (EINVAL);	/* Can't spare the spare! */
1067 
1068 	for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
1069 		if (mccs->mccs_props.csp_num == csnum)
1070 			break;
1071 	}
1072 	if (mccs == NULL)
1073 		return (EINVAL);	/* nominated bad CS does not exist */
1074 
1075 	/*
1076 	 * If the DRAM Scrubber is not enabled then the swap cannot succeed.
1077 	 */
1078 	MCREG_VAL32(&scrubctl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
1079 	    MC_CTL_REG_SCRUBCTL);
1080 	if (MCREG_FIELD_CMN(&scrubctl, DramScrub) == 0)
1081 		return (ENODEV);	/* DRAM scrubber not enabled */
1082 
1083 	/*
1084 	 * Read Online Spare Comtrol Register again, just in case our
1085 	 * state does not reflect reality.
1086 	 */
1087 	MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
1088 	    MC_CTL_REG_SPARECTL);
1089 
1090 	if (MCREG_FIELD_F_revFG(&sparectl, SwapDone))
1091 		return (EBUSY);
1092 
1093 	/* Write to the BadDramCs field */
1094 	MCREG_FIELD_F_revFG(&sparectl, BadDramCs) = csnum;
1095 	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
1096 	    MCREG_VAL32(&sparectl));
1097 
1098 	/* And request that the swap to the spare start */
1099 	MCREG_FIELD_F_revFG(&sparectl, SwapEn) = 1;
1100 	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
1101 	    MCREG_VAL32(&sparectl));
1102 
1103 	/*
1104 	 * Poll for SwapDone - we have disabled notification by interrupt.
1105 	 * Swap takes "several CPU cycles, depending on the DRAM speed, but
1106 	 * is performed in the background" (Family 0Fh Bios Porting Guide).
1107 	 * We're in a slow ioctl path so there is no harm in waiting around
1108 	 * a bit - consumers of the ioctl must be aware that it may take
1109 	 * a moment.  We will poll for up to mc_swapdonetime seconds,
1110 	 * limiting that to 120s.
1111 	 *
1112 	 * The swap is performed by the DRAM scrubber (which must be enabled)
1113 	 * whose scrub rate is accelerated for the duration of the swap.
1114 	 * The maximum swap rate is 40.0ns per 64 bytes, so the maximum
1115 	 * supported cs size of 16GB would take 10.7s at that max rate
1116 	 * of 25000000 scrubs/second.
1117 	 */
1118 	tmax = gethrtime() + MIN(mc_swapdonetime, 120) * 1000000000ULL;
1119 	do {
1120 		if (i++ < 20)
1121 			delay(drv_usectohz(100000));	/* 0.1s for up to 2s */
1122 		else
1123 			delay(drv_usectohz(500000));	/* 0.5s */
1124 
1125 		MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc,
1126 		    MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
1127 	} while (!MCREG_FIELD_F_revFG(&sparectl, SwapDone) &&
1128 	    gethrtime() < tmax);
1129 
1130 	if (!MCREG_FIELD_F_revFG(&sparectl, SwapDone))
1131 		return (ETIME);		/* Operation timed out */
1132 
1133 	mcp->mcp_badcs = csnum;
1134 	mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl);
1135 	mc->mc_spareswaptime = gethrtime();
1136 
1137 	return (0);
1138 }
1139 
1140 /*ARGSUSED*/
1141 static int
1142 mc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1143 {
1144 	int rc = 0;
1145 	mc_t *mc;
1146 
1147 	if (cmd != MC_IOC_SNAPSHOT_INFO && cmd != MC_IOC_SNAPSHOT &&
1148 	    cmd != MC_IOC_ONLINESPARE_EN)
1149 		return (EINVAL);
1150 
1151 	rw_enter(&mc_lock, RW_READER);
1152 
1153 	if ((mc = mc_lookup_by_chipid(getminor(dev))) == NULL) {
1154 		rw_exit(&mc_lock);
1155 		return (EINVAL);
1156 	}
1157 
1158 	switch (cmd) {
1159 	case MC_IOC_SNAPSHOT_INFO: {
1160 		mc_snapshot_info_t mcs;
1161 
1162 		if (mc_snapshot_update(mc) < 0) {
1163 			rw_exit(&mc_lock);
1164 			return (EIO);
1165 		}
1166 
1167 		mcs.mcs_size = mc->mc_snapshotsz;
1168 		mcs.mcs_gen = mc->mc_snapshotgen;
1169 
1170 		if (ddi_copyout(&mcs, (void *)arg, sizeof (mc_snapshot_info_t),
1171 		    mode) < 0)
1172 			rc = EFAULT;
1173 		break;
1174 	}
1175 
1176 	case MC_IOC_SNAPSHOT:
1177 		if (mc_snapshot_update(mc) < 0) {
1178 			rw_exit(&mc_lock);
1179 			return (EIO);
1180 		}
1181 
1182 		if (ddi_copyout(mc->mc_snapshot, (void *)arg, mc->mc_snapshotsz,
1183 		    mode) < 0)
1184 			rc = EFAULT;
1185 		break;
1186 
1187 	case MC_IOC_ONLINESPARE_EN:
1188 		if (drv_priv(credp) != 0) {
1189 			rw_exit(&mc_lock);
1190 			return (EPERM);
1191 		}
1192 
1193 		if (!rw_tryupgrade(&mc_lock)) {
1194 			rw_exit(&mc_lock);
1195 			return (EAGAIN);
1196 		}
1197 
1198 		if ((rc = mc_onlinespare(mc, (int)arg)) == 0) {
1199 			mc_snapshot_destroy(mc);
1200 			nvlist_free(mc->mc_nvl);
1201 			mc->mc_nvl = mc_nvl_create(mc);
1202 		}
1203 
1204 		break;
1205 	}
1206 
1207 	rw_exit(&mc_lock);
1208 
1209 	return (rc);
1210 }
1211 
/*
 * Character device entry points.  Only open, close and ioctl are
 * implemented by this driver; the remaining entry point slots are filled
 * with the nodev/nochpoll stubs.
 */
static struct cb_ops mc_cb_ops = {
	mc_open,
	mc_close,
	nodev,		/* not a block driver */
	nodev,		/* no print routine */
	nodev,		/* no dump routine */
	nodev,		/* no read routine */
	nodev,		/* no write routine */
	mc_ioctl,
	nodev,		/* no devmap routine */
	nodev,		/* no mmap routine */
	nodev,		/* no segmap routine */
	nochpoll,	/* no chpoll routine */
	ddi_prop_op,
	0,		/* not a STREAMS driver */
	D_NEW | D_MP,	/* safe for multi-thread/multi-processor */
};
1229 
1230 /*ARGSUSED*/
1231 static int
1232 mc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1233 {
1234 	int rc = DDI_SUCCESS;
1235 	mc_t *mc;
1236 
1237 	if (infocmd != DDI_INFO_DEVT2DEVINFO &&
1238 	    infocmd != DDI_INFO_DEVT2INSTANCE) {
1239 		*result = NULL;
1240 		return (DDI_FAILURE);
1241 	}
1242 
1243 	rw_enter(&mc_lock, RW_READER);
1244 
1245 	if ((mc = mc_lookup_by_chipid(getminor((dev_t)arg))) == NULL ||
1246 	    mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi == NULL) {
1247 		rc = DDI_FAILURE;
1248 	} else if (infocmd == DDI_INFO_DEVT2DEVINFO) {
1249 		*result = mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi;
1250 	} else {
1251 		*result = (void *)(uintptr_t)
1252 		    mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_instance;
1253 	}
1254 
1255 	rw_exit(&mc_lock);
1256 
1257 	return (rc);
1258 }
1259 
/*
 * FM error callback registered by mc_fm_init().  Posts PCI ereports for
 * the device and returns the status recorded in the error structure.
 */
/*ARGSUSED2*/
static int
mc_fm_handle(dev_info_t *dip, ddi_fm_error_t *fmerr, const void *arg)
{
	pci_ereport_post(dip, fmerr, NULL);
	return (fmerr->fme_status);
}
1267 
1268 static void
1269 mc_fm_init(dev_info_t *dip)
1270 {
1271 	int fmcap = DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE;
1272 	ddi_fm_init(dip, &fmcap, NULL);
1273 	pci_ereport_setup(dip);
1274 	ddi_fm_handler_register(dip, mc_fm_handle, NULL);
1275 }
1276 
1277 static void
1278 mc_read_smbios(mc_t *mc, dev_info_t *dip)
1279 {
1280 
1281 	uint16_t bdf;
1282 	pci_regspec_t *pci_rp = NULL;
1283 	uint32_t phys_hi;
1284 	int m = 0;
1285 	uint_t chip_inst;
1286 	int rc = 0;
1287 
1288 	if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
1289 	    (caddr_t)&pci_rp, &m) == DDI_SUCCESS) {
1290 		phys_hi = pci_rp->pci_phys_hi;
1291 		bdf = (uint16_t)(PCI_REG_BDFR_G(phys_hi) >>
1292 		    PCI_REG_FUNC_SHIFT);
1293 		kmem_free(pci_rp, m);
1294 		pci_rp = NULL;
1295 
1296 		rc = fm_smb_mc_chipinst(bdf, &chip_inst);
1297 		if (rc == 0) {
1298 			mc->smb_chipid = chip_inst;
1299 		} else {
1300 #ifdef DEBUG
1301 			cmn_err(CE_NOTE, "!mc read smbios chip info failed");
1302 #endif /* DEBUG */
1303 			return;
1304 		}
1305 		mc->smb_bboard = fm_smb_mc_bboards(bdf);
1306 #ifdef DEBUG
1307 		if (mc->smb_bboard == NULL)
1308 			cmn_err(CE_NOTE,
1309 			    "!mc read smbios base boards info failed");
1310 #endif /* DEBUG */
1311 	}
1312 
1313 	if (pci_rp != NULL)
1314 		kmem_free(pci_rp, m);
1315 }
1316 
1317 /*ARGSUSED*/
1318 static int
1319 mc_create_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
1320 {
1321 	chipid_t chipid = *((chipid_t *)arg1);
1322 	cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;
1323 
1324 	if (cmi_hdl_chipid(whdl) == chipid) {
1325 		cmi_hdl_hold(whdl);	/* short-term hold */
1326 		*hdlp = whdl;
1327 		return (CMI_HDL_WALK_DONE);
1328 	} else {
1329 		return (CMI_HDL_WALK_NEXT);
1330 	}
1331 }
1332 
1333 static mc_t *
1334 mc_create(chipid_t chipid, dev_info_t *dip)
1335 {
1336 	mc_t *mc;
1337 	cmi_hdl_t hdl = NULL;
1338 
1339 	ASSERT(RW_WRITE_HELD(&mc_lock));
1340 
1341 	/*
1342 	 * Find a handle for one of a chip's CPU.
1343 	 *
1344 	 * We can use one of the chip's CPUs since all cores
1345 	 * of a chip share the same revision and socket type.
1346 	 */
1347 	cmi_hdl_walk(mc_create_cb, (void *)&chipid, (void *)&hdl, NULL);
1348 	if (hdl == NULL)
1349 		return (NULL);	/* no cpu for this chipid found! */
1350 
1351 	mc = kmem_zalloc(sizeof (mc_t), KM_SLEEP);
1352 
1353 	mc->mc_hdr.mch_type = MC_NT_MC;
1354 	mc->mc_props.mcp_num = chipid;
1355 	mc->mc_props.mcp_sparecs = MC_INVALNUM;
1356 	mc->mc_props.mcp_badcs = MC_INVALNUM;
1357 
1358 	mc->mc_props.mcp_rev = cmi_hdl_chiprev(hdl);
1359 	mc->mc_revname = cmi_hdl_chiprevstr(hdl);
1360 	mc->mc_socket = cmi_hdl_getsockettype(hdl);
1361 
1362 	mc_read_smbios(mc, dip);
1363 
1364 	if (mc_list == NULL)
1365 		mc_list = mc;
1366 	if (mc_last != NULL)
1367 		mc_last->mc_next = mc;
1368 
1369 	mc->mc_next = NULL;
1370 	mc_last = mc;
1371 
1372 	cmi_hdl_rele(hdl);
1373 
1374 	return (mc);
1375 }
1376 
/*
 * Choose the faster of two scrub rates.  The first candidate is r1; the
 * second, r2, is the field selected from the register value 'cfg' by
 * 'mask' and 'shift'.  A rate of zero means scrubbing is off, so when one
 * candidate is zero the other is returned (zero if both are).  When both
 * are non-zero the numerically smaller value is the faster rate and wins.
 */
static uint32_t
mc_scrubber_max(uint32_t r1, uint32_t cfg, uint32_t mask, uint32_t shift)
{
	uint32_t r2 = (cfg & mask) >> shift;

	if (r1 == 0)
		return (r2);
	if (r2 == 0)
		return (r1);

	return (r1 < r2 ? r1 : r2);
}
1393 
1394 
1395 /*
1396  * Enable the memory scrubber.  We must use the mc_pcicfg_{get32,put32}_nohdl
1397  * interfaces since we do not bind to function 3.
1398  */
1399 cmi_errno_t
1400 mc_scrubber_enable(mc_t *mc)
1401 {
1402 	mc_props_t *mcp = &mc->mc_props;
1403 	chipid_t chipid = (chipid_t)mcp->mcp_num;
1404 	uint32_t rev = (uint32_t)mcp->mcp_rev;
1405 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
1406 	union mcreg_scrubctl scrubctl;
1407 	union mcreg_dramscrublo dalo;
1408 	union mcreg_dramscrubhi dahi;
1409 
1410 	mcr->mcr_scrubctl = MCREG_VAL32(&scrubctl) =
1411 	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL);
1412 
1413 	mcr->mcr_scrubaddrlo = MCREG_VAL32(&dalo) =
1414 	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO);
1415 
1416 	mcr->mcr_scrubaddrhi = MCREG_VAL32(&dahi) =
1417 	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI);
1418 
1419 	if (mc_scrub_policy == MC_SCRUB_BIOSDEFAULT)
1420 		return (MCREG_FIELD_CMN(&scrubctl, DramScrub) !=
1421 		    AMD_NB_SCRUBCTL_RATE_NONE ?
1422 		    CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);
1423 
1424 	/*
1425 	 * Disable DRAM scrubbing while we fiddle.
1426 	 */
1427 	MCREG_FIELD_CMN(&scrubctl, DramScrub) = AMD_NB_SCRUBCTL_RATE_NONE;
1428 	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
1429 	    MCREG_VAL32(&scrubctl));
1430 
1431 	/*
1432 	 * Setup DRAM Scrub Address Low and High registers for the
1433 	 * base address of this node, and to select srubber redirect.
1434 	 */
1435 	MCREG_FIELD_CMN(&dalo, ScrubReDirEn) = 1;
1436 	MCREG_FIELD_CMN(&dalo, ScrubAddrLo) =
1437 	    AMD_NB_SCRUBADDR_MKLO(mcp->mcp_base);
1438 
1439 	MCREG_FIELD_CMN(&dahi, ScrubAddrHi) =
1440 	    AMD_NB_SCRUBADDR_MKHI(mcp->mcp_base);
1441 
1442 	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO,
1443 	    MCREG_VAL32(&dalo));
1444 	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI,
1445 	    MCREG_VAL32(&dahi));
1446 
1447 	if (mc_scrub_rate_dram > AMD_NB_SCRUBCTL_RATE_MAX) {
1448 		cmn_err(CE_WARN, "mc_scrub_rate_dram is too large; "
1449 		    "resetting to 0x%x\n", AMD_NB_SCRUBCTL_RATE_MAX);
1450 		mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_MAX;
1451 	}
1452 
1453 	switch (mc_scrub_policy) {
1454 	case MC_SCRUB_FIXED:
1455 		/* Use the system value checked above */
1456 		break;
1457 
1458 	default:
1459 		cmn_err(CE_WARN, "Unknown mc_scrub_policy value %d - "
1460 		    "using default policy of MC_SCRUB_MAX", mc_scrub_policy);
1461 		/*FALLTHRU*/
1462 
1463 	case MC_SCRUB_MAX:
1464 		mc_scrub_rate_dram = mc_scrubber_max(mc_scrub_rate_dram,
1465 		    mcr->mcr_scrubctl, AMD_NB_SCRUBCTL_DRAM_MASK,
1466 		    AMD_NB_SCRUBCTL_DRAM_SHIFT);
1467 		break;
1468 	}
1469 
1470 	/*
1471 	 * OPTERON_ERRATUM_99:
1472 	 * This erratum applies on revisions D and earlier.
1473 	 * This erratum also applies on revisions E and later,
1474 	 * if BIOS uses chip-select hoisting instead of DRAM hole
1475 	 * mapping.
1476 	 *
1477 	 * Do not enable the dram scrubber if the chip-select ranges
1478 	 * for the node are not contiguous.
1479 	 */
1480 	if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
1481 	    mc->mc_csdiscontig) {
1482 		cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
1483 		    "%s chip %d because DRAM hole is present on this node",
1484 		    mc->mc_revname, chipid);
1485 		mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
1486 	}
1487 
1488 	/*
1489 	 * OPTERON_ERRATUM_101:
1490 	 * This erratum applies on revisions D and earlier.
1491 	 *
1492 	 * If the DRAM Base Address register's IntlvEn field indicates that
1493 	 * node interleaving is enabled, we must disable the DRAM scrubber
1494 	 * and return zero to indicate that Solaris should use s/w instead.
1495 	 */
1496 	if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
1497 	    mcp->mcp_ilen != 0 &&
1498 	    !X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_E)) {
1499 		cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
1500 		    "%s chip %d because DRAM memory is node-interleaved",
1501 		    mc->mc_revname, chipid);
1502 		mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
1503 	}
1504 
1505 	if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE) {
1506 		MCREG_FIELD_CMN(&scrubctl, DramScrub) = mc_scrub_rate_dram;
1507 		mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
1508 		    MCREG_VAL32(&scrubctl));
1509 	}
1510 
1511 	return (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE ?
1512 	    CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);
1513 }
1514 
1515 /*ARGSUSED*/
1516 static int
1517 mc_attach_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
1518 {
1519 	mc_t *mc = (mc_t *)arg1;
1520 	mcamd_prop_t chipid = *((mcamd_prop_t *)arg2);
1521 
1522 	if (cmi_hdl_chipid(whdl) == chipid) {
1523 		mcamd_mc_register(whdl, mc);
1524 	}
1525 
1526 	return (CMI_HDL_WALK_NEXT);
1527 }
1528 
1529 static int mc_sw_scrub_disabled = 0;
1530 
1531 static int
1532 mc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1533 {
1534 	mc_pcicfg_hdl_t cfghdl;
1535 	const mc_bind_map_t *bm;
1536 	const char *bindnm;
1537 	char *unitstr = NULL;
1538 	enum mc_funcnum func;
1539 	long unitaddr;
1540 	int chipid, rc;
1541 	mc_t *mc;
1542 
1543 	/*
1544 	 * This driver has no hardware state, but does
1545 	 * claim to have a reg property, so it will be
1546 	 * called on suspend.  It is probably better to
1547 	 * make sure it doesn't get called on suspend,
1548 	 * but it is just as easy to make sure we just
1549 	 * return DDI_SUCCESS if called.
1550 	 */
1551 	if (cmd == DDI_RESUME)
1552 		return (DDI_SUCCESS);
1553 
1554 	if (cmd != DDI_ATTACH || mc_no_attach != 0)
1555 		return (DDI_FAILURE);
1556 
1557 	bindnm = ddi_binding_name(dip);
1558 	for (bm = mc_bind_map; bm->bm_bindnm != NULL; bm++) {
1559 		if (strcmp(bindnm, bm->bm_bindnm) == 0) {
1560 			func = bm->bm_func;
1561 			break;
1562 		}
1563 	}
1564 
1565 	if (bm->bm_bindnm == NULL)
1566 		return (DDI_FAILURE);
1567 
1568 	/*
1569 	 * We need the device number, which corresponds to the processor node
1570 	 * number plus 24.  The node number can then be used to associate this
1571 	 * memory controller device with a given processor chip.
1572 	 */
1573 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
1574 	    DDI_PROP_DONTPASS, "unit-address", &unitstr) != DDI_PROP_SUCCESS) {
1575 		cmn_err(CE_WARN, "failed to find unit-address for %s", bindnm);
1576 		return (DDI_FAILURE);
1577 	}
1578 
1579 	rc = ddi_strtol(unitstr, NULL, 16, &unitaddr);
1580 	ASSERT(rc == 0 && unitaddr >= MC_AMD_DEV_OFFSET);
1581 
1582 	if (rc != 0 || unitaddr < MC_AMD_DEV_OFFSET) {
1583 		cmn_err(CE_WARN, "failed to parse unit address %s for %s\n",
1584 		    unitstr, bindnm);
1585 		ddi_prop_free(unitstr);
1586 		return (DDI_FAILURE);
1587 	}
1588 	ddi_prop_free(unitstr);
1589 
1590 	chipid = unitaddr - MC_AMD_DEV_OFFSET;
1591 
1592 	rw_enter(&mc_lock, RW_WRITER);
1593 
1594 	for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
1595 		if (mc->mc_props.mcp_num == chipid)
1596 			break;
1597 	}
1598 
1599 	/* Integrate this memory controller device into existing set */
1600 	if (mc == NULL) {
1601 		mc = mc_create(chipid, dip);
1602 
1603 		if (mc == NULL) {
1604 			/*
1605 			 * We don't complain here because this is a legitimate
1606 			 * path for MP systems.  On those machines, we'll attach
1607 			 * before all CPUs have been initialized, and thus the
1608 			 * chip verification in mc_create will fail.  We'll be
1609 			 * reattached later for those CPUs.
1610 			 */
1611 			rw_exit(&mc_lock);
1612 			return (DDI_FAILURE);
1613 		}
1614 	} else {
1615 		mc_snapshot_destroy(mc);
1616 	}
1617 
1618 	/* Beyond this point, we're committed to creating this node */
1619 
1620 	mc_fm_init(dip);
1621 
1622 	ASSERT(mc->mc_funcs[func].mcf_devi == NULL);
1623 	mc->mc_funcs[func].mcf_devi = dip;
1624 	mc->mc_funcs[func].mcf_instance = ddi_get_instance(dip);
1625 
1626 	mc->mc_ref++;
1627 
1628 	/*
1629 	 * Add the common properties to this node, and then add any properties
1630 	 * that are specific to this node based upon its configuration space.
1631 	 */
1632 	(void) ddi_prop_update_string(DDI_DEV_T_NONE,
1633 	    dip, "model", (char *)bm->bm_model);
1634 
1635 	(void) ddi_prop_update_int(DDI_DEV_T_NONE,
1636 	    dip, "chip-id", mc->mc_props.mcp_num);
1637 
1638 	if (bm->bm_mkprops != NULL &&
1639 	    mc_pcicfg_setup(mc, bm->bm_func, &cfghdl) == DDI_SUCCESS) {
1640 		bm->bm_mkprops(cfghdl, mc);
1641 		mc_pcicfg_teardown(cfghdl);
1642 	}
1643 
1644 	/*
1645 	 * If this is the last node to be attached for this memory controller,
1646 	 * then create the minor node, enable scrubbers, and register with
1647 	 * cpu module(s) for this chip.
1648 	 */
1649 	if (func == MC_FUNC_DEVIMAP) {
1650 		mc_props_t *mcp = &mc->mc_props;
1651 		int dram_present = 0;
1652 
1653 		if (ddi_create_minor_node(dip, "mc-amd", S_IFCHR,
1654 		    mcp->mcp_num, "ddi_mem_ctrl",
1655 		    0) != DDI_SUCCESS) {
1656 			cmn_err(CE_WARN, "failed to create minor node for chip "
1657 			    "%d memory controller\n",
1658 			    (chipid_t)mcp->mcp_num);
1659 		}
1660 
1661 		/*
1662 		 * Register the memory controller for every CPU of this chip.
1663 		 *
1664 		 * If there is memory present on this node and ECC is enabled
1665 		 * attempt to enable h/w memory scrubbers for this node.
1666 		 * If we are successful in enabling *any* hardware scrubbers,
1667 		 * disable the software memory scrubber.
1668 		 */
1669 		cmi_hdl_walk(mc_attach_cb, (void *)mc, (void *)&mcp->mcp_num,
1670 		    NULL);
1671 
1672 		if (mcp->mcp_lim != mcp->mcp_base) {
1673 			/*
1674 			 * This node may map non-dram memory alone, so we
1675 			 * must check for an enabled chip-select to be
1676 			 * sure there is dram present.
1677 			 */
1678 			mc_cs_t *mccs;
1679 
1680 			for (mccs = mc->mc_cslist; mccs != NULL;
1681 			    mccs = mccs->mccs_next) {
1682 				if (mccs->mccs_props.csp_csbe) {
1683 					dram_present = 1;
1684 					break;
1685 				}
1686 			}
1687 		}
1688 
1689 		if (dram_present && !mc_ecc_enabled(mc)) {
1690 			/*
1691 			 * On a single chip system there is no point in
1692 			 * scrubbing if there is no ECC on the single node.
1693 			 * On a multichip system, necessarily Opteron using
1694 			 * registered ECC-capable DIMMs, if there is memory
1695 			 * present on a node but no ECC there then we'll assume
1696 			 * ECC is disabled for all nodes and we will not enable
1697 			 * the scrubber and wll also disable the software
1698 			 * memscrub thread.
1699 			 */
1700 			rc = 1;
1701 		} else if (!dram_present) {
1702 			/* No memory on this node - others decide memscrub */
1703 			rc = 0;
1704 		} else {
1705 			/*
1706 			 * There is memory on this node and ECC is enabled.
1707 			 * Call via the cpu module to enable memory scrubbing
1708 			 * on this node - we could call directly but then
1709 			 * we may overlap with a request to enable chip-cache
1710 			 * scrubbing.
1711 			 */
1712 			rc = mc_scrubber_enable(mc);
1713 		}
1714 
1715 		if (rc == CMI_SUCCESS && !mc_sw_scrub_disabled++)
1716 			cmi_mc_sw_memscrub_disable();
1717 
1718 		mc_report_testfails(mc);
1719 	}
1720 
1721 	/*
1722 	 * Update nvlist for as far as we have gotten in attach/init.
1723 	 */
1724 	nvlist_free(mc->mc_nvl);
1725 	mc->mc_nvl = mc_nvl_create(mc);
1726 
1727 	rw_exit(&mc_lock);
1728 	return (DDI_SUCCESS);
1729 }
1730 
1731 /*ARGSUSED*/
1732 static int
1733 mc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1734 {
1735 	/*
1736 	 * See the comment about suspend in
1737 	 * mc_attach().
1738 	 */
1739 	if (cmd == DDI_SUSPEND)
1740 		return (DDI_SUCCESS);
1741 	else
1742 		return (DDI_FAILURE);
1743 }
1744 
1745 
/*
 * Device operations for this driver; referenced from modldrv below.
 * There are no bus or power operations.
 */
static struct dev_ops mc_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	mc_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	mc_attach,		/* devo_attach */
	mc_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&mc_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_needed,		/* devo_quiesce */
};
1760 
/*
 * Driver module linkage: ties the driver module type and a descriptive
 * name to the device operations in mc_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	"Memory Controller for AMD processors",
	&mc_ops
};
1766 
/*
 * Module linkage passed to mod_install(), mod_info() and mod_remove();
 * the single linkage element is modldrv and the list is NULL terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
1772 
1773 int
1774 _init(void)
1775 {
1776 	/*
1777 	 * Refuse to load if there is no PCI config space support.
1778 	 */
1779 	if (pci_getl_func == NULL)
1780 		return (ENOTSUP);
1781 
1782 	rw_init(&mc_lock, NULL, RW_DRIVER, NULL);
1783 	return (mod_install(&modlinkage));
1784 }
1785 
/*
 * Module information entry point; simply reports on modlinkage via
 * mod_info().
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
1791 
1792 int
1793 _fini(void)
1794 {
1795 	int rc;
1796 
1797 	if ((rc = mod_remove(&modlinkage)) != 0)
1798 		return (rc);
1799 
1800 	rw_destroy(&mc_lock);
1801 	return (0);
1802 }
1803